Diffstat (limited to 'arch/x86')
217 files changed, 4843 insertions, 5070 deletions
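One functional highlight of the hunks that follow is ZSTD support for the compressed kernel image: HAVE_KERNEL_ZSTD in Kconfig, the vmlinux.bin.zst / zstd22 rule and zst suffix in boot/compressed/Makefile, the decompress_unzstd.c include in misc.c, and a larger extract-offset margin in header.S. As a minimal illustrative sketch only — not part of the patch — a config fragment opting into the new compressor, assuming the CONFIG_KERNEL_ZSTD option these hunks reference, would follow the same style as the defconfig entries elsewhere in this diff:

CONFIG_KERNEL_ZSTD=y
# CONFIG_KERNEL_GZIP is not set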
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 883da0abf779..9a2849527dd7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -115,6 +115,7 @@ config X86 select GENERIC_CPU_AUTOPROBE select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP + select GENERIC_ENTRY select GENERIC_FIND_FIRST_BIT select GENERIC_IOMAP select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP @@ -161,7 +162,6 @@ config X86 select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL select HAVE_CONTEXT_TRACKING if X86_64 - select HAVE_COPY_THREAD_TLS select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS @@ -188,6 +188,7 @@ config X86 select HAVE_KERNEL_LZMA select HAVE_KERNEL_LZO select HAVE_KERNEL_XZ + select HAVE_KERNEL_ZSTD select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE select HAVE_FUNCTION_ERROR_INJECTION @@ -802,6 +803,7 @@ config KVM_GUEST depends on PARAVIRT select PARAVIRT_CLOCK select ARCH_CPUIDLE_HALTPOLL + select X86_HV_CALLBACK_VECTOR default y help This option enables various optimizations for running under the KVM @@ -909,6 +911,7 @@ config DMI config GART_IOMMU bool "Old AMD GART IOMMU support" + select DMA_OPS select IOMMU_HELPER select SWIOTLB depends on X86_64 && PCI && AMD_NB @@ -1292,7 +1295,6 @@ config MICROCODE bool "CPU microcode loading support" default y depends on CPU_SUP_AMD || CPU_SUP_INTEL - select FW_LOADER help If you say Y here, you will be able to update the microcode on Intel and AMD processors. The Intel support is for the IA32 family, @@ -1314,7 +1316,6 @@ config MICROCODE_INTEL bool "Intel microcode loading support" depends on MICROCODE default MICROCODE - select FW_LOADER help This options enables microcode patch loading support for Intel processors. @@ -1326,7 +1327,6 @@ config MICROCODE_INTEL config MICROCODE_AMD bool "AMD microcode loading support" depends on MICROCODE - select FW_LOADER help If you select this option, microcode patch loading support for AMD processors will be enabled. diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 0dd319e6e5b4..ee1d3c5834c6 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -3,6 +3,9 @@ config TRACE_IRQFLAGS_SUPPORT def_bool y +config TRACE_IRQFLAGS_NMI_SUPPORT + def_bool y + config EARLY_PRINTK_USB bool diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 00e378de8bc0..4346ffb2e39f 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -36,8 +36,8 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -DDISABLE_BRANCH_PROFILING \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ -mno-mmx -mno-sse -REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding) -REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector) +REALMODE_CFLAGS += -ffreestanding +REALMODE_CFLAGS += -fno-stack-protector REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -Wno-address-of-packed-member) REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4)) export REALMODE_CFLAGS @@ -47,10 +47,6 @@ export REALMODE_CFLAGS # e.g.: obj-y += foo_$(BITS).o export BITS -ifdef CONFIG_X86_NEED_RELOCS - LDFLAGS_vmlinux := --emit-relocs --discard-none -endif - # # Prevent GCC from generating any FP code by mistake. # @@ -177,17 +173,6 @@ ifeq ($(ACCUMULATE_OUTGOING_ARGS), 1) KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args,) endif -KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE) - -# -# The 64-bit kernel must be aligned to 2MB. 
Pass -z max-page-size=0x200000 to -# the linker to force 2MB page size regardless of the default page size used -# by the linker. -# -ifdef CONFIG_X86_64 -KBUILD_LDFLAGS += $(call ld-option, -z max-page-size=0x200000) -endif - # Workaround for a gcc prelease that unfortunately was shipped in a suse release KBUILD_CFLAGS += -Wno-sign-compare # @@ -207,6 +192,23 @@ ifdef CONFIG_RETPOLINE endif endif +KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE) + +ifdef CONFIG_X86_NEED_RELOCS +LDFLAGS_vmlinux := --emit-relocs --discard-none +else +LDFLAGS_vmlinux := +endif + +# +# The 64-bit kernel must be aligned to 2MB. Pass -z max-page-size=0x200000 to +# the linker to force 2MB page size regardless of the default page size used +# by the linker. +# +ifdef CONFIG_X86_64 +LDFLAGS_vmlinux += -z max-page-size=0x200000 +endif + archscripts: scripts_basic $(Q)$(MAKE) $(build)=arch/x86/tools relocs diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 5a828fde7a42..3962f592633d 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -26,7 +26,7 @@ OBJECT_FILES_NON_STANDARD := y KCOV_INSTRUMENT := n targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \ - vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 + vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 vmlinux.bin.zst KBUILD_CFLAGS := -m$(BITS) -O2 KBUILD_CFLAGS += -fno-strict-aliasing $(call cc-option, -fPIE, -fPIC) @@ -35,13 +35,14 @@ cflags-$(CONFIG_X86_32) := -march=i386 cflags-$(CONFIG_X86_64) := -mcmodel=small KBUILD_CFLAGS += $(cflags-y) KBUILD_CFLAGS += -mno-mmx -mno-sse -KBUILD_CFLAGS += $(call cc-option,-ffreestanding) -KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) +KBUILD_CFLAGS += -ffreestanding +KBUILD_CFLAGS += -fno-stack-protector KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS += $(call cc-disable-warning, gnu) KBUILD_CFLAGS += -Wno-pointer-sign KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=) KBUILD_CFLAGS += -fno-asynchronous-unwind-tables +KBUILD_CFLAGS += -D__DISABLE_EXPORTS KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n @@ -145,6 +146,8 @@ $(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE $(call if_changed,lzo) $(obj)/vmlinux.bin.lz4: $(vmlinux.bin.all-y) FORCE $(call if_changed,lz4) +$(obj)/vmlinux.bin.zst: $(vmlinux.bin.all-y) FORCE + $(call if_changed,zstd22) suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 @@ -152,6 +155,7 @@ suffix-$(CONFIG_KERNEL_LZMA) := lzma suffix-$(CONFIG_KERNEL_XZ) := xz suffix-$(CONFIG_KERNEL_LZO) := lzo suffix-$(CONFIG_KERNEL_LZ4) := lz4 +suffix-$(CONFIG_KERNEL_ZSTD) := zst quiet_cmd_mkpiggy = MKPIGGY $@ cmd_mkpiggy = $(obj)/mkpiggy $< > $@ diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index d7408af55738..0048269180d5 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -19,13 +19,6 @@ */ #define BOOT_CTYPE_H -/* - * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h. - * While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL - * which is meaningless and will cause compiling error in some cases. 
- */ -#define __DISABLE_EXPORTS - #include "misc.h" #include "error.h" #include "../string.h" diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 9652d5c2afda..39e592d0e0b4 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -77,6 +77,10 @@ static int lines, cols; #ifdef CONFIG_KERNEL_LZ4 #include "../../../../lib/decompress_unlz4.c" #endif + +#ifdef CONFIG_KERNEL_ZSTD +#include "../../../../lib/decompress_unzstd.c" +#endif /* * NOTE: When adding a new decompressor, please update the analysis in * ../header.S. diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 735ad7f21ab0..6dbd7e9f74c9 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -539,8 +539,14 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr # the size-dependent part now grows so fast. # # extra_bytes = (uncompressed_size >> 8) + 65536 +# +# ZSTD compressed data grows by at most 3 bytes per 128K, and only has a 22 +# byte fixed overhead but has a maximum block size of 128K, so it needs a +# larger margin. +# +# extra_bytes = (uncompressed_size >> 8) + 131072 -#define ZO_z_extra_bytes ((ZO_z_output_len >> 8) + 65536) +#define ZO_z_extra_bytes ((ZO_z_output_len >> 8) + 131072) #if ZO_z_output_len > ZO_z_input_len # define ZO_z_extract_offset (ZO_z_output_len + ZO_z_extra_bytes - \ ZO_z_input_len) diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 550904591e94..d7577fece9eb 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1,39 +1,29 @@ -# CONFIG_64BIT is not set # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y +CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_PREEMPT_VOLUNTARY=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_TASKSTATS=y CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y -CONFIG_FHANDLE=y -CONFIG_AUDIT=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_SCHED=y CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y -CONFIG_KPROBES=y -CONFIG_JUMP_LABEL=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_SMP=y CONFIG_X86_GENERIC=y CONFIG_HPET_TIMER=y -CONFIG_SCHED_SMT=y -CONFIG_PREEMPT_VOLUNTARY=y CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y -CONFIG_X86_MCE=y CONFIG_X86_REBOOTFIXUPS=y -CONFIG_MICROCODE=y CONFIG_MICROCODE_AMD=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y @@ -41,28 +31,25 @@ CONFIG_HIGHPTE=y CONFIG_X86_CHECK_BIOS_CORRUPTION=y # CONFIG_MTRR_SANITIZER is not set CONFIG_EFI=y +CONFIG_EFI_STUB=y CONFIG_HZ_1000=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y -CONFIG_RANDOMIZE_BASE=y -CONFIG_RANDOMIZE_MEMORY=y -# CONFIG_COMPAT_VDSO is not set CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y CONFIG_PM_TRACE_RTC=y CONFIG_ACPI_DOCK=y -CONFIG_CPU_FREQ=y -# CONFIG_CPU_FREQ_STAT is not set +CONFIG_ACPI_BGRT=y CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y -CONFIG_CPU_FREQ_GOV_PERFORMANCE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y CONFIG_X86_ACPI_CPUFREQ=y -CONFIG_PCI=y -CONFIG_PCIEPORTBUS=y -CONFIG_PCI_MSI=y -CONFIG_PCCARD=y -CONFIG_YENTA=y -CONFIG_HOTPLUG_PCI=y +CONFIG_EFI_VARS=y +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +# CONFIG_UNUSED_SYMBOLS is not set CONFIG_BINFMT_MISC=y CONFIG_NET=y CONFIG_PACKET=y @@ -82,16 +69,12 @@ CONFIG_IP_MROUTE=y CONFIG_IP_PIMSM_V1=y 
CONFIG_IP_PIMSM_V2=y CONFIG_SYN_COOKIES=y -# CONFIG_INET_XFRM_MODE_TRANSPORT is not set -# CONFIG_INET_XFRM_MODE_TUNNEL is not set -# CONFIG_INET_XFRM_MODE_BEET is not set # CONFIG_INET_DIAG is not set CONFIG_TCP_CONG_ADVANCED=y # CONFIG_TCP_CONG_BIC is not set # CONFIG_TCP_CONG_WESTWOOD is not set # CONFIG_TCP_CONG_HTCP is not set CONFIG_TCP_MD5SIG=y -CONFIG_IPV6=y CONFIG_INET6_AH=y CONFIG_INET6_ESP=y CONFIG_NETLABEL=y @@ -102,6 +85,7 @@ CONFIG_NF_CONNTRACK_FTP=y CONFIG_NF_CONNTRACK_IRC=y CONFIG_NF_CONNTRACK_SIP=y CONFIG_NF_CT_NETLINK=y +CONFIG_NF_NAT=y CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y CONFIG_NETFILTER_XT_TARGET_NFLOG=y CONFIG_NETFILTER_XT_TARGET_SECMARK=y @@ -109,14 +93,11 @@ CONFIG_NETFILTER_XT_TARGET_TCPMSS=y CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y CONFIG_NETFILTER_XT_MATCH_POLICY=y CONFIG_NETFILTER_XT_MATCH_STATE=y -CONFIG_NF_CONNTRACK_IPV4=y CONFIG_IP_NF_IPTABLES=y CONFIG_IP_NF_FILTER=y CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_NF_NAT=y -CONFIG_IP_NF_TARGET_MASQUERADE=y +CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_MANGLE=y -CONFIG_NF_CONNTRACK_IPV6=y CONFIG_IP6_NF_IPTABLES=y CONFIG_IP6_NF_MATCH_IPV6HEADER=y CONFIG_IP6_NF_FILTER=y @@ -129,6 +110,12 @@ CONFIG_CFG80211=y CONFIG_MAC80211=y CONFIG_MAC80211_LEDS=y CONFIG_RFKILL=y +CONFIG_PCI=y +CONFIG_PCIEPORTBUS=y +CONFIG_PCI_MSI=y +CONFIG_HOTPLUG_PCI=y +CONFIG_PCCARD=y +CONFIG_YENTA=y CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_DEBUG_DEVRES=y @@ -170,15 +157,12 @@ CONFIG_8139TOO=y # CONFIG_8139TOO_PIO is not set CONFIG_R8169=y CONFIG_INPUT_POLLDEV=y -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_EVDEV=y CONFIG_INPUT_JOYSTICK=y CONFIG_INPUT_TABLET=y CONFIG_INPUT_TOUCHSCREEN=y CONFIG_INPUT_MISC=y -CONFIG_VT_HW_CONSOLE_BINDING=y # CONFIG_LEGACY_PTYS is not set -CONFIG_SERIAL_NONSTANDARD=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_NR_UARTS=32 @@ -187,6 +171,7 @@ CONFIG_SERIAL_8250_MANY_PORTS=y CONFIG_SERIAL_8250_SHARE_IRQ=y CONFIG_SERIAL_8250_DETECT_IRQ=y CONFIG_SERIAL_8250_RSA=y +CONFIG_SERIAL_NONSTANDARD=y CONFIG_HW_RANDOM=y CONFIG_NVRAM=y CONFIG_HPET=y @@ -201,19 +186,15 @@ CONFIG_DRM_I915=y CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_EFI=y -# CONFIG_LCD_CLASS_DEVICE is not set CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set # CONFIG_LOGO_LINUX_VGA16 is not set CONFIG_SOUND=y CONFIG_SND=y +CONFIG_SND_HRTIMER=y CONFIG_SND_SEQUENCER=y CONFIG_SND_SEQ_DUMMY=y -CONFIG_SND_MIXER_OSS=y -CONFIG_SND_PCM_OSS=y -CONFIG_SND_SEQUENCER_OSS=y -CONFIG_SND_HRTIMER=y CONFIG_SND_HDA_INTEL=y CONFIG_SND_HDA_HWDEP=y CONFIG_HIDRAW=y @@ -234,17 +215,14 @@ CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y CONFIG_USB_XHCI_HCD=y CONFIG_USB_EHCI_HCD=y -CONFIG_USB_EHCI_TT_NEWSCHED=y CONFIG_USB_OHCI_HCD=y CONFIG_USB_UHCI_HCD=y CONFIG_USB_PRINTER=y CONFIG_USB_STORAGE=y -CONFIG_EDAC=y CONFIG_RTC_CLASS=y # CONFIG_RTC_HCTOSYS is not set CONFIG_DMADEVICES=y CONFIG_EEEPC_LAPTOP=y -CONFIG_EFI_VARS=y CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y @@ -270,27 +248,19 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ASCII=y CONFIG_NLS_ISO8859_1=y CONFIG_NLS_UTF8=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_PRINTK_TIME=y -CONFIG_FRAME_WARN=1024 CONFIG_MAGIC_SYSRQ=y -# CONFIG_UNUSED_SYMBOLS is not set CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_STACK_USAGE=y +CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y -CONFIG_TIMER_STATS=y 
-CONFIG_DEBUG_STACK_USAGE=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_PROVIDE_OHCI1394_DMA_INIT=y CONFIG_EARLY_PRINTK_DBGP=y -CONFIG_DEBUG_STACKOVERFLOW=y -# CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_BOOT_PARAMS=y -CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y -CONFIG_SECURITY_SELINUX=y -CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_DISABLE=y -CONFIG_CRYPTO_AES_586=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set -CONFIG_EFI_STUB=y -CONFIG_ACPI_BGRT=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 614961009075..f85600143747 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1,36 +1,26 @@ # CONFIG_LOCALVERSION_AUTO is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y +CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_PREEMPT_VOLUNTARY=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_TASKSTATS=y CONFIG_TASK_DELAY_ACCT=y CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y -CONFIG_FHANDLE=y -CONFIG_AUDIT=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_SCHED=y CONFIG_BLK_DEV_INITRD=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y -CONFIG_KPROBES=y -CONFIG_JUMP_LABEL=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_SMP=y -CONFIG_NR_CPUS=64 -CONFIG_SCHED_SMT=y -CONFIG_PREEMPT_VOLUNTARY=y CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y -CONFIG_X86_MCE=y -CONFIG_MICROCODE=y CONFIG_MICROCODE_AMD=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y @@ -38,30 +28,28 @@ CONFIG_NUMA=y CONFIG_X86_CHECK_BIOS_CORRUPTION=y # CONFIG_MTRR_SANITIZER is not set CONFIG_EFI=y +CONFIG_EFI_STUB=y +CONFIG_EFI_MIXED=y CONFIG_HZ_1000=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y -CONFIG_RANDOMIZE_BASE=y -CONFIG_RANDOMIZE_MEMORY=y -# CONFIG_COMPAT_VDSO is not set CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y CONFIG_PM_TRACE_RTC=y CONFIG_ACPI_DOCK=y -CONFIG_CPU_FREQ=y -# CONFIG_CPU_FREQ_STAT is not set +CONFIG_ACPI_BGRT=y CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y -CONFIG_CPU_FREQ_GOV_PERFORMANCE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y CONFIG_X86_ACPI_CPUFREQ=y -CONFIG_PCI=y -CONFIG_PCI_MMCONFIG=y -CONFIG_PCIEPORTBUS=y -CONFIG_PCCARD=y -CONFIG_YENTA=y -CONFIG_HOTPLUG_PCI=y -CONFIG_BINFMT_MISC=y CONFIG_IA32_EMULATION=y +CONFIG_EFI_VARS=y +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +# CONFIG_UNUSED_SYMBOLS is not set +CONFIG_BINFMT_MISC=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -80,16 +68,12 @@ CONFIG_IP_MROUTE=y CONFIG_IP_PIMSM_V1=y CONFIG_IP_PIMSM_V2=y CONFIG_SYN_COOKIES=y -# CONFIG_INET_XFRM_MODE_TRANSPORT is not set -# CONFIG_INET_XFRM_MODE_TUNNEL is not set -# CONFIG_INET_XFRM_MODE_BEET is not set # CONFIG_INET_DIAG is not set CONFIG_TCP_CONG_ADVANCED=y # CONFIG_TCP_CONG_BIC is not set # CONFIG_TCP_CONG_WESTWOOD is not set # CONFIG_TCP_CONG_HTCP is not set CONFIG_TCP_MD5SIG=y -CONFIG_IPV6=y CONFIG_INET6_AH=y CONFIG_INET6_ESP=y CONFIG_NETLABEL=y @@ -100,6 +84,7 @@ CONFIG_NF_CONNTRACK_FTP=y CONFIG_NF_CONNTRACK_IRC=y CONFIG_NF_CONNTRACK_SIP=y CONFIG_NF_CT_NETLINK=y +CONFIG_NF_NAT=y CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y CONFIG_NETFILTER_XT_TARGET_NFLOG=y CONFIG_NETFILTER_XT_TARGET_SECMARK=y @@ -107,14 +92,11 @@ CONFIG_NETFILTER_XT_TARGET_TCPMSS=y CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y CONFIG_NETFILTER_XT_MATCH_POLICY=y CONFIG_NETFILTER_XT_MATCH_STATE=y -CONFIG_NF_CONNTRACK_IPV4=y CONFIG_IP_NF_IPTABLES=y CONFIG_IP_NF_FILTER=y 
CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_NF_NAT=y -CONFIG_IP_NF_TARGET_MASQUERADE=y +CONFIG_IP_NF_TARGET_MASQUERADE=m CONFIG_IP_NF_MANGLE=y -CONFIG_NF_CONNTRACK_IPV6=y CONFIG_IP6_NF_IPTABLES=y CONFIG_IP6_NF_MATCH_IPV6HEADER=y CONFIG_IP6_NF_FILTER=y @@ -127,6 +109,11 @@ CONFIG_CFG80211=y CONFIG_MAC80211=y CONFIG_MAC80211_LEDS=y CONFIG_RFKILL=y +CONFIG_PCI=y +CONFIG_PCIEPORTBUS=y +CONFIG_HOTPLUG_PCI=y +CONFIG_PCCARD=y +CONFIG_YENTA=y CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_DEBUG_DEVRES=y @@ -163,15 +150,12 @@ CONFIG_FORCEDETH=y CONFIG_8139TOO=y CONFIG_R8169=y CONFIG_INPUT_POLLDEV=y -# CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_EVDEV=y CONFIG_INPUT_JOYSTICK=y CONFIG_INPUT_TABLET=y CONFIG_INPUT_TOUCHSCREEN=y CONFIG_INPUT_MISC=y -CONFIG_VT_HW_CONSOLE_BINDING=y # CONFIG_LEGACY_PTYS is not set -CONFIG_SERIAL_NONSTANDARD=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_NR_UARTS=32 @@ -180,6 +164,7 @@ CONFIG_SERIAL_8250_MANY_PORTS=y CONFIG_SERIAL_8250_SHARE_IRQ=y CONFIG_SERIAL_8250_DETECT_IRQ=y CONFIG_SERIAL_8250_RSA=y +CONFIG_SERIAL_NONSTANDARD=y CONFIG_HW_RANDOM=y # CONFIG_HW_RANDOM_INTEL is not set # CONFIG_HW_RANDOM_AMD is not set @@ -196,19 +181,15 @@ CONFIG_DRM_I915=y CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_EFI=y -# CONFIG_LCD_CLASS_DEVICE is not set CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set # CONFIG_LOGO_LINUX_VGA16 is not set CONFIG_SOUND=y CONFIG_SND=y +CONFIG_SND_HRTIMER=y CONFIG_SND_SEQUENCER=y CONFIG_SND_SEQ_DUMMY=y -CONFIG_SND_MIXER_OSS=y -CONFIG_SND_PCM_OSS=y -CONFIG_SND_SEQUENCER_OSS=y -CONFIG_SND_HRTIMER=y CONFIG_SND_HDA_INTEL=y CONFIG_SND_HDA_HWDEP=y CONFIG_HIDRAW=y @@ -229,12 +210,10 @@ CONFIG_USB_ANNOUNCE_NEW_DEVICES=y CONFIG_USB_MON=y CONFIG_USB_XHCI_HCD=y CONFIG_USB_EHCI_HCD=y -CONFIG_USB_EHCI_TT_NEWSCHED=y CONFIG_USB_OHCI_HCD=y CONFIG_USB_UHCI_HCD=y CONFIG_USB_PRINTER=y CONFIG_USB_STORAGE=y -CONFIG_EDAC=y CONFIG_RTC_CLASS=y # CONFIG_RTC_HCTOSYS is not set CONFIG_DMADEVICES=y @@ -242,7 +221,6 @@ CONFIG_EEEPC_LAPTOP=y CONFIG_AMD_IOMMU=y CONFIG_INTEL_IOMMU=y # CONFIG_INTEL_IOMMU_DEFAULT_ON is not set -CONFIG_EFI_VARS=y CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y @@ -268,27 +246,18 @@ CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ASCII=y CONFIG_NLS_ISO8859_1=y CONFIG_NLS_UTF8=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_PRINTK_TIME=y CONFIG_MAGIC_SYSRQ=y -# CONFIG_UNUSED_SYMBOLS is not set CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_STACK_USAGE=y # CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y -CONFIG_TIMER_STATS=y -CONFIG_DEBUG_STACK_USAGE=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_PROVIDE_OHCI1394_DMA_INIT=y CONFIG_EARLY_PRINTK_DBGP=y -CONFIG_DEBUG_STACKOVERFLOW=y -# CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_BOOT_PARAMS=y -CONFIG_UNWINDER_ORC=y -CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y -CONFIG_SECURITY_SELINUX=y -CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_DISABLE=y -# CONFIG_CRYPTO_ANSI_CPRNG is not set -CONFIG_EFI_STUB=y -CONFIG_EFI_MIXED=y -CONFIG_ACPI_BGRT=y diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S index ec437db1fa54..3f0fc7dd87d7 100644 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -63,7 +63,6 @@ */ #include <linux/linkage.h> -#include <asm/inst.h> #define VMOVDQ vmovdqu @@ -127,10 +126,6 @@ ddq_add_8: /* generate a unique variable for 
ddq_add_x */ -.macro setddq n - var_ddq_add = ddq_add_\n -.endm - /* generate a unique variable for xmm register */ .macro setxdata n var_xdata = %xmm\n @@ -140,9 +135,7 @@ ddq_add_8: .macro club name, id .altmacro - .if \name == DDQ_DATA - setddq %\id - .elseif \name == XDATA + .if \name == XDATA setxdata %\id .endif .noaltmacro @@ -165,9 +158,8 @@ ddq_add_8: .set i, 1 .rept (by - 1) - club DDQ_DATA, i club XDATA, i - vpaddq var_ddq_add(%rip), xcounter, var_xdata + vpaddq (ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata vptest ddq_low_msk(%rip), var_xdata jnz 1f vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata @@ -180,8 +172,7 @@ ddq_add_8: vmovdqa 1*16(p_keys), xkeyA vpxor xkey0, xdata0, xdata0 - club DDQ_DATA, by - vpaddq var_ddq_add(%rip), xcounter, xcounter + vpaddq (ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter vptest ddq_low_msk(%rip), xcounter jnz 1f vpaddq ddq_high_add_1(%rip), xcounter, xcounter diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 54e7d15dbd0d..1852b19a73a0 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -26,7 +26,6 @@ */ #include <linux/linkage.h> -#include <asm/inst.h> #include <asm/frame.h> #include <asm/nospec-branch.h> @@ -201,7 +200,7 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff mov \SUBKEY, %r12 movdqu (%r12), \TMP3 movdqa SHUF_MASK(%rip), \TMP2 - PSHUFB_XMM \TMP2, \TMP3 + pshufb \TMP2, \TMP3 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH) @@ -263,10 +262,10 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv movdqa SHUF_MASK(%rip), %xmm2 - PSHUFB_XMM %xmm2, %xmm0 + pshufb %xmm2, %xmm0 movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv - PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, + PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7 movdqu HashKey(%arg2), %xmm13 CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \ @@ -347,7 +346,7 @@ _zero_cipher_left_\@: paddd ONE(%rip), %xmm0 # INCR CNT to get Yn movdqu %xmm0, CurCount(%arg2) movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10, %xmm0 + pshufb %xmm10, %xmm0 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) movdqu %xmm0, PBlockEncKey(%arg2) @@ -377,7 +376,7 @@ _large_enough_update_\@: # get the appropriate shuffle mask movdqu (%r12), %xmm2 # shift right 16-r13 bytes - PSHUFB_XMM %xmm2, %xmm1 + pshufb %xmm2, %xmm1 _data_read_\@: lea ALL_F+16(%rip), %r12 @@ -393,12 +392,12 @@ _data_read_\@: .ifc \operation, dec pand %xmm1, %xmm2 movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10 ,%xmm2 + pshufb %xmm10 ,%xmm2 pxor %xmm2, %xmm8 .else movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10,%xmm0 + pshufb %xmm10,%xmm0 pxor %xmm0, %xmm8 .endif @@ -408,17 +407,17 @@ _data_read_\@: # GHASH computation for the last <16 byte block movdqa SHUF_MASK(%rip), %xmm10 # shuffle xmm0 back to output as ciphertext - PSHUFB_XMM %xmm10, %xmm0 + pshufb %xmm10, %xmm0 .endif # Output %r13 bytes - MOVQ_R64_XMM %xmm0, %rax + movq %xmm0, %rax cmp $8, %r13 jle _less_than_8_bytes_left_\@ mov %rax, (%arg3 , %r11, 1) add $8, %r11 psrldq $8, %xmm0 - MOVQ_R64_XMM %xmm0, %rax + movq %xmm0, %rax sub $8, %r13 _less_than_8_bytes_left_\@: mov %al, (%arg3, %r11, 1) @@ -449,7 +448,7 @@ _partial_done\@: movd %r12d, %xmm15 # len(A) in %xmm15 mov InLen(%arg2), %r12 shl $3, %r12 # len(C) in bits (*128) - MOVQ_R64_XMM %r12, %xmm1 + movq %r12, %xmm1 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 pxor %xmm1, %xmm15 # 
%xmm15 = len(A)||len(C) @@ -457,7 +456,7 @@ _partial_done\@: GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10, %xmm8 + pshufb %xmm10, %xmm8 movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) @@ -470,7 +469,7 @@ _return_T_\@: cmp $8, %r11 jl _T_4_\@ _T_8_\@: - MOVQ_R64_XMM %xmm0, %rax + movq %xmm0, %rax mov %rax, (%r10) add $8, %r10 sub $8, %r11 @@ -518,9 +517,9 @@ _return_T_done_\@: pshufd $78, \HK, \TMP3 pxor \GH, \TMP2 # TMP2 = a1+a0 pxor \HK, \TMP3 # TMP3 = b1+b0 - PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1 - PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0 - PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) + pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x00, \HK, \GH # GH = a0*b0 + pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) pxor \GH, \TMP2 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) movdqa \TMP2, \TMP3 @@ -570,7 +569,7 @@ _return_T_done_\@: cmp $8, \DLEN jl _read_lt8_\@ mov (\DPTR), %rax - MOVQ_R64_XMM %rax, \XMMDst + movq %rax, \XMMDst sub $8, \DLEN jz _done_read_partial_block_\@ xor %eax, %eax @@ -579,7 +578,7 @@ _read_next_byte_\@: mov 7(\DPTR, \DLEN, 1), %al dec \DLEN jnz _read_next_byte_\@ - MOVQ_R64_XMM %rax, \XMM1 + movq %rax, \XMM1 pslldq $8, \XMM1 por \XMM1, \XMMDst jmp _done_read_partial_block_\@ @@ -590,7 +589,7 @@ _read_next_byte_lt8_\@: mov -1(\DPTR, \DLEN, 1), %al dec \DLEN jnz _read_next_byte_lt8_\@ - MOVQ_R64_XMM %rax, \XMMDst + movq %rax, \XMMDst _done_read_partial_block_\@: .endm @@ -608,7 +607,7 @@ _done_read_partial_block_\@: jl _get_AAD_rest\@ _get_AAD_blocks\@: movdqu (%r10), \TMP7 - PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data + pshufb %xmm14, \TMP7 # byte-reflect the AAD data pxor \TMP7, \TMP6 GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 add $16, %r10 @@ -624,7 +623,7 @@ _get_AAD_rest\@: je _get_AAD_done\@ READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 - PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data + pshufb %xmm14, \TMP7 # byte-reflect the AAD data pxor \TMP6, \TMP7 GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 movdqu \TMP7, \TMP6 @@ -667,7 +666,7 @@ _data_read_\@: # Finished reading in data # r16-r13 is the number of bytes in plaintext mod 16) add %r13, %r12 movdqu (%r12), %xmm2 # get the appropriate shuffle mask - PSHUFB_XMM %xmm2, %xmm9 # shift right r13 bytes + pshufb %xmm2, %xmm9 # shift right r13 bytes .ifc \operation, dec movdqa %xmm1, %xmm3 @@ -689,8 +688,8 @@ _no_extra_mask_1_\@: pand %xmm1, %xmm3 movdqa SHUF_MASK(%rip), %xmm10 - PSHUFB_XMM %xmm10, %xmm3 - PSHUFB_XMM %xmm2, %xmm3 + pshufb %xmm10, %xmm3 + pshufb %xmm2, %xmm3 pxor %xmm3, \AAD_HASH cmp $0, %r10 @@ -724,8 +723,8 @@ _no_extra_mask_2_\@: pand %xmm1, %xmm9 movdqa SHUF_MASK(%rip), %xmm1 - PSHUFB_XMM %xmm1, %xmm9 - PSHUFB_XMM %xmm2, %xmm9 + pshufb %xmm1, %xmm9 + pshufb %xmm2, %xmm9 pxor %xmm9, \AAD_HASH cmp $0, %r10 @@ -744,8 +743,8 @@ _encode_done_\@: movdqa SHUF_MASK(%rip), %xmm10 # shuffle xmm9 back to output as ciphertext - PSHUFB_XMM %xmm10, %xmm9 - PSHUFB_XMM %xmm2, %xmm9 + pshufb %xmm10, %xmm9 + pshufb %xmm2, %xmm9 .endif # output encrypted Bytes cmp $0, %r10 @@ -759,14 +758,14 @@ _partial_fill_\@: mov \PLAIN_CYPH_LEN, %r13 _count_set_\@: movdqa %xmm9, %xmm0 - MOVQ_R64_XMM %xmm0, %rax + movq %xmm0, %rax cmp $8, %r13 jle _less_than_8_bytes_left_\@ mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) add $8, \DATA_OFFSET psrldq $8, %xmm0 - MOVQ_R64_XMM %xmm0, %rax + movq %xmm0, %rax sub $8, %r13 
_less_than_8_bytes_left_\@: movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) @@ -810,7 +809,7 @@ _partial_block_done_\@: .else MOVADQ \XMM0, %xmm\index .endif - PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap + pshufb %xmm14, %xmm\index # perform a 16 byte swap pxor \TMP2, %xmm\index .endr lea 0x10(%arg1),%r10 @@ -821,7 +820,7 @@ _partial_block_done_\@: aes_loop_initial_\@: MOVADQ (%r10),\TMP1 .irpc index, \i_seq - AESENC \TMP1, %xmm\index + aesenc \TMP1, %xmm\index .endr add $16,%r10 sub $1,%eax @@ -829,7 +828,7 @@ aes_loop_initial_\@: MOVADQ (%r10), \TMP1 .irpc index, \i_seq - AESENCLAST \TMP1, %xmm\index # Last Round + aesenclast \TMP1, %xmm\index # Last Round .endr .irpc index, \i_seq movdqu (%arg4 , %r11, 1), \TMP1 @@ -841,7 +840,7 @@ aes_loop_initial_\@: .ifc \operation, dec movdqa \TMP1, %xmm\index .endif - PSHUFB_XMM %xmm14, %xmm\index + pshufb %xmm14, %xmm\index # prepare plaintext/ciphertext for GHASH computation .endr @@ -876,19 +875,19 @@ aes_loop_initial_\@: MOVADQ ONE(%RIP),\TMP1 paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM1 - PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap + pshufb %xmm14, \XMM1 # perform a 16 byte swap paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM2 - PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap + pshufb %xmm14, \XMM2 # perform a 16 byte swap paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM3 - PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap + pshufb %xmm14, \XMM3 # perform a 16 byte swap paddd \TMP1, \XMM0 # INCR Y0 MOVADQ \XMM0, \XMM4 - PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap + pshufb %xmm14, \XMM4 # perform a 16 byte swap MOVADQ 0(%arg1),\TMP1 pxor \TMP1, \XMM1 @@ -897,17 +896,17 @@ aes_loop_initial_\@: pxor \TMP1, \XMM4 .irpc index, 1234 # do 4 rounds movaps 0x10*\index(%arg1), \TMP1 - AESENC \TMP1, \XMM1 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 + aesenc \TMP1, \XMM1 + aesenc \TMP1, \XMM2 + aesenc \TMP1, \XMM3 + aesenc \TMP1, \XMM4 .endr .irpc index, 56789 # do next 5 rounds movaps 0x10*\index(%arg1), \TMP1 - AESENC \TMP1, \XMM1 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 + aesenc \TMP1, \XMM1 + aesenc \TMP1, \XMM2 + aesenc \TMP1, \XMM3 + aesenc \TMP1, \XMM4 .endr lea 0xa0(%arg1),%r10 mov keysize,%eax @@ -918,7 +917,7 @@ aes_loop_initial_\@: aes_loop_pre_\@: MOVADQ (%r10),\TMP2 .irpc index, 1234 - AESENC \TMP2, %xmm\index + aesenc \TMP2, %xmm\index .endr add $16,%r10 sub $1,%eax @@ -926,10 +925,10 @@ aes_loop_pre_\@: aes_loop_pre_done\@: MOVADQ (%r10), \TMP2 - AESENCLAST \TMP2, \XMM1 - AESENCLAST \TMP2, \XMM2 - AESENCLAST \TMP2, \XMM3 - AESENCLAST \TMP2, \XMM4 + aesenclast \TMP2, \XMM1 + aesenclast \TMP2, \XMM2 + aesenclast \TMP2, \XMM3 + aesenclast \TMP2, \XMM4 movdqu 16*0(%arg4 , %r11 , 1), \TMP1 pxor \TMP1, \XMM1 .ifc \operation, dec @@ -961,12 +960,12 @@ aes_loop_pre_done\@: .endif add $64, %r11 - PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap + pshufb %xmm14, \XMM1 # perform a 16 byte swap pxor \XMMDst, \XMM1 # combine GHASHed value with the corresponding ciphertext - PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap - PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap - PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap + pshufb %xmm14, \XMM2 # perform a 16 byte swap + pshufb %xmm14, \XMM3 # perform a 16 byte swap + pshufb %xmm14, \XMM4 # perform a 16 byte swap _initial_blocks_done\@: @@ -978,7 +977,7 @@ _initial_blocks_done\@: * arg1, %arg3, %arg4 are used as pointers only, not modified * %r11 is the data offset value */ -.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 
TMP4 TMP5 \ +.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM1, \XMM5 @@ -994,7 +993,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pxor \XMM5, \TMP6 paddd ONE(%rip), \XMM0 # INCR CNT movdqu HashKey_4(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 + pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 movdqa \XMM0, \XMM1 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM2 @@ -1002,51 +1001,51 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM0, \XMM3 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM4 - PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap - PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 - PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + pshufb %xmm15, \XMM1 # perform a 16 byte swap + pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 + pshufb %xmm15, \XMM2 # perform a 16 byte swap + pshufb %xmm15, \XMM3 # perform a 16 byte swap + pshufb %xmm15, \XMM4 # perform a 16 byte swap pxor (%arg1), \XMM1 pxor (%arg1), \XMM2 pxor (%arg1), \XMM3 pxor (%arg1), \XMM4 movdqu HashKey_4_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) movaps 0x10(%arg1), \TMP1 - AESENC \TMP1, \XMM1 # Round 1 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 + aesenc \TMP1, \XMM1 # Round 1 + aesenc \TMP1, \XMM2 + aesenc \TMP1, \XMM3 + aesenc \TMP1, \XMM4 movaps 0x20(%arg1), \TMP1 - AESENC \TMP1, \XMM1 # Round 2 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 + aesenc \TMP1, \XMM1 # Round 2 + aesenc \TMP1, \XMM2 + aesenc \TMP1, \XMM3 + aesenc \TMP1, \XMM4 movdqa \XMM6, \TMP1 pshufd $78, \XMM6, \TMP2 pxor \XMM6, \TMP2 movdqu HashKey_3(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 movaps 0x30(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 3 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 - PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 + aesenc \TMP3, \XMM1 # Round 3 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 + pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 movaps 0x40(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 4 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 4 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 movdqu HashKey_3_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x50(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 5 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 5 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 pxor \TMP1, \TMP4 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part pxor \XMM6, \XMM5 @@ -1058,25 +1057,25 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation # Multiply TMP5 * HashKey using karatsuba - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x60(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 6 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 - PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 + aesenc \TMP3, \XMM1 # Round 6 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 + pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 movaps 
0x70(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 7 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 7 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 movdqu HashKey_2_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x80(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 8 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 8 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 pxor \TMP1, \TMP4 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part pxor \XMM7, \XMM5 @@ -1089,13 +1088,13 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pshufd $78, \XMM8, \TMP2 pxor \XMM8, \TMP2 movdqu HashKey(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x90(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 9 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 - PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 + aesenc \TMP3, \XMM1 # Round 9 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 + pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 lea 0xa0(%arg1),%r10 mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 @@ -1105,7 +1104,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation aes_loop_par_enc\@: MOVADQ (%r10),\TMP3 .irpc index, 1234 - AESENC \TMP3, %xmm\index + aesenc \TMP3, %xmm\index .endr add $16,%r10 sub $1,%eax @@ -1113,12 +1112,12 @@ aes_loop_par_enc\@: aes_loop_par_enc_done\@: MOVADQ (%r10), \TMP3 - AESENCLAST \TMP3, \XMM1 # Round 10 - AESENCLAST \TMP3, \XMM2 - AESENCLAST \TMP3, \XMM3 - AESENCLAST \TMP3, \XMM4 + aesenclast \TMP3, \XMM1 # Round 10 + aesenclast \TMP3, \XMM2 + aesenclast \TMP3, \XMM3 + aesenclast \TMP3, \XMM4 movdqu HashKey_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movdqu (%arg4,%r11,1), \TMP3 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK movdqu 16(%arg4,%r11,1), \TMP3 @@ -1131,10 +1130,10 @@ aes_loop_par_enc_done\@: movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer - PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + pshufb %xmm15, \XMM1 # perform a 16 byte swap + pshufb %xmm15, \XMM2 # perform a 16 byte swap + pshufb %xmm15, \XMM3 # perform a 16 byte swap + pshufb %xmm15, \XMM4 # perform a 16 byte swap pxor \TMP4, \TMP1 pxor \XMM8, \XMM5 @@ -1186,7 +1185,7 @@ aes_loop_par_enc_done\@: * arg1, %arg3, %arg4 are used as pointers only, not modified * %r11 is the data offset value */ -.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \ +.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM1, \XMM5 @@ -1202,7 +1201,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pxor \XMM5, \TMP6 paddd ONE(%rip), \XMM0 # INCR CNT movdqu HashKey_4(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 + pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 movdqa \XMM0, \XMM1 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM2 @@ -1210,51 +1209,51 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 
XMM8 operation movdqa \XMM0, \XMM3 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM4 - PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap - PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 - PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + pshufb %xmm15, \XMM1 # perform a 16 byte swap + pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 + pshufb %xmm15, \XMM2 # perform a 16 byte swap + pshufb %xmm15, \XMM3 # perform a 16 byte swap + pshufb %xmm15, \XMM4 # perform a 16 byte swap pxor (%arg1), \XMM1 pxor (%arg1), \XMM2 pxor (%arg1), \XMM3 pxor (%arg1), \XMM4 movdqu HashKey_4_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) movaps 0x10(%arg1), \TMP1 - AESENC \TMP1, \XMM1 # Round 1 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 + aesenc \TMP1, \XMM1 # Round 1 + aesenc \TMP1, \XMM2 + aesenc \TMP1, \XMM3 + aesenc \TMP1, \XMM4 movaps 0x20(%arg1), \TMP1 - AESENC \TMP1, \XMM1 # Round 2 - AESENC \TMP1, \XMM2 - AESENC \TMP1, \XMM3 - AESENC \TMP1, \XMM4 + aesenc \TMP1, \XMM1 # Round 2 + aesenc \TMP1, \XMM2 + aesenc \TMP1, \XMM3 + aesenc \TMP1, \XMM4 movdqa \XMM6, \TMP1 pshufd $78, \XMM6, \TMP2 pxor \XMM6, \TMP2 movdqu HashKey_3(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 movaps 0x30(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 3 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 - PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 + aesenc \TMP3, \XMM1 # Round 3 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 + pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 movaps 0x40(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 4 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 4 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 movdqu HashKey_3_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x50(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 5 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 5 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 pxor \TMP1, \TMP4 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part pxor \XMM6, \XMM5 @@ -1266,25 +1265,25 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation # Multiply TMP5 * HashKey using karatsuba - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x60(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 6 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 - PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 + aesenc \TMP3, \XMM1 # Round 6 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 + pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 movaps 0x70(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 7 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 7 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 movdqu HashKey_2_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movaps 0x80(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 8 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 + aesenc \TMP3, \XMM1 # Round 8 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + 
aesenc \TMP3, \XMM4 pxor \TMP1, \TMP4 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part pxor \XMM7, \XMM5 @@ -1297,13 +1296,13 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation pshufd $78, \XMM8, \TMP2 pxor \XMM8, \TMP2 movdqu HashKey(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 movaps 0x90(%arg1), \TMP3 - AESENC \TMP3, \XMM1 # Round 9 - AESENC \TMP3, \XMM2 - AESENC \TMP3, \XMM3 - AESENC \TMP3, \XMM4 - PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 + aesenc \TMP3, \XMM1 # Round 9 + aesenc \TMP3, \XMM2 + aesenc \TMP3, \XMM3 + aesenc \TMP3, \XMM4 + pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 lea 0xa0(%arg1),%r10 mov keysize,%eax shr $2,%eax # 128->4, 192->6, 256->8 @@ -1313,7 +1312,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation aes_loop_par_dec\@: MOVADQ (%r10),\TMP3 .irpc index, 1234 - AESENC \TMP3, %xmm\index + aesenc \TMP3, %xmm\index .endr add $16,%r10 sub $1,%eax @@ -1321,12 +1320,12 @@ aes_loop_par_dec\@: aes_loop_par_dec_done\@: MOVADQ (%r10), \TMP3 - AESENCLAST \TMP3, \XMM1 # last round - AESENCLAST \TMP3, \XMM2 - AESENCLAST \TMP3, \XMM3 - AESENCLAST \TMP3, \XMM4 + aesenclast \TMP3, \XMM1 # last round + aesenclast \TMP3, \XMM2 + aesenclast \TMP3, \XMM3 + aesenclast \TMP3, \XMM4 movdqu HashKey_k(%arg2), \TMP5 - PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movdqu (%arg4,%r11,1), \TMP3 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer @@ -1343,10 +1342,10 @@ aes_loop_par_dec_done\@: pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer movdqa \TMP3, \XMM4 - PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap - PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + pshufb %xmm15, \XMM1 # perform a 16 byte swap + pshufb %xmm15, \XMM2 # perform a 16 byte swap + pshufb %xmm15, \XMM3 # perform a 16 byte swap + pshufb %xmm15, \XMM4 # perform a 16 byte swap pxor \TMP4, \TMP1 pxor \XMM8, \XMM5 @@ -1402,10 +1401,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst pshufd $78, \XMM1, \TMP2 pxor \XMM1, \TMP2 movdqu HashKey_4(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 - PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 + pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1 + pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0 movdqu HashKey_4_k(%arg2), \TMP4 - PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movdqa \XMM1, \XMMDst movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 @@ -1415,10 +1414,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst pshufd $78, \XMM2, \TMP2 pxor \XMM2, \TMP2 movdqu HashKey_3(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0 movdqu HashKey_3_k(%arg2), \TMP4 - PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) pxor \TMP1, \TMP6 pxor \XMM2, \XMMDst pxor \TMP2, \XMM1 @@ -1430,10 +1429,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst pshufd $78, \XMM3, \TMP2 pxor \XMM3, \TMP2 movdqu HashKey_2(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x00, \TMP5, \XMM3 
# XMM3 = a0*b0 movdqu HashKey_2_k(%arg2), \TMP4 - PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) pxor \TMP1, \TMP6 pxor \XMM3, \XMMDst pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 @@ -1443,10 +1442,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst pshufd $78, \XMM4, \TMP2 pxor \XMM4, \TMP2 movdqu HashKey(%arg2), \TMP5 - PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 - PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 + pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0 movdqu HashKey_k(%arg2), \TMP4 - PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) pxor \TMP1, \TMP6 pxor \XMM4, \XMMDst pxor \XMM1, \TMP2 @@ -1504,13 +1503,13 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst _esb_loop_\@: MOVADQ (%r10),\TMP1 - AESENC \TMP1,\XMM0 + aesenc \TMP1,\XMM0 add $16,%r10 sub $1,%eax jnz _esb_loop_\@ MOVADQ (%r10),\TMP1 - AESENCLAST \TMP1,\XMM0 + aesenclast \TMP1,\XMM0 .endm /***************************************************************************** * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. @@ -1849,72 +1848,72 @@ SYM_FUNC_START(aesni_set_key) movups 0x10(UKEYP), %xmm2 # other user key movaps %xmm2, (TKEYP) add $0x10, TKEYP - AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 + aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 call _key_expansion_256a - AESKEYGENASSIST 0x1 %xmm0 %xmm1 + aeskeygenassist $0x1, %xmm0, %xmm1 call _key_expansion_256b - AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 + aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 call _key_expansion_256a - AESKEYGENASSIST 0x2 %xmm0 %xmm1 + aeskeygenassist $0x2, %xmm0, %xmm1 call _key_expansion_256b - AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 + aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 call _key_expansion_256a - AESKEYGENASSIST 0x4 %xmm0 %xmm1 + aeskeygenassist $0x4, %xmm0, %xmm1 call _key_expansion_256b - AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 + aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 call _key_expansion_256a - AESKEYGENASSIST 0x8 %xmm0 %xmm1 + aeskeygenassist $0x8, %xmm0, %xmm1 call _key_expansion_256b - AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 + aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 call _key_expansion_256a - AESKEYGENASSIST 0x10 %xmm0 %xmm1 + aeskeygenassist $0x10, %xmm0, %xmm1 call _key_expansion_256b - AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 + aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 call _key_expansion_256a - AESKEYGENASSIST 0x20 %xmm0 %xmm1 + aeskeygenassist $0x20, %xmm0, %xmm1 call _key_expansion_256b - AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 + aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 call _key_expansion_256a jmp .Ldec_key .Lenc_key192: movq 0x10(UKEYP), %xmm2 # other user key - AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 + aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 call _key_expansion_192a - AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 + aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 call _key_expansion_192b - AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 + aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 call _key_expansion_192a - AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 + aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 call _key_expansion_192b - AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 + aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 call _key_expansion_192a - AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 + aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 call _key_expansion_192b - AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 + 
aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 call _key_expansion_192a - AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8 + aeskeygenassist $0x80, %xmm2, %xmm1 # round 8 call _key_expansion_192b jmp .Ldec_key .Lenc_key128: - AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1 + aeskeygenassist $0x1, %xmm0, %xmm1 # round 1 call _key_expansion_128 - AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2 + aeskeygenassist $0x2, %xmm0, %xmm1 # round 2 call _key_expansion_128 - AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3 + aeskeygenassist $0x4, %xmm0, %xmm1 # round 3 call _key_expansion_128 - AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4 + aeskeygenassist $0x8, %xmm0, %xmm1 # round 4 call _key_expansion_128 - AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5 + aeskeygenassist $0x10, %xmm0, %xmm1 # round 5 call _key_expansion_128 - AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6 + aeskeygenassist $0x20, %xmm0, %xmm1 # round 6 call _key_expansion_128 - AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7 + aeskeygenassist $0x40, %xmm0, %xmm1 # round 7 call _key_expansion_128 - AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8 + aeskeygenassist $0x80, %xmm0, %xmm1 # round 8 call _key_expansion_128 - AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9 + aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9 call _key_expansion_128 - AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 + aeskeygenassist $0x36, %xmm0, %xmm1 # round 10 call _key_expansion_128 .Ldec_key: sub $0x10, TKEYP @@ -1927,7 +1926,7 @@ SYM_FUNC_START(aesni_set_key) .align 4 .Ldec_key_loop: movaps (KEYP), %xmm0 - AESIMC %xmm0 %xmm1 + aesimc %xmm0, %xmm1 movaps %xmm1, (UKEYP) add $0x10, KEYP sub $0x10, UKEYP @@ -1988,37 +1987,37 @@ SYM_FUNC_START_LOCAL(_aesni_enc1) je .Lenc192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps -0x50(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE .align 4 .Lenc192: movaps -0x40(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps -0x30(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE .align 4 .Lenc128: movaps -0x20(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps -0x10(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps (TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x10(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x20(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x30(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x40(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x50(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x60(TKEYP), KEY - AESENC KEY STATE + aesenc KEY, STATE movaps 0x70(TKEYP), KEY - AESENCLAST KEY STATE + aesenclast KEY, STATE ret SYM_FUNC_END(_aesni_enc1) @@ -2054,79 +2053,79 @@ SYM_FUNC_START_LOCAL(_aesni_enc4) je .L4enc192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps -0x50(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 #.align 4 .L4enc192: movaps -0x40(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps -0x30(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 #.align 4 .L4enc128: movaps 
-0x20(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps -0x10(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps (TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x10(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x20(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x30(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x40(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x50(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x60(TKEYP), KEY - AESENC KEY STATE1 - AESENC KEY STATE2 - AESENC KEY STATE3 - AESENC KEY STATE4 + aesenc KEY, STATE1 + aesenc KEY, STATE2 + aesenc KEY, STATE3 + aesenc KEY, STATE4 movaps 0x70(TKEYP), KEY - AESENCLAST KEY STATE1 # last round - AESENCLAST KEY STATE2 - AESENCLAST KEY STATE3 - AESENCLAST KEY STATE4 + aesenclast KEY, STATE1 # last round + aesenclast KEY, STATE2 + aesenclast KEY, STATE3 + aesenclast KEY, STATE4 ret SYM_FUNC_END(_aesni_enc4) @@ -2178,37 +2177,37 @@ SYM_FUNC_START_LOCAL(_aesni_dec1) je .Ldec192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps -0x50(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE .align 4 .Ldec192: movaps -0x40(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps -0x30(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE .align 4 .Ldec128: movaps -0x20(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps -0x10(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps (TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x10(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x20(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x30(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x40(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x50(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x60(TKEYP), KEY - AESDEC KEY STATE + aesdec KEY, STATE movaps 0x70(TKEYP), KEY - AESDECLAST KEY STATE + aesdeclast KEY, STATE ret SYM_FUNC_END(_aesni_dec1) @@ -2244,79 +2243,79 @@ SYM_FUNC_START_LOCAL(_aesni_dec4) je .L4dec192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps -0x50(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 .align 4 .L4dec192: movaps -0x40(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY 
STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps -0x30(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 .align 4 .L4dec128: movaps -0x20(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps -0x10(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps (TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x10(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x20(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x30(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x40(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x50(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x60(TKEYP), KEY - AESDEC KEY STATE1 - AESDEC KEY STATE2 - AESDEC KEY STATE3 - AESDEC KEY STATE4 + aesdec KEY, STATE1 + aesdec KEY, STATE2 + aesdec KEY, STATE3 + aesdec KEY, STATE4 movaps 0x70(TKEYP), KEY - AESDECLAST KEY STATE1 # last round - AESDECLAST KEY STATE2 - AESDECLAST KEY STATE3 - AESDECLAST KEY STATE4 + aesdeclast KEY, STATE1 # last round + aesdeclast KEY, STATE2 + aesdeclast KEY, STATE3 + aesdeclast KEY, STATE4 ret SYM_FUNC_END(_aesni_dec4) @@ -2599,10 +2598,10 @@ SYM_FUNC_END(aesni_cbc_dec) SYM_FUNC_START_LOCAL(_aesni_inc_init) movaps .Lbswap_mask, BSWAP_MASK movaps IV, CTR - PSHUFB_XMM BSWAP_MASK CTR + pshufb BSWAP_MASK, CTR mov $1, TCTR_LOW - MOVQ_R64_XMM TCTR_LOW INC - MOVQ_R64_XMM CTR TCTR_LOW + movq TCTR_LOW, INC + movq CTR, TCTR_LOW ret SYM_FUNC_END(_aesni_inc_init) @@ -2630,7 +2629,7 @@ SYM_FUNC_START_LOCAL(_aesni_inc) psrldq $8, INC .Linc_low: movaps CTR, IV - PSHUFB_XMM BSWAP_MASK IV + pshufb BSWAP_MASK, IV ret SYM_FUNC_END(_aesni_inc) diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S index 0cea33295287..5fee47956f3b 100644 --- a/arch/x86/crypto/aesni-intel_avx-x86_64.S +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -120,7 +120,6 @@ ## #include <linux/linkage.h> -#include <asm/inst.h> # constants in mergeable sections, linker can reorder and merge .section .rodata.cst16.POLY, "aM", @progbits, 16 diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S index a38ab2512a6f..ca1788bfee16 100644 --- a/arch/x86/crypto/chacha-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha-ssse3-x86_64.S @@ -120,10 +120,10 @@ SYM_FUNC_START(chacha_block_xor_ssse3) FRAME_BEGIN # x0..3 = s0..3 - movdqa 
0x00(%rdi),%xmm0 - movdqa 0x10(%rdi),%xmm1 - movdqa 0x20(%rdi),%xmm2 - movdqa 0x30(%rdi),%xmm3 + movdqu 0x00(%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqu 0x20(%rdi),%xmm2 + movdqu 0x30(%rdi),%xmm3 movdqa %xmm0,%xmm8 movdqa %xmm1,%xmm9 movdqa %xmm2,%xmm10 @@ -205,10 +205,10 @@ SYM_FUNC_START(hchacha_block_ssse3) # %edx: nrounds FRAME_BEGIN - movdqa 0x00(%rdi),%xmm0 - movdqa 0x10(%rdi),%xmm1 - movdqa 0x20(%rdi),%xmm2 - movdqa 0x30(%rdi),%xmm3 + movdqu 0x00(%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqu 0x20(%rdi),%xmm2 + movdqu 0x30(%rdi),%xmm3 mov %edx,%r8d call chacha_permute diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 22250091cdbe..e67a59130025 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -14,8 +14,6 @@ #include <linux/module.h> #include <asm/simd.h> -#define CHACHA_STATE_ALIGN 16 - asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, @@ -124,8 +122,6 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) { - state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); - if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) { hchacha_block_generic(state, stream, nrounds); } else { @@ -138,8 +134,6 @@ EXPORT_SYMBOL(hchacha_block_arch); void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) { - state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); - chacha_init_generic(state, key, iv); } EXPORT_SYMBOL(chacha_init_arch); @@ -147,8 +141,6 @@ EXPORT_SYMBOL(chacha_init_arch); void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) { - state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); - if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() || bytes <= CHACHA_BLOCK_SIZE) return chacha_crypt_generic(state, dst, src, bytes, nrounds); @@ -170,15 +162,12 @@ EXPORT_SYMBOL(chacha_crypt_arch); static int chacha_simd_stream_xor(struct skcipher_request *req, const struct chacha_ctx *ctx, const u8 *iv) { - u32 *state, state_buf[16 + 2] __aligned(8); + u32 state[CHACHA_STATE_WORDS] __aligned(8); struct skcipher_walk walk; int err; err = skcipher_walk_virt(&walk, req, false); - BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); - state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); - chacha_init_generic(state, ctx->key, iv); while (walk.nbytes > 0) { @@ -217,12 +206,10 @@ static int xchacha_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - u32 *state, state_buf[16 + 2] __aligned(8); + u32 state[CHACHA_STATE_WORDS] __aligned(8); struct chacha_ctx subctx; u8 real_iv[16]; - BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); - state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); chacha_init_generic(state, ctx->key, req->iv); if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) { diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S index 9fd28ff65bc2..6e7d4c4d3208 100644 --- a/arch/x86/crypto/crc32-pclmul_asm.S +++ b/arch/x86/crypto/crc32-pclmul_asm.S @@ -38,7 +38,6 @@ */ #include <linux/linkage.h> -#include <asm/inst.h> .section .rodata @@ -129,17 +128,17 @@ loop_64:/* 64 bytes Full cache line folding */ #ifdef __x86_64__ movdqa %xmm4, %xmm8 #endif - PCLMULQDQ 00, CONSTANT, %xmm1 - PCLMULQDQ 00, CONSTANT, %xmm2 - PCLMULQDQ 00, CONSTANT, %xmm3 + pclmulqdq $0x00, 
CONSTANT, %xmm1 + pclmulqdq $0x00, CONSTANT, %xmm2 + pclmulqdq $0x00, CONSTANT, %xmm3 #ifdef __x86_64__ - PCLMULQDQ 00, CONSTANT, %xmm4 + pclmulqdq $0x00, CONSTANT, %xmm4 #endif - PCLMULQDQ 0x11, CONSTANT, %xmm5 - PCLMULQDQ 0x11, CONSTANT, %xmm6 - PCLMULQDQ 0x11, CONSTANT, %xmm7 + pclmulqdq $0x11, CONSTANT, %xmm5 + pclmulqdq $0x11, CONSTANT, %xmm6 + pclmulqdq $0x11, CONSTANT, %xmm7 #ifdef __x86_64__ - PCLMULQDQ 0x11, CONSTANT, %xmm8 + pclmulqdq $0x11, CONSTANT, %xmm8 #endif pxor %xmm5, %xmm1 pxor %xmm6, %xmm2 @@ -149,8 +148,8 @@ loop_64:/* 64 bytes Full cache line folding */ #else /* xmm8 unsupported for x32 */ movdqa %xmm4, %xmm5 - PCLMULQDQ 00, CONSTANT, %xmm4 - PCLMULQDQ 0x11, CONSTANT, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm4 + pclmulqdq $0x11, CONSTANT, %xmm5 pxor %xmm5, %xmm4 #endif @@ -172,20 +171,20 @@ less_64:/* Folding cache line into 128bit */ prefetchnta (BUF) movdqa %xmm1, %xmm5 - PCLMULQDQ 0x00, CONSTANT, %xmm1 - PCLMULQDQ 0x11, CONSTANT, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 pxor %xmm5, %xmm1 pxor %xmm2, %xmm1 movdqa %xmm1, %xmm5 - PCLMULQDQ 0x00, CONSTANT, %xmm1 - PCLMULQDQ 0x11, CONSTANT, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 pxor %xmm5, %xmm1 pxor %xmm3, %xmm1 movdqa %xmm1, %xmm5 - PCLMULQDQ 0x00, CONSTANT, %xmm1 - PCLMULQDQ 0x11, CONSTANT, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 pxor %xmm5, %xmm1 pxor %xmm4, %xmm1 @@ -193,8 +192,8 @@ less_64:/* Folding cache line into 128bit */ jb fold_64 loop_16:/* Folding rest buffer into 128bit */ movdqa %xmm1, %xmm5 - PCLMULQDQ 0x00, CONSTANT, %xmm1 - PCLMULQDQ 0x11, CONSTANT, %xmm5 + pclmulqdq $0x00, CONSTANT, %xmm1 + pclmulqdq $0x11, CONSTANT, %xmm5 pxor %xmm5, %xmm1 pxor (BUF), %xmm1 sub $0x10, LEN @@ -205,7 +204,7 @@ loop_16:/* Folding rest buffer into 128bit */ fold_64: /* perform the last 64 bit fold, also adds 32 zeroes * to the input stream */ - PCLMULQDQ 0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ + pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ psrldq $0x08, %xmm1 pxor CONSTANT, %xmm1 @@ -220,7 +219,7 @@ fold_64: #endif psrldq $0x04, %xmm2 pand %xmm3, %xmm1 - PCLMULQDQ 0x00, CONSTANT, %xmm1 + pclmulqdq $0x00, CONSTANT, %xmm1 pxor %xmm2, %xmm1 /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ @@ -231,11 +230,11 @@ fold_64: #endif movdqa %xmm1, %xmm2 pand %xmm3, %xmm1 - PCLMULQDQ 0x10, CONSTANT, %xmm1 + pclmulqdq $0x10, CONSTANT, %xmm1 pand %xmm3, %xmm1 - PCLMULQDQ 0x00, CONSTANT, %xmm1 + pclmulqdq $0x00, CONSTANT, %xmm1 pxor %xmm2, %xmm1 - PEXTRD 0x01, %xmm1, %eax + pextrd $0x01, %xmm1, %eax ret SYM_FUNC_END(crc32_pclmul_le_16) diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 8501ec4532f4..884dc767b051 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -43,7 +43,6 @@ * SOFTWARE. 
*/ -#include <asm/inst.h> #include <linux/linkage.h> #include <asm/nospec-branch.h> @@ -170,7 +169,7 @@ continue_block: ## branch into array lea jump_table(%rip), %bufp - movzxw (%bufp, %rax, 2), len + movzwq (%bufp, %rax, 2), len lea crc_array(%rip), %bufp lea (%bufp, len, 1), %bufp JMP_NOSPEC bufp @@ -225,10 +224,10 @@ LABEL crc_ %i subq %rax, tmp # tmp -= rax*24 movq crc_init, %xmm1 # CRC for block 1 - PCLMULQDQ 0x00,%xmm0,%xmm1 # Multiply by K2 + pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 movq crc1, %xmm2 # CRC for block 2 - PCLMULQDQ 0x10, %xmm0, %xmm2 # Multiply by K1 + pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 pxor %xmm2,%xmm1 movq %xmm1, %rax diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c index 8a17621f7d3a..8acbb6584a37 100644 --- a/arch/x86/crypto/curve25519-x86_64.c +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -948,10 +948,8 @@ static void store_felem(u64 *b, u64 *f) { u64 f30 = f[3U]; u64 top_bit0 = f30 >> (u32)63U; - u64 carry0; u64 f31; u64 top_bit; - u64 carry; u64 f0; u64 f1; u64 f2; @@ -970,11 +968,11 @@ static void store_felem(u64 *b, u64 *f) u64 o2; u64 o3; f[3U] = f30 & (u64)0x7fffffffffffffffU; - carry0 = add_scalar(f, f, (u64)19U * top_bit0); + add_scalar(f, f, (u64)19U * top_bit0); f31 = f[3U]; top_bit = f31 >> (u32)63U; f[3U] = f31 & (u64)0x7fffffffffffffffU; - carry = add_scalar(f, f, (u64)19U * top_bit); + add_scalar(f, f, (u64)19U * top_bit); f0 = f[0U]; f1 = f[1U]; f2 = f[2U]; diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index bb9735fbb865..99ac25e18e09 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -14,7 +14,6 @@ */ #include <linux/linkage.h> -#include <asm/inst.h> #include <asm/frame.h> .section .rodata.cst16.bswap_mask, "aM", @progbits, 16 @@ -51,9 +50,9 @@ SYM_FUNC_START_LOCAL(__clmul_gf128mul_ble) pxor DATA, T2 pxor SHASH, T3 - PCLMULQDQ 0x00 SHASH DATA # DATA = a0 * b0 - PCLMULQDQ 0x11 SHASH T1 # T1 = a1 * b1 - PCLMULQDQ 0x00 T3 T2 # T2 = (a1 + a0) * (b1 + b0) + pclmulqdq $0x00, SHASH, DATA # DATA = a0 * b0 + pclmulqdq $0x11, SHASH, T1 # T1 = a1 * b1 + pclmulqdq $0x00, T3, T2 # T2 = (a1 + a0) * (b1 + b0) pxor DATA, T2 pxor T1, T2 # T2 = a0 * b1 + a1 * b0 @@ -95,9 +94,9 @@ SYM_FUNC_START(clmul_ghash_mul) movups (%rdi), DATA movups (%rsi), SHASH movaps .Lbswap_mask, BSWAP - PSHUFB_XMM BSWAP DATA + pshufb BSWAP, DATA call __clmul_gf128mul_ble - PSHUFB_XMM BSWAP DATA + pshufb BSWAP, DATA movups DATA, (%rdi) FRAME_END ret @@ -114,18 +113,18 @@ SYM_FUNC_START(clmul_ghash_update) movaps .Lbswap_mask, BSWAP movups (%rdi), DATA movups (%rcx), SHASH - PSHUFB_XMM BSWAP DATA + pshufb BSWAP, DATA .align 4 .Lupdate_loop: movups (%rsi), IN1 - PSHUFB_XMM BSWAP IN1 + pshufb BSWAP, IN1 pxor IN1, DATA call __clmul_gf128mul_ble sub $16, %rdx add $16, %rsi cmp $16, %rdx jge .Lupdate_loop - PSHUFB_XMM BSWAP DATA + pshufb BSWAP, DATA movups DATA, (%rdi) .Lupdate_just_ret: FRAME_END diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 4208c1e3f601..98e4d8886f11 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -6,6 +6,7 @@ #include <asm/percpu.h> #include <asm/asm-offsets.h> #include <asm/processor-flags.h> +#include <asm/inst.h> /* @@ -341,6 +342,12 @@ For 32-bit we have the following conventions - kernel is built with #endif .endm +.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req + rdgsbase \save_reg + GET_PERCPU_BASE \scratch_reg + wrgsbase \scratch_reg +.endm + #else /* 
CONFIG_X86_64 */ # undef UNWIND_HINT_IRET_REGS # define UNWIND_HINT_IRET_REGS @@ -351,3 +358,36 @@ For 32-bit we have the following conventions - kernel is built with call stackleak_erase #endif .endm + +#ifdef CONFIG_SMP + +/* + * CPU/node NR is loaded from the limit (size) field of a special segment + * descriptor entry in GDT. + */ +.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req + movq $__CPUNODE_SEG, \reg + lsl \reg, \reg +.endm + +/* + * Fetch the per-CPU GSBASE value for this processor and put it in @reg. + * We normally use %gs for accessing per-CPU data, but we are setting up + * %gs here and obviously can not use %gs itself to access per-CPU data. + */ +.macro GET_PERCPU_BASE reg:req + ALTERNATIVE \ + "LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \ + "RDPID \reg", \ + X86_FEATURE_RDPID + andq $VDSO_CPUNODE_MASK, \reg + movq __per_cpu_offset(, \reg, 8), \reg +.endm + +#else + +.macro GET_PERCPU_BASE reg:req + movq pcpu_unit_offsets(%rip), \reg +.endm + +#endif /* CONFIG_SMP */ diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index f09288431f28..48512c7944e7 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -10,20 +10,13 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> +#include <linux/entry-common.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/errno.h> #include <linux/ptrace.h> -#include <linux/tracehook.h> -#include <linux/audit.h> -#include <linux/seccomp.h> -#include <linux/signal.h> #include <linux/export.h> -#include <linux/context_tracking.h> -#include <linux/user-return-notifier.h> #include <linux/nospec.h> -#include <linux/uprobes.h> -#include <linux/livepatch.h> #include <linux/syscalls.h> #include <linux/uaccess.h> @@ -42,343 +35,12 @@ #include <asm/syscall.h> #include <asm/irq_stack.h> -#define CREATE_TRACE_POINTS -#include <trace/events/syscalls.h> - -/* Check that the stack and regs on entry from user mode are sane. */ -static noinstr void check_user_regs(struct pt_regs *regs) -{ - if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) { - /* - * Make sure that the entry code gave us a sensible EFLAGS - * register. Native because we want to check the actual CPU - * state, not the interrupt state as imagined by Xen. - */ - unsigned long flags = native_save_fl(); - WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF | - X86_EFLAGS_NT)); - - /* We think we came from user mode. Make sure pt_regs agrees. */ - WARN_ON_ONCE(!user_mode(regs)); - - /* - * All entries from user mode (except #DF) should be on the - * normal thread stack and should have user pt_regs in the - * correct location. - */ - WARN_ON_ONCE(!on_thread_stack()); - WARN_ON_ONCE(regs != task_pt_regs(current)); - } -} - -#ifdef CONFIG_CONTEXT_TRACKING -/** - * enter_from_user_mode - Establish state when coming from user mode - * - * Syscall entry disables interrupts, but user mode is traced as interrupts - * enabled. Also with NO_HZ_FULL RCU might be idle. 
- * - * 1) Tell lockdep that interrupts are disabled - * 2) Invoke context tracking if enabled to reactivate RCU - * 3) Trace interrupts off state - */ -static noinstr void enter_from_user_mode(void) -{ - enum ctx_state state = ct_state(); - - lockdep_hardirqs_off(CALLER_ADDR0); - user_exit_irqoff(); - - instrumentation_begin(); - CT_WARN_ON(state != CONTEXT_USER); - trace_hardirqs_off_finish(); - instrumentation_end(); -} -#else -static __always_inline void enter_from_user_mode(void) -{ - lockdep_hardirqs_off(CALLER_ADDR0); - instrumentation_begin(); - trace_hardirqs_off_finish(); - instrumentation_end(); -} -#endif - -/** - * exit_to_user_mode - Fixup state when exiting to user mode - * - * Syscall exit enables interrupts, but the kernel state is interrupts - * disabled when this is invoked. Also tell RCU about it. - * - * 1) Trace interrupts on state - * 2) Invoke context tracking if enabled to adjust RCU state - * 3) Clear CPU buffers if CPU is affected by MDS and the migitation is on. - * 4) Tell lockdep that interrupts are enabled - */ -static __always_inline void exit_to_user_mode(void) -{ - instrumentation_begin(); - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - instrumentation_end(); - - user_enter_irqoff(); - mds_user_clear_cpu_buffers(); - lockdep_hardirqs_on(CALLER_ADDR0); -} - -static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) -{ -#ifdef CONFIG_X86_64 - if (arch == AUDIT_ARCH_X86_64) { - audit_syscall_entry(regs->orig_ax, regs->di, - regs->si, regs->dx, regs->r10); - } else -#endif - { - audit_syscall_entry(regs->orig_ax, regs->bx, - regs->cx, regs->dx, regs->si); - } -} - -/* - * Returns the syscall nr to run (which should match regs->orig_ax) or -1 - * to skip the syscall. - */ -static long syscall_trace_enter(struct pt_regs *regs) -{ - u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; - - struct thread_info *ti = current_thread_info(); - unsigned long ret = 0; - u32 work; - - work = READ_ONCE(ti->flags); - - if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { - ret = tracehook_report_syscall_entry(regs); - if (ret || (work & _TIF_SYSCALL_EMU)) - return -1L; - } - -#ifdef CONFIG_SECCOMP - /* - * Do seccomp after ptrace, to catch any tracer changes. - */ - if (work & _TIF_SECCOMP) { - struct seccomp_data sd; - - sd.arch = arch; - sd.nr = regs->orig_ax; - sd.instruction_pointer = regs->ip; -#ifdef CONFIG_X86_64 - if (arch == AUDIT_ARCH_X86_64) { - sd.args[0] = regs->di; - sd.args[1] = regs->si; - sd.args[2] = regs->dx; - sd.args[3] = regs->r10; - sd.args[4] = regs->r8; - sd.args[5] = regs->r9; - } else -#endif - { - sd.args[0] = regs->bx; - sd.args[1] = regs->cx; - sd.args[2] = regs->dx; - sd.args[3] = regs->si; - sd.args[4] = regs->di; - sd.args[5] = regs->bp; - } - - ret = __secure_computing(&sd); - if (ret == -1) - return ret; - } -#endif - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_enter(regs, regs->orig_ax); - - do_audit_syscall_entry(regs, arch); - - return ret ?: regs->orig_ax; -} - -#define EXIT_TO_USERMODE_LOOP_FLAGS \ - (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING) - -static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) -{ - /* - * In order to return to user mode, we need to have IRQs off with - * none of EXIT_TO_USERMODE_LOOP_FLAGS set. Several of these flags - * can be set at any time on preemptible kernels if we have IRQs on, - * so we need to loop. 
Disabling preemption wouldn't help: doing the - * work to clear some of the flags can sleep. - */ - while (true) { - /* We have work to do. */ - local_irq_enable(); - - if (cached_flags & _TIF_NEED_RESCHED) - schedule(); - - if (cached_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); - - if (cached_flags & _TIF_PATCH_PENDING) - klp_update_patch_state(current); - - /* deal with pending signal delivery */ - if (cached_flags & _TIF_SIGPENDING) - do_signal(regs); - - if (cached_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } - - if (cached_flags & _TIF_USER_RETURN_NOTIFY) - fire_user_return_notifiers(); - - /* Disable IRQs and retry */ - local_irq_disable(); - - cached_flags = READ_ONCE(current_thread_info()->flags); - - if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) - break; - } -} - -static void __prepare_exit_to_usermode(struct pt_regs *regs) -{ - struct thread_info *ti = current_thread_info(); - u32 cached_flags; - - addr_limit_user_check(); - - lockdep_assert_irqs_disabled(); - lockdep_sys_exit(); - - cached_flags = READ_ONCE(ti->flags); - - if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) - exit_to_usermode_loop(regs, cached_flags); - - /* Reload ti->flags; we may have rescheduled above. */ - cached_flags = READ_ONCE(ti->flags); - - if (unlikely(cached_flags & _TIF_IO_BITMAP)) - tss_update_io_bitmap(); - - fpregs_assert_state_consistent(); - if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD)) - switch_fpu_return(); - -#ifdef CONFIG_COMPAT - /* - * Compat syscalls set TS_COMPAT. Make sure we clear it before - * returning to user mode. We need to clear it *after* signal - * handling, because syscall restart has a fixup for compat - * syscalls. The fixup is exercised by the ptrace_syscall_32 - * selftest. - * - * We also need to clear TS_REGS_POKED_I386: the 32-bit tracer - * special case only applies after poking regs and before the - * very next return to user mode. - */ - ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED); -#endif -} - -static noinstr void prepare_exit_to_usermode(struct pt_regs *regs) -{ - instrumentation_begin(); - __prepare_exit_to_usermode(regs); - instrumentation_end(); - exit_to_user_mode(); -} - -#define SYSCALL_EXIT_WORK_FLAGS \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ - _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) - -static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags) -{ - bool step; - - audit_syscall_exit(regs); - - if (cached_flags & _TIF_SYSCALL_TRACEPOINT) - trace_sys_exit(regs, regs->ax); - - /* - * If TIF_SYSCALL_EMU is set, we only get here because of - * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). - * We already reported this syscall instruction in - * syscall_trace_enter(). - */ - step = unlikely( - (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) - == _TIF_SINGLESTEP); - if (step || cached_flags & _TIF_SYSCALL_TRACE) - tracehook_report_syscall_exit(regs, step); -} - -static void __syscall_return_slowpath(struct pt_regs *regs) -{ - struct thread_info *ti = current_thread_info(); - u32 cached_flags = READ_ONCE(ti->flags); - - CT_WARN_ON(ct_state() != CONTEXT_KERNEL); - - if (IS_ENABLED(CONFIG_PROVE_LOCKING) && - WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax)) - local_irq_enable(); - - rseq_syscall(regs); - - /* - * First do one-time work. If these work items are enabled, we - * want to run them exactly once per syscall exit with IRQs on. 
- */ - if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) - syscall_slow_exit_work(regs, cached_flags); - - local_irq_disable(); - __prepare_exit_to_usermode(regs); -} - -/* - * Called with IRQs on and fully valid regs. Returns with IRQs off in a - * state such that we can immediately switch to user mode. - */ -__visible noinstr void syscall_return_slowpath(struct pt_regs *regs) -{ - instrumentation_begin(); - __syscall_return_slowpath(regs); - instrumentation_end(); - exit_to_user_mode(); -} - #ifdef CONFIG_X86_64 __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) { - struct thread_info *ti; + nr = syscall_enter_from_user_mode(regs, nr); - check_user_regs(regs); - - enter_from_user_mode(); instrumentation_begin(); - - local_irq_enable(); - ti = current_thread_info(); - if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) - nr = syscall_trace_enter(regs); - if (likely(nr < NR_syscalls)) { nr = array_index_nospec(nr, NR_syscalls); regs->ax = sys_call_table[nr](regs); @@ -390,66 +52,55 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs) regs->ax = x32_sys_call_table[nr](regs); #endif } - __syscall_return_slowpath(regs); - instrumentation_end(); - exit_to_user_mode(); + syscall_exit_to_user_mode(regs); } #endif #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) -/* - * Does a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. Does - * all entry and exit work and returns with IRQs off. This function is - * extremely hot in workloads that use it, and it's usually called from - * do_fast_syscall_32, so forcibly inline it to improve performance. - */ -static void do_syscall_32_irqs_on(struct pt_regs *regs) +static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs) { - struct thread_info *ti = current_thread_info(); unsigned int nr = (unsigned int)regs->orig_ax; -#ifdef CONFIG_IA32_EMULATION - ti->status |= TS_COMPAT; -#endif - - if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) { - /* - * Subtlety here: if ptrace pokes something larger than - * 2^32-1 into orig_ax, this truncates it. This may or - * may not be necessary, but it matches the old asm - * behavior. - */ - nr = syscall_trace_enter(regs); - } + if (IS_ENABLED(CONFIG_IA32_EMULATION)) + current_thread_info()->status |= TS_COMPAT; + /* + * Subtlety here: if ptrace pokes something larger than 2^32-1 into + * orig_ax, the unsigned int return value truncates it. This may + * or may not be necessary, but it matches the old asm behavior. + */ + return (unsigned int)syscall_enter_from_user_mode(regs, nr); +} +/* + * Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL. 
+ */ +static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, + unsigned int nr) +{ if (likely(nr < IA32_NR_syscalls)) { + instrumentation_begin(); nr = array_index_nospec(nr, IA32_NR_syscalls); regs->ax = ia32_sys_call_table[nr](regs); + instrumentation_end(); } - - __syscall_return_slowpath(regs); } /* Handles int $0x80 */ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) { - check_user_regs(regs); + unsigned int nr = syscall_32_enter(regs); - enter_from_user_mode(); - instrumentation_begin(); - - local_irq_enable(); - do_syscall_32_irqs_on(regs); - - instrumentation_end(); - exit_to_user_mode(); + do_syscall_32_irqs_on(regs, nr); + syscall_exit_to_user_mode(regs); } -static bool __do_fast_syscall_32(struct pt_regs *regs) +static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) { + unsigned int nr = syscall_32_enter(regs); int res; + instrumentation_begin(); /* Fetch EBP from where the vDSO stashed it. */ if (IS_ENABLED(CONFIG_X86_64)) { /* @@ -462,17 +113,18 @@ static bool __do_fast_syscall_32(struct pt_regs *regs) res = get_user(*(u32 *)®s->bp, (u32 __user __force *)(unsigned long)(u32)regs->sp); } + instrumentation_end(); if (res) { /* User code screwed up. */ regs->ax = -EFAULT; - local_irq_disable(); - __prepare_exit_to_usermode(regs); + syscall_exit_to_user_mode(regs); return false; } /* Now this is just like a normal syscall. */ - do_syscall_32_irqs_on(regs); + do_syscall_32_irqs_on(regs, nr); + syscall_exit_to_user_mode(regs); return true; } @@ -485,9 +137,6 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) */ unsigned long landing_pad = (unsigned long)current->mm->context.vdso + vdso_image_32.sym_int80_landing_pad; - bool success; - - check_user_regs(regs); /* * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward @@ -496,17 +145,8 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) */ regs->ip = landing_pad; - enter_from_user_mode(); - instrumentation_begin(); - - local_irq_enable(); - success = __do_fast_syscall_32(regs); - - instrumentation_end(); - exit_to_user_mode(); - - /* If it failed, keep it simple: use IRET. */ - if (!success) + /* Invoke the syscall. If it failed, keep it simple: use IRET. */ + if (!__do_fast_syscall_32(regs)) return 0; #ifdef CONFIG_X86_64 @@ -558,197 +198,38 @@ SYSCALL_DEFINE0(ni_syscall) return -ENOSYS; } -/** - * idtentry_enter_cond_rcu - Handle state tracking on idtentry with conditional - * RCU handling - * @regs: Pointer to pt_regs of interrupted context - * - * Invokes: - * - lockdep irqflag state tracking as low level ASM entry disabled - * interrupts. - * - * - Context tracking if the exception hit user mode. - * - * - The hardirq tracer to keep the state consistent as low level ASM - * entry disabled interrupts. - * - * For kernel mode entries RCU handling is done conditional. If RCU is - * watching then the only RCU requirement is to check whether the tick has - * to be restarted. If RCU is not watching then rcu_irq_enter() has to be - * invoked on entry and rcu_irq_exit() on exit. - * - * Avoiding the rcu_irq_enter/exit() calls is an optimization but also - * solves the problem of kernel mode pagefaults which can schedule, which - * is not possible after invoking rcu_irq_enter() without undoing it. - * - * For user mode entries enter_from_user_mode() must be invoked to - * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit - * would not be possible. 
- * - * Returns: True if RCU has been adjusted on a kernel entry - * False otherwise - * - * The return value must be fed into the rcu_exit argument of - * idtentry_exit_cond_rcu(). - */ -bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs) +noinstr bool idtentry_enter_nmi(struct pt_regs *regs) { - if (user_mode(regs)) { - check_user_regs(regs); - enter_from_user_mode(); - return false; - } + bool irq_state = lockdep_hardirqs_enabled(); - /* - * If this entry hit the idle task invoke rcu_irq_enter() whether - * RCU is watching or not. - * - * Interupts can nest when the first interrupt invokes softirq - * processing on return which enables interrupts. - * - * Scheduler ticks in the idle task can mark quiescent state and - * terminate a grace period, if and only if the timer interrupt is - * not nested into another interrupt. - * - * Checking for __rcu_is_watching() here would prevent the nesting - * interrupt to invoke rcu_irq_enter(). If that nested interrupt is - * the tick then rcu_flavor_sched_clock_irq() would wrongfully - * assume that it is the first interupt and eventually claim - * quiescient state and end grace periods prematurely. - * - * Unconditionally invoke rcu_irq_enter() so RCU state stays - * consistent. - * - * TINY_RCU does not support EQS, so let the compiler eliminate - * this part when enabled. - */ - if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { - /* - * If RCU is not watching then the same careful - * sequence vs. lockdep and tracing is required - * as in enter_from_user_mode(). - */ - lockdep_hardirqs_off(CALLER_ADDR0); - rcu_irq_enter(); - instrumentation_begin(); - trace_hardirqs_off_finish(); - instrumentation_end(); - - return true; - } + __nmi_enter(); + lockdep_hardirqs_off(CALLER_ADDR0); + lockdep_hardirq_enter(); + rcu_nmi_enter(); - /* - * If RCU is watching then RCU only wants to check whether it needs - * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick() - * already contains a warning when RCU is not watching, so no point - * in having another one here. - */ instrumentation_begin(); - rcu_irq_enter_check_tick(); - /* Use the combo lockdep/tracing function */ - trace_hardirqs_off(); + trace_hardirqs_off_finish(); + ftrace_nmi_enter(); instrumentation_end(); - return false; -} - -static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched) -{ - if (may_sched && !preempt_count()) { - /* Sanity check RCU and thread stack */ - rcu_irq_exit_check_preempt(); - if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) - WARN_ON_ONCE(!on_thread_stack()); - if (need_resched()) - preempt_schedule_irq(); - } - /* Covers both tracing and lockdep */ - trace_hardirqs_on(); + return irq_state; } -/** - * idtentry_exit_cond_rcu - Handle return from exception with conditional RCU - * handling - * @regs: Pointer to pt_regs (exception entry regs) - * @rcu_exit: Invoke rcu_irq_exit() if true - * - * Depending on the return target (kernel/user) this runs the necessary - * preemption and work checks if possible and reguired and returns to - * the caller with interrupts disabled and no further work pending. - * - * This is the last action before returning to the low level ASM code which - * just needs to return to the appropriate context. - * - * Counterpart to idtentry_enter_cond_rcu(). The return value of the entry - * function must be fed into the @rcu_exit argument. 
- */ -void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit) +noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool restore) { - lockdep_assert_irqs_disabled(); - - /* Check whether this returns to user mode */ - if (user_mode(regs)) { - prepare_exit_to_usermode(regs); - } else if (regs->flags & X86_EFLAGS_IF) { - /* - * If RCU was not watching on entry this needs to be done - * carefully and needs the same ordering of lockdep/tracing - * and RCU as the return to user mode path. - */ - if (rcu_exit) { - instrumentation_begin(); - /* Tell the tracer that IRET will enable interrupts */ - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - instrumentation_end(); - rcu_irq_exit(); - lockdep_hardirqs_on(CALLER_ADDR0); - return; - } - - instrumentation_begin(); - idtentry_exit_cond_resched(regs, IS_ENABLED(CONFIG_PREEMPTION)); - instrumentation_end(); - } else { - /* - * IRQ flags state is correct already. Just tell RCU if it - * was not watching on entry. - */ - if (rcu_exit) - rcu_irq_exit(); + instrumentation_begin(); + ftrace_nmi_exit(); + if (restore) { + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); } -} - -/** - * idtentry_enter_user - Handle state tracking on idtentry from user mode - * @regs: Pointer to pt_regs of interrupted context - * - * Invokes enter_from_user_mode() to establish the proper context for - * NOHZ_FULL. Otherwise scheduling on exit would not be possible. - */ -void noinstr idtentry_enter_user(struct pt_regs *regs) -{ - check_user_regs(regs); - enter_from_user_mode(); -} - -/** - * idtentry_exit_user - Handle return from exception to user mode - * @regs: Pointer to pt_regs (exception entry regs) - * - * Runs the necessary preemption and work checks and returns to the caller - * with interrupts disabled and no further work pending. - * - * This is the last action before returning to the low level ASM code which - * just needs to return to the appropriate context. - * - * Counterpart to idtentry_enter_user(). - */ -void noinstr idtentry_exit_user(struct pt_regs *regs) -{ - lockdep_assert_irqs_disabled(); + instrumentation_end(); - prepare_exit_to_usermode(regs); + rcu_nmi_exit(); + lockdep_hardirq_exit(); + if (restore) + lockdep_hardirqs_on(CALLER_ADDR0); + __nmi_exit(); } #ifdef CONFIG_XEN_PV @@ -800,9 +281,10 @@ static void __xen_pv_evtchn_do_upcall(void) __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) { struct pt_regs *old_regs; - bool inhcall, rcu_exit; + bool inhcall; + irqentry_state_t state; - rcu_exit = idtentry_enter_cond_rcu(regs); + state = irqentry_enter(regs); old_regs = set_irq_regs(regs); instrumentation_begin(); @@ -812,13 +294,13 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) set_irq_regs(old_regs); inhcall = get_and_clear_inhcall(); - if (inhcall && !WARN_ON_ONCE(rcu_exit)) { + if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) { instrumentation_begin(); - idtentry_exit_cond_resched(regs, true); + irqentry_exit_cond_resched(); instrumentation_end(); restore_inhcall(inhcall); } else { - idtentry_exit_cond_rcu(regs, rcu_exit); + irqentry_exit(regs, state); } } #endif /* CONFIG_XEN_PV */ diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 2d0bd5d5f032..29b7d52143e9 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -846,7 +846,7 @@ SYM_CODE_START(ret_from_fork) 2: /* When we fork, we trace the syscall return in the child, too. 
*/ movl %esp, %eax - call syscall_return_slowpath + call syscall_exit_to_user_mode jmp .Lsyscall_32_done /* kernel thread */ @@ -854,7 +854,7 @@ SYM_CODE_START(ret_from_fork) CALL_NOSPEC ebx /* * A kernel thread is allowed to return here after successfully - * calling do_execve(). Exit to userspace to complete the execve() + * calling kernel_execve(). Exit to userspace to complete the execve() * syscall. */ movl $0, PT_EAX(%esp) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d2a00c97e53f..70dea9337816 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -38,6 +38,7 @@ #include <asm/frame.h> #include <asm/trapnr.h> #include <asm/nospec-branch.h> +#include <asm/fsgsbase.h> #include <linux/err.h> #include "calling.h" @@ -283,7 +284,7 @@ SYM_CODE_START(ret_from_fork) 2: UNWIND_HINT_REGS movq %rsp, %rdi - call syscall_return_slowpath /* returns with IRQs disabled */ + call syscall_exit_to_user_mode /* returns with IRQs disabled */ jmp swapgs_restore_regs_and_return_to_usermode 1: @@ -293,7 +294,7 @@ SYM_CODE_START(ret_from_fork) CALL_NOSPEC rbx /* * A kernel thread is allowed to return here after successfully - * calling do_execve(). Exit to userspace to complete the execve() + * calling kernel_execve(). Exit to userspace to complete the execve() * syscall. */ movq $0, RAX(%rsp) @@ -426,10 +427,7 @@ SYM_CODE_START(\asmsym) testb $3, CS-ORIG_RAX(%rsp) jnz .Lfrom_usermode_switch_stack_\@ - /* - * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX. - * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS - */ + /* paranoid_entry returns GS information for paranoid_exit in EBX. */ call paranoid_entry UNWIND_HINT_REGS @@ -458,10 +456,7 @@ SYM_CODE_START(\asmsym) UNWIND_HINT_IRET_REGS offset=8 ASM_CLAC - /* - * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX. - * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS - */ + /* paranoid_entry returns GS information for paranoid_exit in EBX. */ call paranoid_entry UNWIND_HINT_REGS @@ -798,24 +793,21 @@ SYM_CODE_END(xen_failsafe_callback) #endif /* CONFIG_XEN_PV */ /* - * Save all registers in pt_regs, and switch gs if needed. - * Use slow, but surefire "are we in kernel?" check. - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + * Save all registers in pt_regs. Return GSBASE related information + * in EBX depending on the availability of the FSGSBASE instructions: + * + * FSGSBASE R/EBX + * N 0 -> SWAPGS on exit + * 1 -> no SWAPGS on exit + * + * Y GSBASE value at entry, must be restored in paranoid_exit */ SYM_CODE_START_LOCAL(paranoid_entry) UNWIND_HINT_FUNC cld PUSH_AND_CLEAR_REGS save_ret=1 ENCODE_FRAME_POINTER 8 - movl $1, %ebx - movl $MSR_GS_BASE, %ecx - rdmsr - testl %edx, %edx - js 1f /* negative -> in kernel */ - SWAPGS - xorl %ebx, %ebx -1: /* * Always stash CR3 in %r14. This value will be restored, * verbatim, at exit. Needed if paranoid_entry interrupted @@ -825,16 +817,60 @@ SYM_CODE_START_LOCAL(paranoid_entry) * This is also why CS (stashed in the "iret frame" by the * hardware at entry) can not be used: this may be a return * to kernel code, but with a user CR3 value. + * + * Switching CR3 does not depend on kernel GSBASE so it can + * be done before switching to the kernel GSBASE. This is + * required for FSGSBASE because the kernel GSBASE has to + * be retrieved from a kernel internal table. */ SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 /* + * Handling GSBASE depends on the availability of FSGSBASE. 
+ * + * Without FSGSBASE the kernel enforces that negative GSBASE + * values indicate kernel GSBASE. With FSGSBASE no assumptions + * can be made about the GSBASE value when entering from user + * space. + */ + ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE + + /* + * Read the current GSBASE and store it in %rbx unconditionally, + * retrieve and set the current CPUs kernel GSBASE. The stored value + * has to be restored in paranoid_exit unconditionally. + * + * The MSR write ensures that no subsequent load is based on a + * mispredicted GSBASE. No extra FENCE required. + */ + SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx + ret + +.Lparanoid_entry_checkgs: + /* EBX = 1 -> kernel GSBASE active, no restore required */ + movl $1, %ebx + /* + * The kernel-enforced convention is a negative GSBASE indicates + * a kernel value. No SWAPGS needed on entry and exit. + */ + movl $MSR_GS_BASE, %ecx + rdmsr + testl %edx, %edx + jns .Lparanoid_entry_swapgs + ret + +.Lparanoid_entry_swapgs: + SWAPGS + + /* * The above SAVE_AND_SWITCH_TO_KERNEL_CR3 macro doesn't do an * unconditional CR3 write, even in the PTI case. So do an lfence * to prevent GS speculation, regardless of whether PTI is enabled. */ FENCE_SWAPGS_KERNEL_ENTRY + /* EBX = 0 -> SWAPGS required on exit */ + xorl %ebx, %ebx ret SYM_CODE_END(paranoid_entry) @@ -845,23 +881,45 @@ SYM_CODE_END(paranoid_entry) * * We may be returning to very strange contexts (e.g. very early * in syscall entry), so checking for preemption here would - * be complicated. Fortunately, we there's no good reason - * to try to handle preemption here. + * be complicated. Fortunately, there's no good reason to try + * to handle preemption here. + * + * R/EBX contains the GSBASE related information depending on the + * availability of the FSGSBASE instructions: + * + * FSGSBASE R/EBX + * N 0 -> SWAPGS on exit + * 1 -> no SWAPGS on exit * - * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) + * Y User space GSBASE, must be restored unconditionally */ SYM_CODE_START_LOCAL(paranoid_exit) UNWIND_HINT_REGS - testl %ebx, %ebx /* swapgs needed? */ - jnz .Lparanoid_exit_no_swapgs - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 + /* + * The order of operations is important. RESTORE_CR3 requires + * kernel GSBASE. + * + * NB to anyone to try to optimize this code: this code does + * not execute at all for exceptions from user mode. Those + * exceptions go through error_exit instead. + */ + RESTORE_CR3 scratch_reg=%rax save_reg=%r14 + + /* Handle the three GSBASE cases */ + ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "", X86_FEATURE_FSGSBASE + + /* With FSGSBASE enabled, unconditionally restore GSBASE */ + wrgsbase %rbx + jmp restore_regs_and_return_to_kernel + +.Lparanoid_exit_checkgs: + /* On non-FSGSBASE systems, conditionally do SWAPGS */ + testl %ebx, %ebx + jnz restore_regs_and_return_to_kernel + + /* We are returning to a context with user GSBASE */ SWAPGS_UNSAFE_STACK - jmp restore_regs_and_return_to_kernel -.Lparanoid_exit_no_swapgs: - /* Always restore stashed CR3 value (see paranoid_entry) */ - RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 - jmp restore_regs_and_return_to_kernel + jmp restore_regs_and_return_to_kernel SYM_CODE_END(paranoid_exit) /* @@ -1266,10 +1324,27 @@ end_repeat_nmi: /* Always restore stashed CR3 value (see paranoid_entry) */ RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 - testl %ebx, %ebx /* swapgs needed? 
*/ + /* + * The above invocation of paranoid_entry stored the GSBASE + * related information in R/EBX depending on the availability + * of FSGSBASE. + * + * If FSGSBASE is enabled, restore the saved GSBASE value + * unconditionally, otherwise take the conditional SWAPGS path. + */ + ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE + + wrgsbase %rbx + jmp nmi_restore + +nmi_no_fsgsbase: + /* EBX == 0 -> invoke SWAPGS */ + testl %ebx, %ebx jnz nmi_restore + nmi_swapgs: SWAPGS_UNSAFE_STACK + nmi_restore: POP_REGS diff --git a/arch/x86/entry/syscall_x32.c b/arch/x86/entry/syscall_x32.c index 3d8d70d3896c..1583831f61a9 100644 --- a/arch/x86/entry/syscall_x32.c +++ b/arch/x86/entry/syscall_x32.c @@ -8,6 +8,13 @@ #include <asm/unistd.h> #include <asm/syscall.h> +/* + * Reuse the 64-bit entry points for the x32 versions that occupy different + * slots in the syscall table. + */ +#define __x32_sys_getsockopt __x64_sys_getsockopt +#define __x32_sys_setsockopt __x64_sys_setsockopt + #define __SYSCALL_64(nr, sym) #define __SYSCALL_X32(nr, sym) extern long __x32_##sym(const struct pt_regs *); diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index d8f8a1a69ed1..e31a75262c9c 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -376,8 +376,8 @@ 362 i386 connect sys_connect 363 i386 listen sys_listen 364 i386 accept4 sys_accept4 -365 i386 getsockopt sys_getsockopt compat_sys_getsockopt -366 i386 setsockopt sys_setsockopt compat_sys_setsockopt +365 i386 getsockopt sys_getsockopt sys_getsockopt +366 i386 setsockopt sys_setsockopt sys_setsockopt 367 i386 getsockname sys_getsockname 368 i386 getpeername sys_getpeername 369 i386 sendto sys_sendto @@ -440,6 +440,7 @@ 433 i386 fspick sys_fspick 434 i386 pidfd_open sys_pidfd_open 435 i386 clone3 sys_clone3 +436 i386 close_range sys_close_range 437 i386 openat2 sys_openat2 438 i386 pidfd_getfd sys_pidfd_getfd 439 i386 faccessat2 sys_faccessat2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 78847b32e137..9d82078c949a 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -357,6 +357,7 @@ 433 common fspick sys_fspick 434 common pidfd_open sys_pidfd_open 435 common clone3 sys_clone3 +436 common close_range sys_close_range 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 @@ -396,8 +397,8 @@ 538 x32 sendmmsg compat_sys_sendmmsg 539 x32 process_vm_readv compat_sys_process_vm_readv 540 x32 process_vm_writev compat_sys_process_vm_writev -541 x32 setsockopt compat_sys_setsockopt -542 x32 getsockopt compat_sys_getsockopt +541 x32 setsockopt sys_setsockopt +542 x32 getsockopt sys_getsockopt 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit 545 x32 execveat compat_sys_execveat diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 04e65f0698f6..215376d975a2 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -82,7 +82,7 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE # optimize sibling calls. 
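The paranoid_entry/paranoid_exit and NMI hunks in entry_64.S above all follow one convention for GSBASE handling: on FSGSBASE-capable CPUs the entry GSBASE value is stashed in RBX and restored verbatim on exit, otherwise EBX carries a "SWAPGS needed" flag derived from the sign of MSR_GS_BASE. The C model below is purely illustrative and not kernel code; the simulated gsbase/kernel_gs_base variables, the per-CPU base constant and the helper names are invented for the sketch.

	#include <stdio.h>

	/* Simulated registers/MSRs: stand-ins for GSBASE and MSR_KERNEL_GS_BASE. */
	static unsigned long gsbase         = 0x00007f1234560000UL; /* user value   */
	static unsigned long kernel_gs_base = 0xffff888000042000UL; /* kernel value */

	static void swapgs(void)	/* SWAPGS swaps GSBASE with the shadow MSR */
	{
		unsigned long tmp = gsbase;
		gsbase = kernel_gs_base;
		kernel_gs_base = tmp;
	}

	struct gs_state {
		unsigned long saved;	/* FSGSBASE: GSBASE at entry (kept in RBX)  */
		int no_swapgs;		/* !FSGSBASE: EBX flag, 1 = skip SWAPGS     */
	};

	static struct gs_state paranoid_entry_gs(int have_fsgsbase)
	{
		struct gs_state st = { .saved = 0, .no_swapgs = 1 };

		if (have_fsgsbase) {
			st.saved = gsbase;			/* rdgsbase %rbx       */
			gsbase   = 0xffff888000042000UL;	/* per-CPU kernel base */
		} else if (!(gsbase >> 63)) {			/* non-negative: user  */
			swapgs();
			st.no_swapgs = 0;			/* EBX = 0             */
		}
		return st;
	}

	static void paranoid_exit_gs(int have_fsgsbase, struct gs_state st)
	{
		if (have_fsgsbase)
			gsbase = st.saved;			/* wrgsbase %rbx       */
		else if (!st.no_swapgs)
			swapgs();
	}

	int main(void)
	{
		struct gs_state st = paranoid_entry_gs(0);
		paranoid_exit_gs(0, st);
		printf("user GSBASE restored: %#lx\n", gsbase);
		return 0;
	}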
# CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ - $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ + $(filter -g%,$(KBUILD_CFLAGS)) -fno-stack-protector \ -fno-omit-frame-pointer -foptimize-sibling-calls \ -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO @@ -151,7 +151,7 @@ KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32)) KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic -KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) +KBUILD_CFLAGS_32 += -fno-stack-protector KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index ea7c1f0b79df..9185cb1d13b9 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -144,8 +144,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) struct mm_struct *mm = task->mm; struct vm_area_struct *vma; - if (mmap_write_lock_killable(mm)) - return -EINTR; + mmap_read_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) { unsigned long size = vma->vm_end - vma->vm_start; @@ -154,7 +153,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) zap_page_range(vma, vma->vm_start, size); } - mmap_write_unlock(mm); + mmap_read_unlock(mm); return 0; } #else diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c index 43b09e9c93a2..16a2369c586e 100644 --- a/arch/x86/events/amd/power.c +++ b/arch/x86/events/amd/power.c @@ -13,10 +13,6 @@ #include <asm/cpu_device_id.h> #include "../perf_event.h" -#define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a -#define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b -#define MSR_F15H_PTSC 0xc0010280 - /* Event code: LSB 8 bits, passed in attr->config any other bit is reserved. */ #define AMD_POWER_EVENT_MASK 0xFFULL diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 4103665c6e03..1cbf57dc2ac8 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -71,10 +71,9 @@ u64 x86_perf_event_update(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int shift = 64 - x86_pmu.cntval_bits; u64 prev_raw_count, new_raw_count; - int idx = hwc->idx; u64 delta; - if (idx == INTEL_PMC_IDX_FIXED_BTS) + if (unlikely(!hwc->event_base)) return 0; /* @@ -359,6 +358,7 @@ void x86_release_hardware(void) if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { release_pmc_hardware(); release_ds_buffers(); + release_lbr_buffers(); mutex_unlock(&pmc_reserve_mutex); } } @@ -1097,22 +1097,31 @@ static inline void x86_assign_hw_event(struct perf_event *event, struct cpu_hw_events *cpuc, int i) { struct hw_perf_event *hwc = &event->hw; + int idx; - hwc->idx = cpuc->assign[i]; + idx = hwc->idx = cpuc->assign[i]; hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; - if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { + switch (hwc->idx) { + case INTEL_PMC_IDX_FIXED_BTS: + case INTEL_PMC_IDX_FIXED_VLBR: hwc->config_base = 0; hwc->event_base = 0; - } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { + break; + + case INTEL_PMC_IDX_FIXED ... 
INTEL_PMC_IDX_FIXED_BTS-1: hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); - hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30; - } else { + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + + (idx - INTEL_PMC_IDX_FIXED); + hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | 1<<30; + break; + + default: hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); + break; } } @@ -1233,7 +1242,7 @@ int x86_perf_event_set_period(struct perf_event *event) s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; - if (idx == INTEL_PMC_IDX_FIXED_BTS) + if (unlikely(!hwc->event_base)) return 0; /* @@ -2363,7 +2372,6 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, .sched_task = x86_pmu_sched_task, - .task_ctx_size = sizeof(struct x86_perf_task_context), .swap_task_ctx = x86_pmu_swap_task_ctx, .check_period = x86_pmu_check_period, diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index ca35c8b5ee10..50963472ee85 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2136,8 +2136,35 @@ static inline void intel_pmu_ack_status(u64 ack) wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) +static inline bool event_is_checkpointed(struct perf_event *event) { + return unlikely(event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0; +} + +static inline void intel_set_masks(struct perf_event *event, int idx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (event->attr.exclude_host) + __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask); + if (event->attr.exclude_guest) + __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask); + if (event_is_checkpointed(event)) + __set_bit(idx, (unsigned long *)&cpuc->intel_cp_status); +} + +static inline void intel_clear_masks(struct perf_event *event, int idx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + __clear_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask); + __clear_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask); + __clear_bit(idx, (unsigned long *)&cpuc->intel_cp_status); +} + +static void intel_pmu_disable_fixed(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx - INTEL_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -2148,30 +2175,22 @@ static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) wrmsrl(hwc->config_base, ctrl_val); } -static inline bool event_is_checkpointed(struct perf_event *event) -{ - return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0; -} - static void intel_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx = hwc->idx; - if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { + if (idx < INTEL_PMC_IDX_FIXED) { + intel_clear_masks(event, idx); + x86_pmu_disable_event(event); + } else if (idx < INTEL_PMC_IDX_FIXED_BTS) { + intel_clear_masks(event, idx); + intel_pmu_disable_fixed(event); + } else if (idx == INTEL_PMC_IDX_FIXED_BTS) { intel_pmu_disable_bts(); intel_pmu_drain_bts_buffer(); - return; - } - - cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); - cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); - cpuc->intel_cp_status &= ~(1ull << hwc->idx); - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - 
intel_pmu_disable_fixed(hwc); - else - x86_pmu_disable_event(event); + } else if (idx == INTEL_PMC_IDX_FIXED_VLBR) + intel_clear_masks(event, idx); /* * Needs to be called after x86_pmu_disable_event, @@ -2238,33 +2257,23 @@ static void intel_pmu_enable_fixed(struct perf_event *event) static void intel_pmu_enable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { - if (!__this_cpu_read(cpu_hw_events.enabled)) - return; - - intel_pmu_enable_bts(hwc->config); - return; - } - - if (event->attr.exclude_host) - cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); - if (event->attr.exclude_guest) - cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx); - - if (unlikely(event_is_checkpointed(event))) - cpuc->intel_cp_status |= (1ull << hwc->idx); + int idx = hwc->idx; if (unlikely(event->attr.precise_ip)) intel_pmu_pebs_enable(event); - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + if (idx < INTEL_PMC_IDX_FIXED) { + intel_set_masks(event, idx); + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); + } else if (idx < INTEL_PMC_IDX_FIXED_BTS) { + intel_set_masks(event, idx); intel_pmu_enable_fixed(event); - return; - } - - __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); + } else if (idx == INTEL_PMC_IDX_FIXED_BTS) { + if (!__this_cpu_read(cpu_hw_events.enabled)) + return; + intel_pmu_enable_bts(hwc->config); + } else if (idx == INTEL_PMC_IDX_FIXED_VLBR) + intel_set_masks(event, idx); } static void intel_pmu_add_event(struct perf_event *event) @@ -2614,6 +2623,20 @@ intel_bts_constraints(struct perf_event *event) return NULL; } +/* + * Note: matches a fake event, like Fixed2. + */ +static struct event_constraint * +intel_vlbr_constraints(struct perf_event *event) +{ + struct event_constraint *c = &vlbr_constraint; + + if (unlikely(constraint_match(c, event->hw.config))) + return c; + + return NULL; +} + static int intel_alt_er(int idx, u64 config) { int alt_idx = idx; @@ -2804,6 +2827,10 @@ __intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, { struct event_constraint *c; + c = intel_vlbr_constraints(event); + if (c) + return c; + c = intel_bts_constraints(event); if (c) return c; @@ -3951,6 +3978,11 @@ static __initconst const struct x86_pmu core_pmu = { .cpu_dead = intel_pmu_cpu_dead, .check_period = intel_pmu_check_period, + + .lbr_reset = intel_pmu_lbr_reset_64, + .lbr_read = intel_pmu_lbr_read_64, + .lbr_save = intel_pmu_lbr_save, + .lbr_restore = intel_pmu_lbr_restore, }; static __initconst const struct x86_pmu intel_pmu = { @@ -3996,6 +4028,11 @@ static __initconst const struct x86_pmu intel_pmu = { .check_period = intel_pmu_check_period, .aux_output_match = intel_pmu_aux_output_match, + + .lbr_reset = intel_pmu_lbr_reset_64, + .lbr_read = intel_pmu_lbr_read_64, + .lbr_save = intel_pmu_lbr_save, + .lbr_restore = intel_pmu_lbr_restore, }; static __init void intel_clovertown_quirk(void) @@ -4622,6 +4659,14 @@ __init int intel_pmu_init(void) x86_pmu.intel_cap.capabilities = capabilities; } + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) { + x86_pmu.lbr_reset = intel_pmu_lbr_reset_32; + x86_pmu.lbr_read = intel_pmu_lbr_read_32; + } + + if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) + intel_pmu_arch_lbr_init(); + intel_ds_init(); x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index dc43cc124e09..86848c57b55e 100644 
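The intel_pmu_enable_event()/intel_pmu_disable_event() rework above dispatches on the raw counter index instead of peeking at config_base: general-purpose counters sit below the fixed-counter range, fixed counters run up to the BTS pseudo-index, and BTS/VLBR are handled as dedicated cases. A rough stand-alone classifier of that layout is sketched below; the numeric index boundaries are placeholders for this sketch, not the kernel's actual INTEL_PMC_IDX_* definitions.

	#include <stdio.h>

	enum ctr_class { CTR_GP, CTR_FIXED, CTR_BTS, CTR_VLBR, CTR_UNKNOWN };

	/* Placeholder layout: fixed counters start at 32, BTS and VLBR are
	 * pseudo-indices above them (VLBR sits above BTS in this layout). */
	#define IDX_FIXED 32
	#define IDX_BTS   47
	#define IDX_VLBR  58

	static enum ctr_class classify_counter(int idx)
	{
		if (idx < IDX_FIXED)
			return CTR_GP;     /* programmable counter: clear masks, x86_pmu_disable_event-style path */
		if (idx < IDX_BTS)
			return CTR_FIXED;  /* fixed counter: clear masks, FIXED_CTR_CTRL path */
		if (idx == IDX_BTS)
			return CTR_BTS;    /* branch trace store: drain/disable BTS */
		if (idx == IDX_VLBR)
			return CTR_VLBR;   /* fake "fixed" index used for the LBR event */
		return CTR_UNKNOWN;
	}

	int main(void)
	{
		printf("idx 3 -> %d, idx 33 -> %d, idx 58 -> %d\n",
		       classify_counter(3), classify_counter(33), classify_counter(58));
		return 0;
	}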
--- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -954,7 +954,7 @@ static void adaptive_pebs_record_size_update(void) if (pebs_data_cfg & PEBS_DATACFG_XMMS) sz += sizeof(struct pebs_xmm); if (pebs_data_cfg & PEBS_DATACFG_LBRS) - sz += x86_pmu.lbr_nr * sizeof(struct pebs_lbr_entry); + sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry); cpuc->pebs_record_size = sz; } @@ -1595,10 +1595,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } if (format_size & PEBS_DATACFG_LBRS) { - struct pebs_lbr *lbr = next_record; + struct lbr_entry *lbr = next_record; int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT) & 0xff) + 1; - next_record = next_record + num_lbr*sizeof(struct pebs_lbr_entry); + next_record = next_record + num_lbr * sizeof(struct lbr_entry); if (has_branch_stack(event)) { intel_pmu_store_pebs_lbrs(lbr); diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 65113b16804a..63f58bdf556c 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -8,17 +8,6 @@ #include "../perf_event.h" -enum { - LBR_FORMAT_32 = 0x00, - LBR_FORMAT_LIP = 0x01, - LBR_FORMAT_EIP = 0x02, - LBR_FORMAT_EIP_FLAGS = 0x03, - LBR_FORMAT_EIP_FLAGS2 = 0x04, - LBR_FORMAT_INFO = 0x05, - LBR_FORMAT_TIME = 0x06, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, -}; - static const enum { LBR_EIP_FLAGS = 1, LBR_TSX = 2, @@ -143,8 +132,54 @@ enum { X86_BR_IRQ |\ X86_BR_INT) +/* + * Intel LBR_CTL bits + * + * Hardware branch filter for Arch LBR + */ +#define ARCH_LBR_KERNEL_BIT 1 /* capture at ring0 */ +#define ARCH_LBR_USER_BIT 2 /* capture at ring > 0 */ +#define ARCH_LBR_CALL_STACK_BIT 3 /* enable call stack */ +#define ARCH_LBR_JCC_BIT 16 /* capture conditional branches */ +#define ARCH_LBR_REL_JMP_BIT 17 /* capture relative jumps */ +#define ARCH_LBR_IND_JMP_BIT 18 /* capture indirect jumps */ +#define ARCH_LBR_REL_CALL_BIT 19 /* capture relative calls */ +#define ARCH_LBR_IND_CALL_BIT 20 /* capture indirect calls */ +#define ARCH_LBR_RETURN_BIT 21 /* capture near returns */ +#define ARCH_LBR_OTHER_BRANCH_BIT 22 /* capture other branches */ + +#define ARCH_LBR_KERNEL (1ULL << ARCH_LBR_KERNEL_BIT) +#define ARCH_LBR_USER (1ULL << ARCH_LBR_USER_BIT) +#define ARCH_LBR_CALL_STACK (1ULL << ARCH_LBR_CALL_STACK_BIT) +#define ARCH_LBR_JCC (1ULL << ARCH_LBR_JCC_BIT) +#define ARCH_LBR_REL_JMP (1ULL << ARCH_LBR_REL_JMP_BIT) +#define ARCH_LBR_IND_JMP (1ULL << ARCH_LBR_IND_JMP_BIT) +#define ARCH_LBR_REL_CALL (1ULL << ARCH_LBR_REL_CALL_BIT) +#define ARCH_LBR_IND_CALL (1ULL << ARCH_LBR_IND_CALL_BIT) +#define ARCH_LBR_RETURN (1ULL << ARCH_LBR_RETURN_BIT) +#define ARCH_LBR_OTHER_BRANCH (1ULL << ARCH_LBR_OTHER_BRANCH_BIT) + +#define ARCH_LBR_ANY \ + (ARCH_LBR_JCC |\ + ARCH_LBR_REL_JMP |\ + ARCH_LBR_IND_JMP |\ + ARCH_LBR_REL_CALL |\ + ARCH_LBR_IND_CALL |\ + ARCH_LBR_RETURN |\ + ARCH_LBR_OTHER_BRANCH) + +#define ARCH_LBR_CTL_MASK 0x7f000e + static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); +static __always_inline bool is_lbr_call_stack_bit_set(u64 config) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return !!(config & ARCH_LBR_CALL_STACK); + + return !!(config & LBR_CALL_STACK); +} + /* * We only support LBR implementations that have FREEZE_LBRS_ON_PMI * otherwise it becomes near impossible to get a reliable stack. 
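The ARCH_LBR_* definitions added above describe the MSR_ARCH_LBR_CTL branch-filter layout: privilege and call-stack controls in the low bits, branch-type select bits at positions 16-22. As a small illustration (plain C, not kernel code), composing a control value that captures user-space calls and returns with the call stack enabled could look like the following; the bit positions are copied from the hunk above, while the enable bit is assumed to be bit 0 since ARCH_LBR_CTL_LBREN's value is not shown in this diff.

	#include <stdint.h>
	#include <stdio.h>

	/* Bit positions taken from the ARCH_LBR_* definitions above. */
	#define ARCH_LBR_USER        (1ULL << 2)
	#define ARCH_LBR_CALL_STACK  (1ULL << 3)
	#define ARCH_LBR_REL_CALL    (1ULL << 19)
	#define ARCH_LBR_IND_CALL    (1ULL << 20)
	#define ARCH_LBR_RETURN      (1ULL << 21)
	/* Assumed: enable bit of MSR_ARCH_LBR_CTL (value not part of this diff). */
	#define ARCH_LBR_CTL_LBREN   (1ULL << 0)

	int main(void)
	{
		/* Filter: user-space relative/indirect calls and returns, call stack on. */
		uint64_t ctl = ARCH_LBR_USER | ARCH_LBR_CALL_STACK |
			       ARCH_LBR_REL_CALL | ARCH_LBR_IND_CALL |
			       ARCH_LBR_RETURN | ARCH_LBR_CTL_LBREN;

		printf("MSR_ARCH_LBR_CTL value: %#llx\n", (unsigned long long)ctl);
		return 0;
	}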
@@ -168,33 +203,46 @@ static void __intel_pmu_lbr_enable(bool pmi) */ if (cpuc->lbr_sel) lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask; - if (!pmi && cpuc->lbr_sel) + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && !pmi && cpuc->lbr_sel) wrmsrl(MSR_LBR_SELECT, lbr_select); rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); orig_debugctl = debugctl; - debugctl |= DEBUGCTLMSR_LBR; + + if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) + debugctl |= DEBUGCTLMSR_LBR; /* * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions * may cause superfluous increase/decrease of LBR_TOS. */ - if (!(lbr_select & LBR_CALL_STACK)) + if (is_lbr_call_stack_bit_set(lbr_select)) + debugctl &= ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + else debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + if (orig_debugctl != debugctl) wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + wrmsrl(MSR_ARCH_LBR_CTL, lbr_select | ARCH_LBR_CTL_LBREN); } static void __intel_pmu_lbr_disable(void) { u64 debugctl; + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { + wrmsrl(MSR_ARCH_LBR_CTL, 0); + return; + } + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); } -static void intel_pmu_lbr_reset_32(void) +void intel_pmu_lbr_reset_32(void) { int i; @@ -202,7 +250,7 @@ static void intel_pmu_lbr_reset_32(void) wrmsrl(x86_pmu.lbr_from + i, 0); } -static void intel_pmu_lbr_reset_64(void) +void intel_pmu_lbr_reset_64(void) { int i; @@ -210,10 +258,16 @@ static void intel_pmu_lbr_reset_64(void) wrmsrl(x86_pmu.lbr_from + i, 0); wrmsrl(x86_pmu.lbr_to + i, 0); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + i, 0); + wrmsrl(x86_pmu.lbr_info + i, 0); } } +static void intel_pmu_arch_lbr_reset(void) +{ + /* Write to ARCH_LBR_DEPTH MSR, all LBR entries are reset to 0 */ + wrmsrl(MSR_ARCH_LBR_DEPTH, x86_pmu.lbr_nr); +} + void intel_pmu_lbr_reset(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -221,10 +275,7 @@ void intel_pmu_lbr_reset(void) if (!x86_pmu.lbr_nr) return; - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_reset_32(); - else - intel_pmu_lbr_reset_64(); + x86_pmu.lbr_reset(); cpuc->last_task_ctx = NULL; cpuc->last_log_id = 0; @@ -308,69 +359,97 @@ static u64 lbr_from_signext_quirk_rd(u64 val) return val; } -static inline void wrlbr_from(unsigned int idx, u64 val) +static __always_inline void wrlbr_from(unsigned int idx, u64 val) { val = lbr_from_signext_quirk_wr(val); wrmsrl(x86_pmu.lbr_from + idx, val); } -static inline void wrlbr_to(unsigned int idx, u64 val) +static __always_inline void wrlbr_to(unsigned int idx, u64 val) { wrmsrl(x86_pmu.lbr_to + idx, val); } -static inline u64 rdlbr_from(unsigned int idx) +static __always_inline void wrlbr_info(unsigned int idx, u64 val) +{ + wrmsrl(x86_pmu.lbr_info + idx, val); +} + +static __always_inline u64 rdlbr_from(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->from; + rdmsrl(x86_pmu.lbr_from + idx, val); return lbr_from_signext_quirk_rd(val); } -static inline u64 rdlbr_to(unsigned int idx) +static __always_inline u64 rdlbr_to(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->to; + rdmsrl(x86_pmu.lbr_to + idx, val); return val; } -static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) +static __always_inline u64 rdlbr_info(unsigned int idx, struct lbr_entry *lbr) +{ + u64 val; + + 
if (lbr) + return lbr->info; + + rdmsrl(x86_pmu.lbr_info + idx, val); + + return val; +} + +static inline void +wrlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) +{ + wrlbr_from(idx, lbr->from); + wrlbr_to(idx, lbr->to); + if (need_info) + wrlbr_info(idx, lbr->info); +} + +static inline bool +rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) +{ + u64 from = rdlbr_from(idx, NULL); + + /* Don't read invalid entry */ + if (!from) + return false; + + lbr->from = from; + lbr->to = rdlbr_to(idx, NULL); + if (need_info) + lbr->info = rdlbr_info(idx, NULL); + + return true; +} + +void intel_pmu_lbr_restore(void *ctx) { + bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx = ctx; int i; unsigned lbr_idx, mask; - u64 tos; - - if (task_ctx->lbr_callstack_users == 0 || - task_ctx->lbr_stack_state == LBR_NONE) { - intel_pmu_lbr_reset(); - return; - } - - tos = task_ctx->tos; - /* - * Does not restore the LBR registers, if - * - No one else touched them, and - * - Did not enter C6 - */ - if ((task_ctx == cpuc->last_task_ctx) && - (task_ctx->log_id == cpuc->last_log_id) && - rdlbr_from(tos)) { - task_ctx->lbr_stack_state = LBR_NONE; - return; - } + u64 tos = task_ctx->tos; mask = x86_pmu.lbr_nr - 1; for (i = 0; i < task_ctx->valid_lbrs; i++) { lbr_idx = (tos - i) & mask; - wrlbr_from(lbr_idx, task_ctx->lbr_from[i]); - wrlbr_to (lbr_idx, task_ctx->lbr_to[i]); - - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); + wrlbr_all(&task_ctx->lbr[i], lbr_idx, need_info); } for (; i < x86_pmu.lbr_nr; i++) { @@ -378,49 +457,149 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) wrlbr_from(lbr_idx, 0); wrlbr_to(lbr_idx, 0); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0); + wrlbr_info(lbr_idx, 0); } wrmsrl(x86_pmu.lbr_tos, tos); - task_ctx->lbr_stack_state = LBR_NONE; + + if (cpuc->lbr_select) + wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } -static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +static void intel_pmu_arch_lbr_restore(void *ctx) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - unsigned lbr_idx, mask; - u64 tos, from; + struct x86_perf_task_context_arch_lbr *task_ctx = ctx; + struct lbr_entry *entries = task_ctx->entries; int i; - if (task_ctx->lbr_callstack_users == 0) { - task_ctx->lbr_stack_state = LBR_NONE; + /* Fast reset the LBRs before restore if the call stack is not full. */ + if (!entries[x86_pmu.lbr_nr - 1].from) + intel_pmu_arch_lbr_reset(); + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!entries[i].from) + break; + wrlbr_all(&entries[i], i, true); + } +} + +/* + * Restore the Architecture LBR state from the xsave area in the perf + * context data for the task via the XRSTORS instruction. 
+ */ +static void intel_pmu_arch_lbr_xrstors(void *ctx) +{ + struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx; + + copy_kernel_to_dynamic_supervisor(&task_ctx->xsave, XFEATURE_MASK_LBR); +} + +static __always_inline bool lbr_is_reset_in_cstate(void *ctx) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return x86_pmu.lbr_deep_c_reset && !rdlbr_from(0, NULL); + + return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos, NULL); +} + +static void __intel_pmu_lbr_restore(void *ctx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (task_context_opt(ctx)->lbr_callstack_users == 0 || + task_context_opt(ctx)->lbr_stack_state == LBR_NONE) { + intel_pmu_lbr_reset(); + return; + } + + /* + * Does not restore the LBR registers, if + * - No one else touched them, and + * - Was not cleared in Cstate + */ + if ((ctx == cpuc->last_task_ctx) && + (task_context_opt(ctx)->log_id == cpuc->last_log_id) && + !lbr_is_reset_in_cstate(ctx)) { + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; return; } + x86_pmu.lbr_restore(ctx); + + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; +} + +void intel_pmu_lbr_save(void *ctx) +{ + bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx = ctx; + unsigned lbr_idx, mask; + u64 tos; + int i; + mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); for (i = 0; i < x86_pmu.lbr_nr; i++) { lbr_idx = (tos - i) & mask; - from = rdlbr_from(lbr_idx); - if (!from) + if (!rdlbr_all(&task_ctx->lbr[i], lbr_idx, need_info)) break; - task_ctx->lbr_from[i] = from; - task_ctx->lbr_to[i] = rdlbr_to(lbr_idx); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } task_ctx->valid_lbrs = i; task_ctx->tos = tos; - task_ctx->lbr_stack_state = LBR_VALID; - cpuc->last_task_ctx = task_ctx; - cpuc->last_log_id = ++task_ctx->log_id; + if (cpuc->lbr_select) + rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); +} + +static void intel_pmu_arch_lbr_save(void *ctx) +{ + struct x86_perf_task_context_arch_lbr *task_ctx = ctx; + struct lbr_entry *entries = task_ctx->entries; + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!rdlbr_all(&entries[i], i, true)) + break; + } + + /* LBR call stack is not full. Reset is required in restore. */ + if (i < x86_pmu.lbr_nr) + entries[x86_pmu.lbr_nr - 1].from = 0; +} + +/* + * Save the Architecture LBR state to the xsave area in the perf + * context data for the task via the XSAVES instruction. 
+ */ +static void intel_pmu_arch_lbr_xsaves(void *ctx) +{ + struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx; + + copy_dynamic_supervisor_to_kernel(&task_ctx->xsave, XFEATURE_MASK_LBR); +} + +static void __intel_pmu_lbr_save(void *ctx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (task_context_opt(ctx)->lbr_callstack_users == 0) { + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; + return; + } + + x86_pmu.lbr_save(ctx); + + task_context_opt(ctx)->lbr_stack_state = LBR_VALID; + + cpuc->last_task_ctx = ctx; + cpuc->last_log_id = ++task_context_opt(ctx)->log_id; } void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, struct perf_event_context *next) { - struct x86_perf_task_context *prev_ctx_data, *next_ctx_data; + void *prev_ctx_data, *next_ctx_data; swap(prev->task_ctx_data, next->task_ctx_data); @@ -436,14 +615,14 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, if (!prev_ctx_data || !next_ctx_data) return; - swap(prev_ctx_data->lbr_callstack_users, - next_ctx_data->lbr_callstack_users); + swap(task_context_opt(prev_ctx_data)->lbr_callstack_users, + task_context_opt(next_ctx_data)->lbr_callstack_users); } void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; + void *task_ctx; if (!cpuc->lbr_users) return; @@ -479,18 +658,19 @@ static inline bool branch_user_callstack(unsigned br_sel) void intel_pmu_lbr_add(struct perf_event *event) { + struct kmem_cache *kmem_cache = event->pmu->task_ctx_cache; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; + if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) + cpuc->lbr_select = 1; + cpuc->br_sel = event->hw.branch_reg.reg; - if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users++; - } + if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) + task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++; /* * Request pmu::sched_task() callback, which will fire inside the @@ -516,21 +696,44 @@ void intel_pmu_lbr_add(struct perf_event *event) perf_sched_cb_inc(event->ctx->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) intel_pmu_lbr_reset(); + + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + kmem_cache && !cpuc->lbr_xsave && + (cpuc->lbr_users != cpuc->lbr_pebs_users)) + cpuc->lbr_xsave = kmem_cache_alloc(kmem_cache, GFP_KERNEL); +} + +void release_lbr_buffers(void) +{ + struct kmem_cache *kmem_cache = x86_get_pmu()->task_ctx_cache; + struct cpu_hw_events *cpuc; + int cpu; + + if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) + return; + + for_each_possible_cpu(cpu) { + cpuc = per_cpu_ptr(&cpu_hw_events, cpu); + if (kmem_cache && cpuc->lbr_xsave) { + kmem_cache_free(kmem_cache, cpuc->lbr_xsave); + cpuc->lbr_xsave = NULL; + } + } } void intel_pmu_lbr_del(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; if (branch_user_callstack(cpuc->br_sel) && - event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users--; - } + event->ctx->task_ctx_data) + task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--; + + if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) + cpuc->lbr_select = 0; if (x86_pmu.intel_cap.pebs_baseline && 
event->attr.precise_ip > 0) cpuc->lbr_pebs_users--; @@ -540,11 +743,19 @@ void intel_pmu_lbr_del(struct perf_event *event) perf_sched_cb_dec(event->ctx->pmu); } +static inline bool vlbr_exclude_host(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + return test_bit(INTEL_PMC_IDX_FIXED_VLBR, + (unsigned long *)&cpuc->intel_ctrl_guest_mask); +} + void intel_pmu_lbr_enable_all(bool pmi) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (cpuc->lbr_users) + if (cpuc->lbr_users && !vlbr_exclude_host()) __intel_pmu_lbr_enable(pmi); } @@ -552,11 +763,11 @@ void intel_pmu_lbr_disable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (cpuc->lbr_users) + if (cpuc->lbr_users && !vlbr_exclude_host()) __intel_pmu_lbr_disable(); } -static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) +void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) { unsigned long mask = x86_pmu.lbr_nr - 1; u64 tos = intel_pmu_lbr_tos(); @@ -593,7 +804,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) * is the same as the linear address, allowing us to merge the LIP and EIP * LBR formats. */ -static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) +void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; @@ -616,8 +827,8 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) u16 cycles = 0; int lbr_flags = lbr_desc[lbr_format]; - from = rdlbr_from(lbr_idx); - to = rdlbr_to(lbr_idx); + from = rdlbr_from(lbr_idx, NULL); + to = rdlbr_to(lbr_idx, NULL); /* * Read LBR call stack entries @@ -629,7 +840,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (lbr_format == LBR_FORMAT_INFO && need_info) { u64 info; - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); + info = rdlbr_info(lbr_idx, NULL); mis = !!(info & LBR_INFO_MISPRED); pred = !mis; in_tx = !!(info & LBR_INFO_IN_TX); @@ -684,6 +895,93 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) cpuc->lbr_stack.hw_idx = tos; } +static __always_inline int get_lbr_br_type(u64 info) +{ + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) || !x86_pmu.lbr_br_type) + return 0; + + return (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET; +} + +static __always_inline bool get_lbr_mispred(u64 info) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred) + return 0; + + return !!(info & LBR_INFO_MISPRED); +} + +static __always_inline bool get_lbr_predicted(u64 info) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred) + return 0; + + return !(info & LBR_INFO_MISPRED); +} + +static __always_inline bool get_lbr_cycles(u64 info) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + !(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID)) + return 0; + + return info & LBR_INFO_CYCLES; +} + +static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, + struct lbr_entry *entries) +{ + struct perf_branch_entry *e; + struct lbr_entry *lbr; + u64 from, to, info; + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + lbr = entries ? &entries[i] : NULL; + e = &cpuc->lbr_entries[i]; + + from = rdlbr_from(i, lbr); + /* + * Read LBR entries until invalid entry (0s) is detected. 
+ */ + if (!from) + break; + + to = rdlbr_to(i, lbr); + info = rdlbr_info(i, lbr); + + e->from = from; + e->to = to; + e->mispred = get_lbr_mispred(info); + e->predicted = get_lbr_predicted(info); + e->in_tx = !!(info & LBR_INFO_IN_TX); + e->abort = !!(info & LBR_INFO_ABORT); + e->cycles = get_lbr_cycles(info); + e->type = get_lbr_br_type(info); + e->reserved = 0; + } + + cpuc->lbr_stack.nr = i; +} + +static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc) +{ + intel_pmu_store_lbr(cpuc, NULL); +} + +static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc) +{ + struct x86_perf_task_context_arch_lbr_xsave *xsave = cpuc->lbr_xsave; + + if (!xsave) { + intel_pmu_store_lbr(cpuc, NULL); + return; + } + copy_dynamic_supervisor_to_kernel(&xsave->xsave, XFEATURE_MASK_LBR); + + intel_pmu_store_lbr(cpuc, xsave->lbr.entries); +} + void intel_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -694,13 +992,11 @@ void intel_pmu_lbr_read(void) * This could be smarter and actually check the event, * but this simple approach seems to work for now. */ - if (!cpuc->lbr_users || cpuc->lbr_users == cpuc->lbr_pebs_users) + if (!cpuc->lbr_users || vlbr_exclude_host() || + cpuc->lbr_users == cpuc->lbr_pebs_users) return; - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_read_32(cpuc); - else - intel_pmu_lbr_read_64(cpuc); + x86_pmu.lbr_read(cpuc); intel_pmu_lbr_filter(cpuc); } @@ -800,6 +1096,11 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) reg = &event->hw.branch_reg; reg->idx = EXTRA_REG_LBR; + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { + reg->config = mask; + return 0; + } + /* * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate * in suppress mode. So LBR_SELECT should be set to @@ -1056,6 +1357,27 @@ common_branch_type(int type) return PERF_BR_UNKNOWN; } +enum { + ARCH_LBR_BR_TYPE_JCC = 0, + ARCH_LBR_BR_TYPE_NEAR_IND_JMP = 1, + ARCH_LBR_BR_TYPE_NEAR_REL_JMP = 2, + ARCH_LBR_BR_TYPE_NEAR_IND_CALL = 3, + ARCH_LBR_BR_TYPE_NEAR_REL_CALL = 4, + ARCH_LBR_BR_TYPE_NEAR_RET = 5, + ARCH_LBR_BR_TYPE_KNOWN_MAX = ARCH_LBR_BR_TYPE_NEAR_RET, + + ARCH_LBR_BR_TYPE_MAP_MAX = 16, +}; + +static const int arch_lbr_br_type_map[ARCH_LBR_BR_TYPE_MAP_MAX] = { + [ARCH_LBR_BR_TYPE_JCC] = X86_BR_JCC, + [ARCH_LBR_BR_TYPE_NEAR_IND_JMP] = X86_BR_IND_JMP, + [ARCH_LBR_BR_TYPE_NEAR_REL_JMP] = X86_BR_JMP, + [ARCH_LBR_BR_TYPE_NEAR_IND_CALL] = X86_BR_IND_CALL, + [ARCH_LBR_BR_TYPE_NEAR_REL_CALL] = X86_BR_CALL, + [ARCH_LBR_BR_TYPE_NEAR_RET] = X86_BR_RET, +}; + /* * implement actual branch filter based on user demand. * Hardware may not exactly satisfy that request, thus @@ -1068,7 +1390,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) { u64 from, to; int br_sel = cpuc->br_sel; - int i, j, type; + int i, j, type, to_plm; bool compress = false; /* if sampling all branches, then nothing to filter */ @@ -1080,8 +1402,19 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) from = cpuc->lbr_entries[i].from; to = cpuc->lbr_entries[i].to; + type = cpuc->lbr_entries[i].type; - type = branch_type(from, to, cpuc->lbr_entries[i].abort); + /* + * Parse the branch type recorded in LBR_x_INFO MSR. + * Doesn't support OTHER_BRANCH decoding for now. + * OTHER_BRANCH branch type still rely on software decoding. + */ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + type <= ARCH_LBR_BR_TYPE_KNOWN_MAX) { + to_plm = kernel_ip(to) ? 
X86_BR_KERNEL : X86_BR_USER; + type = arch_lbr_br_type_map[type] | to_plm; + } else + type = branch_type(from, to, cpuc->lbr_entries[i].abort); if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { if (cpuc->lbr_entries[i].in_tx) type |= X86_BR_IN_TX; @@ -1116,32 +1449,18 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) } } -void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) +void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int i; - - cpuc->lbr_stack.nr = x86_pmu.lbr_nr; - /* Cannot get TOS for large PEBS */ - if (cpuc->n_pebs == cpuc->n_large_pebs) + /* Cannot get TOS for large PEBS and Arch LBR */ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) || + (cpuc->n_pebs == cpuc->n_large_pebs)) cpuc->lbr_stack.hw_idx = -1ULL; else cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos(); - for (i = 0; i < x86_pmu.lbr_nr; i++) { - u64 info = lbr->lbr[i].info; - struct perf_branch_entry *e = &cpuc->lbr_entries[i]; - - e->from = lbr->lbr[i].from; - e->to = lbr->lbr[i].to; - e->mispred = !!(info & LBR_INFO_MISPRED); - e->predicted = !(info & LBR_INFO_MISPRED); - e->in_tx = !!(info & LBR_INFO_IN_TX); - e->abort = !!(info & LBR_INFO_ABORT); - e->cycles = info & LBR_INFO_CYCLES; - e->reserved = 0; - } + intel_pmu_store_lbr(cpuc, lbr); intel_pmu_lbr_filter(cpuc); } @@ -1198,6 +1517,26 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, }; +static int arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = ARCH_LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = ARCH_LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = ARCH_LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = ARCH_LBR_RETURN | + ARCH_LBR_OTHER_BRANCH, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = ARCH_LBR_REL_CALL | + ARCH_LBR_IND_CALL | + ARCH_LBR_OTHER_BRANCH, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = ARCH_LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = ARCH_LBR_JCC, + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = ARCH_LBR_REL_CALL | + ARCH_LBR_IND_CALL | + ARCH_LBR_RETURN | + ARCH_LBR_CALL_STACK, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = ARCH_LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = ARCH_LBR_REL_CALL, +}; + /* core */ void __init intel_pmu_lbr_init_core(void) { @@ -1251,9 +1590,17 @@ void __init intel_pmu_lbr_init_snb(void) */ } +static inline struct kmem_cache * +create_lbr_kmem_cache(size_t size, size_t align) +{ + return kmem_cache_create("x86_lbr", size, align, 0, NULL); +} + /* haswell */ void intel_pmu_lbr_init_hsw(void) { + size_t size = sizeof(struct x86_perf_task_context); + x86_pmu.lbr_nr = 16; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; @@ -1262,6 +1609,8 @@ void intel_pmu_lbr_init_hsw(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); + if (lbr_from_signext_quirk_needed()) static_branch_enable(&lbr_from_quirk_key); } @@ -1269,14 +1618,19 @@ void intel_pmu_lbr_init_hsw(void) /* skylake */ __init void intel_pmu_lbr_init_skl(void) { + size_t size = sizeof(struct x86_perf_task_context); + x86_pmu.lbr_nr = 32; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; x86_pmu.lbr_to = MSR_LBR_NHM_TO; + x86_pmu.lbr_info = MSR_LBR_INFO_0; x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0); + /* * SW branch filter usage: * - support 
syscall, sysret capture. @@ -1343,3 +1697,152 @@ void intel_pmu_lbr_init_knl(void) if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_LIP) x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS; } + +/* + * LBR state size is variable based on the max number of registers. + * This calculates the expected state size, which should match + * what the hardware enumerates for the size of XFEATURE_LBR. + */ +static inline unsigned int get_lbr_state_size(void) +{ + return sizeof(struct arch_lbr_state) + + x86_pmu.lbr_nr * sizeof(struct lbr_entry); +} + +static bool is_arch_lbr_xsave_available(void) +{ + if (!boot_cpu_has(X86_FEATURE_XSAVES)) + return false; + + /* + * Check the LBR state with the corresponding software structure. + * Disable LBR XSAVES support if the size doesn't match. + */ + if (WARN_ON(xfeature_size(XFEATURE_LBR) != get_lbr_state_size())) + return false; + + return true; +} + +void __init intel_pmu_arch_lbr_init(void) +{ + struct pmu *pmu = x86_get_pmu(); + union cpuid28_eax eax; + union cpuid28_ebx ebx; + union cpuid28_ecx ecx; + unsigned int unused_edx; + bool arch_lbr_xsave; + size_t size; + u64 lbr_nr; + + /* Arch LBR Capabilities */ + cpuid(28, &eax.full, &ebx.full, &ecx.full, &unused_edx); + + lbr_nr = fls(eax.split.lbr_depth_mask) * 8; + if (!lbr_nr) + goto clear_arch_lbr; + + /* Apply the max depth of Arch LBR */ + if (wrmsrl_safe(MSR_ARCH_LBR_DEPTH, lbr_nr)) + goto clear_arch_lbr; + + x86_pmu.lbr_depth_mask = eax.split.lbr_depth_mask; + x86_pmu.lbr_deep_c_reset = eax.split.lbr_deep_c_reset; + x86_pmu.lbr_lip = eax.split.lbr_lip; + x86_pmu.lbr_cpl = ebx.split.lbr_cpl; + x86_pmu.lbr_filter = ebx.split.lbr_filter; + x86_pmu.lbr_call_stack = ebx.split.lbr_call_stack; + x86_pmu.lbr_mispred = ecx.split.lbr_mispred; + x86_pmu.lbr_timed_lbr = ecx.split.lbr_timed_lbr; + x86_pmu.lbr_br_type = ecx.split.lbr_br_type; + x86_pmu.lbr_nr = lbr_nr; + + + arch_lbr_xsave = is_arch_lbr_xsave_available(); + if (arch_lbr_xsave) { + size = sizeof(struct x86_perf_task_context_arch_lbr_xsave) + + get_lbr_state_size(); + pmu->task_ctx_cache = create_lbr_kmem_cache(size, + XSAVE_ALIGNMENT); + } + + if (!pmu->task_ctx_cache) { + arch_lbr_xsave = false; + + size = sizeof(struct x86_perf_task_context_arch_lbr) + + lbr_nr * sizeof(struct lbr_entry); + pmu->task_ctx_cache = create_lbr_kmem_cache(size, 0); + } + + x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0; + x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0; + x86_pmu.lbr_info = MSR_ARCH_LBR_INFO_0; + + /* LBR callstack requires both CPL and Branch Filtering support */ + if (!x86_pmu.lbr_cpl || + !x86_pmu.lbr_filter || + !x86_pmu.lbr_call_stack) + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP; + + if (!x86_pmu.lbr_cpl) { + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_NOT_SUPP; + } else if (!x86_pmu.lbr_filter) { + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_NOT_SUPP; + } + + x86_pmu.lbr_ctl_mask = ARCH_LBR_CTL_MASK; + x86_pmu.lbr_ctl_map = arch_lbr_ctl_map; + + if (!x86_pmu.lbr_cpl && !x86_pmu.lbr_filter) + x86_pmu.lbr_ctl_map = NULL; + + x86_pmu.lbr_reset = 
intel_pmu_arch_lbr_reset; + if (arch_lbr_xsave) { + x86_pmu.lbr_save = intel_pmu_arch_lbr_xsaves; + x86_pmu.lbr_restore = intel_pmu_arch_lbr_xrstors; + x86_pmu.lbr_read = intel_pmu_arch_lbr_read_xsave; + pr_cont("XSAVE "); + } else { + x86_pmu.lbr_save = intel_pmu_arch_lbr_save; + x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore; + x86_pmu.lbr_read = intel_pmu_arch_lbr_read; + } + + pr_cont("Architectural LBR, "); + + return; + +clear_arch_lbr: + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_ARCH_LBR); +} + +/** + * x86_perf_get_lbr - get the LBR records information + * + * @lbr: the caller's memory to store the LBR records information + * + * Returns: 0 indicates the LBR info has been successfully obtained + */ +int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) +{ + int lbr_fmt = x86_pmu.intel_cap.lbr_format; + + lbr->nr = x86_pmu.lbr_nr; + lbr->from = x86_pmu.lbr_from; + lbr->to = x86_pmu.lbr_to; + lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? x86_pmu.lbr_info : 0; + + return 0; +} +EXPORT_SYMBOL_GPL(x86_perf_get_lbr); + +struct event_constraint vlbr_constraint = + __EVENT_CONSTRAINT(INTEL_FIXED_VLBR_EVENT, (1ULL << INTEL_PMC_IDX_FIXED_VLBR), + FIXED_EVENT_FLAGS, 1, 0, PERF_X86_EVENT_LBR_SELECT); diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index cf76d6631afa..d5c6d3b340c5 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -16,7 +16,7 @@ struct pci_driver *uncore_pci_driver; DEFINE_RAW_SPINLOCK(pci2phy_map_lock); struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); struct pci_extra_dev *uncore_extra_pci_dev; -static int max_dies; +int __uncore_max_dies; /* mask of cpus that collect uncore events */ static cpumask_t uncore_cpu_mask; @@ -108,7 +108,7 @@ struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu * The unsigned check also catches the '-1' return value for non * existent mappings in the topology map. */ - return dieid < max_dies ? pmu->boxes[dieid] : NULL; + return dieid < uncore_max_dies() ? 
pmu->boxes[dieid] : NULL; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -132,6 +132,9 @@ u64 uncore_mmio_read_counter(struct intel_uncore_box *box, if (!box->io_addr) return 0; + if (!uncore_mmio_is_valid_offset(box, event->hw.event_base)) + return 0; + return readq(box->io_addr + event->hw.event_base); } @@ -843,10 +846,12 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) .read = uncore_pmu_event_read, .module = THIS_MODULE, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .attr_update = pmu->type->attr_update, }; } else { pmu->pmu = *pmu->type->pmu; pmu->pmu.attr_groups = pmu->type->attr_groups; + pmu->pmu.attr_update = pmu->type->attr_update; } if (pmu->type->num_boxes == 1) { @@ -877,7 +882,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu) { int die; - for (die = 0; die < max_dies; die++) + for (die = 0; die < uncore_max_dies(); die++) kfree(pmu->boxes[die]); kfree(pmu->boxes); } @@ -887,6 +892,9 @@ static void uncore_type_exit(struct intel_uncore_type *type) struct intel_uncore_pmu *pmu = type->pmus; int i; + if (type->cleanup_mapping) + type->cleanup_mapping(type); + if (pmu) { for (i = 0; i < type->num_boxes; i++, pmu++) { uncore_pmu_unregister(pmu); @@ -915,7 +923,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) if (!pmus) return -ENOMEM; - size = max_dies * sizeof(struct intel_uncore_box *); + size = uncore_max_dies() * sizeof(struct intel_uncore_box *); for (i = 0; i < type->num_boxes; i++) { pmus[i].func_id = setid ? i : -1; @@ -954,6 +962,9 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) type->pmu_group = &uncore_pmu_attr_group; + if (type->set_mapping) + type->set_mapping(type); + return 0; err: @@ -1112,7 +1123,7 @@ static int __init uncore_pci_init(void) size_t size; int ret; - size = max_dies * sizeof(struct pci_extra_dev); + size = uncore_max_dies() * sizeof(struct pci_extra_dev); uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); if (!uncore_extra_pci_dev) { ret = -ENOMEM; @@ -1514,6 +1525,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &skx_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &skl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &skl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &skl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &skl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &icl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_uncore_init), @@ -1539,7 +1552,8 @@ static int __init intel_uncore_init(void) if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return -ENODEV; - max_dies = topology_max_packages() * topology_max_die_per_package(); + __uncore_max_dies = + topology_max_packages() * topology_max_die_per_package(); uncore_init = (struct intel_uncore_init_fun *)id->driver_data; if (uncore_init->pci_init) { diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index b469ddd45515..105fdc69825e 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -61,6 +61,7 @@ struct intel_uncore_type { unsigned msr_offset; unsigned mmio_offset; }; + unsigned mmio_map_size; unsigned num_shared_regs:8; unsigned single_fixed:1; unsigned pair_ctr_ctl:1; @@ -72,7 +73,19 @@ struct intel_uncore_type { struct uncore_event_desc *event_descs; struct freerunning_counters *freerunning; const struct attribute_group *attr_groups[4]; + const struct 
attribute_group **attr_update; struct pmu *pmu; /* for custom pmu ops */ + /* + * Uncore PMU would store relevant platform topology configuration here + * to identify which platform component each PMON block of that type is + * supposed to monitor. + */ + u64 *topology; + /* + * Optional callbacks for managing mapping of Uncore units to PMONs + */ + int (*set_mapping)(struct intel_uncore_type *type); + void (*cleanup_mapping)(struct intel_uncore_type *type); }; #define pmu_group attr_groups[0] @@ -169,6 +182,18 @@ int uncore_pcibus_to_physid(struct pci_bus *bus); ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); +static inline struct intel_uncore_pmu *dev_to_uncore_pmu(struct device *dev) +{ + return container_of(dev_get_drvdata(dev), struct intel_uncore_pmu, pmu); +} + +#define to_device_attribute(n) container_of(n, struct device_attribute, attr) +#define to_dev_ext_attribute(n) container_of(n, struct dev_ext_attribute, attr) +#define attr_to_ext_attr(n) to_dev_ext_attribute(to_device_attribute(n)) + +extern int __uncore_max_dies; +#define uncore_max_dies() (__uncore_max_dies) + #define INTEL_UNCORE_EVENT_DESC(_name, _config) \ { \ .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ @@ -196,6 +221,18 @@ static inline bool uncore_pmc_freerunning(int idx) return idx == UNCORE_PMC_IDX_FREERUNNING; } +static inline bool uncore_mmio_is_valid_offset(struct intel_uncore_box *box, + unsigned long offset) +{ + if (offset < box->pmu->type->mmio_map_size) + return true; + + pr_warn_once("perf uncore: Invalid offset 0x%lx exceeds mapped area of %s.\n", + offset, box->pmu->type->name); + + return false; +} + static inline unsigned int uncore_mmio_box_ctl(struct intel_uncore_box *box) { diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 3de1065eefc4..cb94ba86efd2 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -42,6 +42,17 @@ #define PCI_DEVICE_ID_INTEL_WHL_UQ_IMC 0x3ed0 #define PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC 0x3e34 #define PCI_DEVICE_ID_INTEL_WHL_UD_IMC 0x3e35 +#define PCI_DEVICE_ID_INTEL_CML_H1_IMC 0x9b44 +#define PCI_DEVICE_ID_INTEL_CML_H2_IMC 0x9b54 +#define PCI_DEVICE_ID_INTEL_CML_H3_IMC 0x9b64 +#define PCI_DEVICE_ID_INTEL_CML_U1_IMC 0x9b51 +#define PCI_DEVICE_ID_INTEL_CML_U2_IMC 0x9b61 +#define PCI_DEVICE_ID_INTEL_CML_U3_IMC 0x9b71 +#define PCI_DEVICE_ID_INTEL_CML_S1_IMC 0x9b33 +#define PCI_DEVICE_ID_INTEL_CML_S2_IMC 0x9b43 +#define PCI_DEVICE_ID_INTEL_CML_S3_IMC 0x9b53 +#define PCI_DEVICE_ID_INTEL_CML_S4_IMC 0x9b63 +#define PCI_DEVICE_ID_INTEL_CML_S5_IMC 0x9b73 #define PCI_DEVICE_ID_INTEL_ICL_U_IMC 0x8a02 #define PCI_DEVICE_ID_INTEL_ICL_U2_IMC 0x8a12 #define PCI_DEVICE_ID_INTEL_TGL_U1_IMC 0x9a02 @@ -415,6 +426,7 @@ static const struct attribute_group snb_uncore_imc_format_group = { static void snb_uncore_imc_init_box(struct intel_uncore_box *box) { + struct intel_uncore_type *type = box->pmu->type; struct pci_dev *pdev = box->pci_dev; int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; resource_size_t addr; @@ -430,7 +442,10 @@ static void snb_uncore_imc_init_box(struct intel_uncore_box *box) addr &= ~(PAGE_SIZE - 1); - box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); + box->io_addr = ioremap(addr, type->mmio_map_size); + if (!box->io_addr) + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); + box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; } @@ -586,6 +601,7 @@ static struct intel_uncore_type snb_uncore_imc = { .num_counters = 2, .num_boxes = 
1, .num_freerunning_types = SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNB_UNCORE_PCI_IMC_MAP_SIZE, .freerunning = snb_uncore_imc_freerunning, .event_descs = snb_uncore_imc_events, .format_group = &snb_uncore_imc_format_group, @@ -771,6 +787,50 @@ static const struct pci_device_id skl_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UD_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H1_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H3_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U1_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U3_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S1_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S3_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S4_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S5_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, { /* end: all zeroes */ }, }; @@ -863,6 +923,17 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(WHL_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */ IMC_DEV(WHL_4_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */ IMC_DEV(WHL_UD_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Dual Core */ + IMC_DEV(CML_H1_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_H2_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_H3_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_U1_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_U2_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_U3_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S1_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S2_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S3_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S4_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S5_IMC, &skl_uncore_pci_driver), IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ { /* end marker */ } @@ -1085,11 +1156,13 @@ static struct pci_dev *tgl_uncore_get_mc_dev(void) } #define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000 +#define TGL_UNCORE_PCI_IMC_MAP_SIZE 0xe000 static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) { struct pci_dev *pdev = tgl_uncore_get_mc_dev(); struct intel_uncore_pmu *pmu = box->pmu; + struct intel_uncore_type *type = pmu->type; resource_size_t addr; u32 mch_bar; @@ -1112,7 +1185,9 
@@ static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) addr |= ((resource_size_t)mch_bar << 32); #endif - box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); + box->io_addr = ioremap(addr, type->mmio_map_size); + if (!box->io_addr) + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); } static struct intel_uncore_ops tgl_uncore_imc_freerunning_ops = { @@ -1138,6 +1213,7 @@ static struct intel_uncore_type tgl_uncore_imc_free_running = { .num_counters = 3, .num_boxes = 2, .num_freerunning_types = TGL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = TGL_UNCORE_PCI_IMC_MAP_SIZE, .freerunning = tgl_uncore_imc_freerunning, .ops = &tgl_uncore_imc_freerunning_ops, .event_descs = tgl_uncore_imc_events, diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 07652fa20ebb..62e88ad919ff 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -273,6 +273,30 @@ #define SKX_CPUNODEID 0xc0 #define SKX_GIDNIDMAP 0xd4 +/* + * The CPU_BUS_NUMBER MSR returns the values of the respective CPUBUSNO CSR + * that BIOS programmed. MSR has package scope. + * | Bit | Default | Description + * | [63] | 00h | VALID - When set, indicates the CPU bus + * numbers have been initialized. (RO) + * |[62:48]| --- | Reserved + * |[47:40]| 00h | BUS_NUM_5 — Return the bus number BIOS assigned + * CPUBUSNO(5). (RO) + * |[39:32]| 00h | BUS_NUM_4 — Return the bus number BIOS assigned + * CPUBUSNO(4). (RO) + * |[31:24]| 00h | BUS_NUM_3 — Return the bus number BIOS assigned + * CPUBUSNO(3). (RO) + * |[23:16]| 00h | BUS_NUM_2 — Return the bus number BIOS assigned + * CPUBUSNO(2). (RO) + * |[15:8] | 00h | BUS_NUM_1 — Return the bus number BIOS assigned + * CPUBUSNO(1). (RO) + * | [7:0] | 00h | BUS_NUM_0 — Return the bus number BIOS assigned + * CPUBUSNO(0). (RO) + */ +#define SKX_MSR_CPU_BUS_NUMBER 0x300 +#define SKX_MSR_CPU_BUS_VALID_BIT (1ULL << 63) +#define BUS_NUM_STRIDE 8 + /* SKX CHA */ #define SKX_CHA_MSR_PMON_BOX_FILTER_TID (0x1ffULL << 0) #define SKX_CHA_MSR_PMON_BOX_FILTER_LINK (0xfULL << 9) @@ -3612,6 +3636,170 @@ static struct intel_uncore_ops skx_uncore_iio_ops = { .read_counter = uncore_msr_read_counter, }; +static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die) +{ + return pmu->type->topology[die] >> (pmu->pmu_idx * BUS_NUM_STRIDE); +} + +static umode_t +skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) +{ + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj)); + + /* Root bus 0x00 is valid only for die 0 AND pmu_idx = 0. */ + return (!skx_iio_stack(pmu, die) && pmu->pmu_idx) ? 0 : attr->mode; +} + +static ssize_t skx_iio_mapping_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct pci_bus *bus = pci_find_next_bus(NULL); + struct intel_uncore_pmu *uncore_pmu = dev_to_uncore_pmu(dev); + struct dev_ext_attribute *ea = to_dev_ext_attribute(attr); + long die = (long)ea->var; + + /* + * Current implementation is for single segment configuration hence it's + * safe to take the segment value from the first available root bus. 
+ */ + return sprintf(buf, "%04x:%02x\n", pci_domain_nr(bus), + skx_iio_stack(uncore_pmu, die)); +} + +static int skx_msr_cpu_bus_read(int cpu, u64 *topology) +{ + u64 msr_value; + + if (rdmsrl_on_cpu(cpu, SKX_MSR_CPU_BUS_NUMBER, &msr_value) || + !(msr_value & SKX_MSR_CPU_BUS_VALID_BIT)) + return -ENXIO; + + *topology = msr_value; + + return 0; +} + +static int die_to_cpu(int die) +{ + int res = 0, cpu, current_die; + /* + * Using cpus_read_lock() to ensure cpu is not going down between + * looking at cpu_online_mask. + */ + cpus_read_lock(); + for_each_online_cpu(cpu) { + current_die = topology_logical_die_id(cpu); + if (current_die == die) { + res = cpu; + break; + } + } + cpus_read_unlock(); + return res; +} + +static int skx_iio_get_topology(struct intel_uncore_type *type) +{ + int i, ret; + struct pci_bus *bus = NULL; + + /* + * Verified single-segment environments only; disabled for multiple + * segment topologies for now except VMD domains. + * VMD domains start at 0x10000 to not clash with ACPI _SEG domains. + */ + while ((bus = pci_find_next_bus(bus)) + && (!pci_domain_nr(bus) || pci_domain_nr(bus) > 0xffff)) + ; + if (bus) + return -EPERM; + + type->topology = kcalloc(uncore_max_dies(), sizeof(u64), GFP_KERNEL); + if (!type->topology) + return -ENOMEM; + + for (i = 0; i < uncore_max_dies(); i++) { + ret = skx_msr_cpu_bus_read(die_to_cpu(i), &type->topology[i]); + if (ret) { + kfree(type->topology); + type->topology = NULL; + return ret; + } + } + + return 0; +} + +static struct attribute_group skx_iio_mapping_group = { + .is_visible = skx_iio_mapping_visible, +}; + +static const struct attribute_group *skx_iio_attr_update[] = { + &skx_iio_mapping_group, + NULL, +}; + +static int skx_iio_set_mapping(struct intel_uncore_type *type) +{ + char buf[64]; + int ret; + long die = -1; + struct attribute **attrs = NULL; + struct dev_ext_attribute *eas = NULL; + + ret = skx_iio_get_topology(type); + if (ret) + return ret; + + /* One more for NULL. 
*/ + attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL); + if (!attrs) + goto err; + + eas = kcalloc(uncore_max_dies(), sizeof(*eas), GFP_KERNEL); + if (!eas) + goto err; + + for (die = 0; die < uncore_max_dies(); die++) { + sprintf(buf, "die%ld", die); + sysfs_attr_init(&eas[die].attr.attr); + eas[die].attr.attr.name = kstrdup(buf, GFP_KERNEL); + if (!eas[die].attr.attr.name) + goto err; + eas[die].attr.attr.mode = 0444; + eas[die].attr.show = skx_iio_mapping_show; + eas[die].attr.store = NULL; + eas[die].var = (void *)die; + attrs[die] = &eas[die].attr.attr; + } + skx_iio_mapping_group.attrs = attrs; + + return 0; +err: + for (; die >= 0; die--) + kfree(eas[die].attr.attr.name); + kfree(eas); + kfree(attrs); + kfree(type->topology); + type->attr_update = NULL; + return -ENOMEM; +} + +static void skx_iio_cleanup_mapping(struct intel_uncore_type *type) +{ + struct attribute **attr = skx_iio_mapping_group.attrs; + + if (!attr) + return; + + for (; *attr; attr++) + kfree((*attr)->name); + kfree(attr_to_ext_attr(*skx_iio_mapping_group.attrs)); + kfree(skx_iio_mapping_group.attrs); + skx_iio_mapping_group.attrs = NULL; + kfree(type->topology); +} + static struct intel_uncore_type skx_uncore_iio = { .name = "iio", .num_counters = 4, @@ -3626,6 +3814,9 @@ static struct intel_uncore_type skx_uncore_iio = { .constraints = skx_uncore_iio_constraints, .ops = &skx_uncore_iio_ops, .format_group = &skx_uncore_iio_format_group, + .attr_update = skx_iio_attr_update, + .set_mapping = skx_iio_set_mapping, + .cleanup_mapping = skx_iio_cleanup_mapping, }; enum perf_uncore_iio_freerunning_type_id { @@ -4421,6 +4612,7 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, unsigned int box_ctl, int mem_offset) { struct pci_dev *pdev = snr_uncore_get_mc_dev(box->dieid); + struct intel_uncore_type *type = box->pmu->type; resource_size_t addr; u32 pci_dword; @@ -4435,9 +4627,11 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, addr += box_ctl; - box->io_addr = ioremap(addr, SNR_IMC_MMIO_SIZE); - if (!box->io_addr) + box->io_addr = ioremap(addr, type->mmio_map_size); + if (!box->io_addr) { + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); return; + } writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr); } @@ -4480,6 +4674,9 @@ static void snr_uncore_mmio_enable_event(struct intel_uncore_box *box, if (!box->io_addr) return; + if (!uncore_mmio_is_valid_offset(box, hwc->config_base)) + return; + writel(hwc->config | SNBEP_PMON_CTL_EN, box->io_addr + hwc->config_base); } @@ -4492,6 +4689,9 @@ static void snr_uncore_mmio_disable_event(struct intel_uncore_box *box, if (!box->io_addr) return; + if (!uncore_mmio_is_valid_offset(box, hwc->config_base)) + return; + writel(hwc->config, box->io_addr + hwc->config_base); } @@ -4530,6 +4730,7 @@ static struct intel_uncore_type snr_uncore_imc = { .event_mask = SNBEP_PMON_RAW_EVENT_MASK, .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL, .mmio_offset = SNR_IMC_MMIO_OFFSET, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .ops = &snr_uncore_mmio_ops, .format_group = &skx_uncore_format_group, }; @@ -4570,6 +4771,7 @@ static struct intel_uncore_type snr_uncore_imc_free_running = { .num_counters = 3, .num_boxes = 1, .num_freerunning_types = SNR_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .freerunning = snr_imc_freerunning, .ops = &snr_uncore_imc_freerunning_ops, .event_descs = snr_uncore_imc_freerunning_events, @@ -4987,6 +5189,7 @@ static struct intel_uncore_type icx_uncore_imc = { .event_mask = SNBEP_PMON_RAW_EVENT_MASK, 
.box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL, .mmio_offset = SNR_IMC_MMIO_OFFSET, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .ops = &icx_uncore_mmio_ops, .format_group = &skx_uncore_format_group, }; @@ -5044,6 +5247,7 @@ static struct intel_uncore_type icx_uncore_imc_free_running = { .num_counters = 5, .num_boxes = 4, .num_freerunning_types = ICX_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .freerunning = icx_imc_freerunning, .ops = &icx_uncore_imc_freerunning_ops, .event_descs = icx_uncore_imc_freerunning_events, diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index e17a3d8a47ed..7b68ab5f19e7 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -78,6 +78,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) #define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ #define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */ #define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ +#define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */ struct amd_nb { int nb_id; /* NorthBridge id */ @@ -179,6 +180,17 @@ struct x86_perf_task_context; #define MAX_LBR_ENTRIES 32 enum { + LBR_FORMAT_32 = 0x00, + LBR_FORMAT_LIP = 0x01, + LBR_FORMAT_EIP = 0x02, + LBR_FORMAT_EIP_FLAGS = 0x03, + LBR_FORMAT_EIP_FLAGS2 = 0x04, + LBR_FORMAT_INFO = 0x05, + LBR_FORMAT_TIME = 0x06, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, +}; + +enum { X86_PERF_KFREE_SHARED = 0, X86_PERF_KFREE_EXCL = 1, X86_PERF_KFREE_MAX @@ -233,10 +245,15 @@ struct cpu_hw_events { int lbr_pebs_users; struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; - struct er_account *lbr_sel; + union { + struct er_account *lbr_sel; + struct er_account *lbr_ctl; + }; u64 br_sel; - struct x86_perf_task_context *last_task_ctx; + void *last_task_ctx; int last_log_id; + int lbr_select; + void *lbr_xsave; /* * Intel host/guest exclude bits @@ -673,14 +690,38 @@ struct x86_pmu { /* * Intel LBR */ - unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ - int lbr_nr; /* hardware stack size */ - u64 lbr_sel_mask; /* LBR_SELECT valid bits */ - const int *lbr_sel_map; /* lbr_select mappings */ + unsigned int lbr_tos, lbr_from, lbr_to, + lbr_info, lbr_nr; /* LBR base regs and size */ + union { + u64 lbr_sel_mask; /* LBR_SELECT valid bits */ + u64 lbr_ctl_mask; /* LBR_CTL valid bits */ + }; + union { + const int *lbr_sel_map; /* lbr_select mappings */ + int *lbr_ctl_map; /* LBR_CTL mappings */ + }; bool lbr_double_abort; /* duplicated lbr aborts */ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ /* + * Intel Architectural LBR CPUID Enumeration + */ + unsigned int lbr_depth_mask:8; + unsigned int lbr_deep_c_reset:1; + unsigned int lbr_lip:1; + unsigned int lbr_cpl:1; + unsigned int lbr_filter:1; + unsigned int lbr_call_stack:1; + unsigned int lbr_mispred:1; + unsigned int lbr_timed_lbr:1; + unsigned int lbr_br_type:1; + + void (*lbr_reset)(void); + void (*lbr_read)(struct cpu_hw_events *cpuc); + void (*lbr_save)(void *ctx); + void (*lbr_restore)(void *ctx); + + /* * Intel PT/LBR/BTS are exclusive */ atomic_t lbr_exclusive[x86_lbr_exclusive_max]; @@ -718,17 +759,46 @@ struct x86_pmu { int (*aux_output_match) (struct perf_event *event); }; -struct x86_perf_task_context { - u64 lbr_from[MAX_LBR_ENTRIES]; - u64 lbr_to[MAX_LBR_ENTRIES]; - u64 lbr_info[MAX_LBR_ENTRIES]; - int tos; - int valid_lbrs; +struct x86_perf_task_context_opt { int lbr_callstack_users; int lbr_stack_state; int log_id; }; +struct 
x86_perf_task_context { + u64 lbr_sel; + int tos; + int valid_lbrs; + struct x86_perf_task_context_opt opt; + struct lbr_entry lbr[MAX_LBR_ENTRIES]; +}; + +struct x86_perf_task_context_arch_lbr { + struct x86_perf_task_context_opt opt; + struct lbr_entry entries[]; +}; + +/* + * Add padding to guarantee the 64-byte alignment of the state buffer. + * + * The structure is dynamically allocated. The size of the LBR state may vary + * based on the number of LBR registers. + * + * Do not put anything after the LBR state. + */ +struct x86_perf_task_context_arch_lbr_xsave { + struct x86_perf_task_context_opt opt; + + union { + struct xregs_state xsave; + struct { + struct fxregs_state i387; + struct xstate_header header; + struct arch_lbr_state lbr; + } __attribute__ ((packed, aligned (XSAVE_ALIGNMENT))); + }; +}; + #define x86_add_quirk(func_) \ do { \ static struct x86_pmu_quirk __quirk __initdata = { \ @@ -777,6 +847,14 @@ static struct perf_pmu_events_ht_attr event_attr_##v = { \ struct pmu *x86_get_pmu(void); extern struct x86_pmu x86_pmu __read_mostly; +static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return &((struct x86_perf_task_context_arch_lbr *)ctx)->opt; + + return &((struct x86_perf_task_context *)ctx)->opt; +} + static inline bool x86_pmu_has_lbr_callstack(void) { return x86_pmu.lbr_sel_map && @@ -989,7 +1067,10 @@ void release_ds_buffers(void); void reserve_ds_buffers(void); +void release_lbr_buffers(void); + extern struct event_constraint bts_constraint; +extern struct event_constraint vlbr_constraint; void intel_pmu_enable_bts(u64 config); @@ -1041,7 +1122,7 @@ void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); void intel_pmu_auto_reload_read(struct perf_event *event); -void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr); +void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); void intel_ds_init(void); @@ -1054,6 +1135,10 @@ u64 lbr_from_signext_quirk_wr(u64 val); void intel_pmu_lbr_reset(void); +void intel_pmu_lbr_reset_32(void); + +void intel_pmu_lbr_reset_64(void); + void intel_pmu_lbr_add(struct perf_event *event); void intel_pmu_lbr_del(struct perf_event *event); @@ -1064,6 +1149,14 @@ void intel_pmu_lbr_disable_all(void); void intel_pmu_lbr_read(void); +void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc); + +void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc); + +void intel_pmu_lbr_save(void *ctx); + +void intel_pmu_lbr_restore(void *ctx); + void intel_pmu_lbr_init_core(void); void intel_pmu_lbr_init_nhm(void); @@ -1080,6 +1173,8 @@ void intel_pmu_lbr_init_skl(void); void intel_pmu_lbr_init_knl(void); +void intel_pmu_arch_lbr_init(void); + void intel_pmu_pebs_data_source_nhm(void); void intel_pmu_pebs_data_source_skl(bool pmem); @@ -1115,6 +1210,10 @@ static inline void release_ds_buffers(void) { } +static inline void release_lbr_buffers(void) +{ +} + static inline int intel_pmu_init(void) { return 0; diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 0f2bf59f4354..68b38820b10e 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -787,7 +787,8 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl), - X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h), + X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h), + X86_MATCH_VENDOR_FAM(HYGON, 0x18, &model_amd_fam17h), 
{}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c index 898fa1ae9ceb..e68827e604ad 100644 --- a/arch/x86/events/zhaoxin/core.c +++ b/arch/x86/events/zhaoxin/core.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Zhoaxin PMU; like Intel Architectural PerfMon-v2 + * Zhaoxin PMU; like Intel Architectural PerfMon-v2 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 385d3d172ee1..ca8a657edf59 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -30,7 +30,6 @@ #include <linux/sched/task_stack.h> #include <linux/uaccess.h> -#include <asm/pgalloc.h> #include <asm/cacheflush.h> #include <asm/user32.h> #include <asm/ia32.h> diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 0f63585edf5f..5c15f95b1ba7 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -144,7 +144,7 @@ _ASM_PTR (entry); \ .popsection -#else +#else /* ! __ASSEMBLY__ */ # define _EXPAND_EXTABLE_HANDLE(x) #x # define _ASM_EXTABLE_HANDLE(from, to, handler) \ " .pushsection \"__ex_table\",\"a\"\n" \ @@ -164,9 +164,7 @@ _ASM_EXTABLE_HANDLE(from, to, ex_handler_fault) /* For C file, we already have NOKPROBE_SYMBOL macro */ -#endif -#ifndef __ASSEMBLY__ /* * This output constraint should be used for any inline asm which has a "call" * instruction. Otherwise the asm may be inserted before the frame pointer @@ -175,6 +173,6 @@ */ register unsigned long current_stack_pointer asm(_ASM_SP); #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) -#endif +#endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index bf35e476a776..b6cac6e9bb70 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -14,8 +14,6 @@ * resource counting etc.. */ -#define ATOMIC_INIT(i) { (i) } - /** * arch_atomic_read - read atomic variable * @v: pointer of type atomic_t diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 680c320363db..9191280d9ea3 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -24,9 +24,16 @@ # error "Invalid value for CONFIG_PHYSICAL_ALIGN" #endif -#ifdef CONFIG_KERNEL_BZIP2 +#if defined(CONFIG_KERNEL_BZIP2) # define BOOT_HEAP_SIZE 0x400000 -#else /* !CONFIG_KERNEL_BZIP2 */ +#elif defined(CONFIG_KERNEL_ZSTD) +/* + * Zstd needs to allocate the ZSTD_DCtx in order to decompress the kernel. + * The ZSTD_DCtx is ~160KB, so set the heap size to 192KB because it is a + * round number and to allow some slack. + */ +# define BOOT_HEAP_SIZE 0x30000 +#else # define BOOT_HEAP_SIZE 0x10000 #endif diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 028189575560..297fa12e7e27 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -3,6 +3,7 @@ #define _ASM_X86_BUG_H #include <linux/stringify.h> +#include <linux/instrumentation.h> /* * Despite that some emulators terminate on UD2, we use it for WARN(). diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 1a2eafca7038..0a7fe0321613 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -3,7 +3,7 @@ #define _ASM_X86_CMPXCHG_32_H /* - * Note: if you use set64_bit(), __cmpxchg64(), or their variants, you + * Note: if you use set64_bit(), __cmpxchg64(), or their variants, * you need to test for the feature in boot_cpu_data. 
*/ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 02dabc9e77b0..2901d5df4366 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -96,6 +96,7 @@ #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ +/* free ( 3*32+17) */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ @@ -107,6 +108,7 @@ #define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ +/* free ( 3*32+29) */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ #define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ @@ -365,7 +367,9 @@ #define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ +#define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ +#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_FLUSH_L1D (18*32+28) /* Flush L1D cache */ diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 49bd6cf3eec9..7c0a52ca2f4d 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -3,9 +3,6 @@ #define _ASM_X86_DEVICE_H struct dev_archdata { -#ifdef CONFIG_IOMMU_API - void *iommu; /* hook for IOMMU specific extension */ -#endif }; struct pdev_archdata { diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h index 9b8cb50768c2..b8f1dc0761e4 100644 --- a/arch/x86/include/asm/div64.h +++ b/arch/x86/include/asm/div64.h @@ -74,16 +74,26 @@ static inline u64 mul_u32_u32(u32 a, u32 b) #else # include <asm-generic/div64.h> -static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div) +/* + * Will generate an #DE when the result doesn't fit u64, could fix with an + * __ex_table[] entry when it becomes an issue. + */ +static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div) { u64 q; asm ("mulq %2; divq %3" : "=a" (q) - : "a" (a), "rm" ((u64)mul), "rm" ((u64)div) + : "a" (a), "rm" (mul), "rm" (div) : "rdx"); return q; } +#define mul_u64_u64_div_u64 mul_u64_u64_div_u64 + +static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div) +{ + return mul_u64_u64_div_u64(a, mul, div); +} #define mul_u64_u32_div mul_u64_u32_div #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 6b15a24930e0..fed67eafcacc 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -3,8 +3,8 @@ #define _ASM_X86_DMA_MAPPING_H /* - * IOMMU interface. See Documentation/DMA-API-HOWTO.txt and - * Documentation/DMA-API.txt for documentation. + * IOMMU interface. 
See Documentation/core-api/dma-api-howto.rst and + * Documentation/core-api/dma-api.rst for documentation. */ #include <linux/scatterlist.h> diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index e7d2ccfdd507..b9c2667ac46c 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -22,17 +22,7 @@ extern unsigned long efi_fw_vendor, efi_config_table; * * This is the main reason why we're doing stable VA mappings for RT * services. - * - * SGI UV1 machines are known to be incompatible with this scheme, so we - * provide an opt-out for these machines via a DMI quirk that sets the - * attribute below. */ -#define EFI_UV1_MEMMAP EFI_ARCH_1 - -static inline bool efi_have_uv1_memmap(void) -{ - return IS_ENABLED(CONFIG_X86_UV) && efi_enabled(EFI_UV1_MEMMAP); -} #define EFI32_LOADER_SIGNATURE "EL32" #define EFI64_LOADER_SIGNATURE "EL64" @@ -122,9 +112,7 @@ struct efi_scratch { efi_sync_low_kernel_mappings(); \ kernel_fpu_begin(); \ firmware_restrict_branch_speculation_start(); \ - \ - if (!efi_have_uv1_memmap()) \ - efi_switch_mm(&efi_mm); \ + efi_switch_mm(&efi_mm); \ }) #define arch_efi_call_virt(p, f, args...) \ @@ -132,9 +120,7 @@ struct efi_scratch { #define arch_efi_call_virt_teardown() \ ({ \ - if (!efi_have_uv1_memmap()) \ - efi_switch_mm(efi_scratch.prev_mm); \ - \ + efi_switch_mm(efi_scratch.prev_mm); \ firmware_restrict_branch_speculation_end(); \ kernel_fpu_end(); \ }) @@ -176,8 +162,6 @@ extern void efi_delete_dummy_variable(void); extern void efi_switch_mm(struct mm_struct *mm); extern void efi_recover_from_page_fault(unsigned long phys_addr); extern void efi_free_boot_services(void); -extern pgd_t * __init efi_uv1_memmap_phys_prolog(void); -extern void __init efi_uv1_memmap_phys_epilog(pgd_t *save_pgd); /* kexec external ABI */ struct efi_setup_data { diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 452beed7892b..b9a5d488f1a5 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -21,8 +21,6 @@ typedef struct user_i387_struct elf_fpregset_t; #ifdef __i386__ -typedef struct user_fxsr_struct elf_fpxregset_t; - #define R_386_NONE 0 #define R_386_32 1 #define R_386_PC32 2 diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h new file mode 100644 index 000000000000..a8f9315b9eae --- /dev/null +++ b/arch/x86/include/asm/entry-common.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_X86_ENTRY_COMMON_H +#define _ASM_X86_ENTRY_COMMON_H + +#include <linux/user-return-notifier.h> + +#include <asm/nospec-branch.h> +#include <asm/io_bitmap.h> +#include <asm/fpu/api.h> + +/* Check that the stack and regs on entry from user mode are sane. */ +static __always_inline void arch_check_user_regs(struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) { + /* + * Make sure that the entry code gave us a sensible EFLAGS + * register. Native because we want to check the actual CPU + * state, not the interrupt state as imagined by Xen. + */ + unsigned long flags = native_save_fl(); + WARN_ON_ONCE(flags & (X86_EFLAGS_AC | X86_EFLAGS_DF | + X86_EFLAGS_NT)); + + /* We think we came from user mode. Make sure pt_regs agrees. */ + WARN_ON_ONCE(!user_mode(regs)); + + /* + * All entries from user mode (except #DF) should be on the + * normal thread stack and should have user pt_regs in the + * correct location. 
+ */ + WARN_ON_ONCE(!on_thread_stack()); + WARN_ON_ONCE(regs != task_pt_regs(current)); + } +} +#define arch_check_user_regs arch_check_user_regs + +#define ARCH_SYSCALL_EXIT_WORK (_TIF_SINGLESTEP) + +static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, + unsigned long ti_work) +{ + if (ti_work & _TIF_USER_RETURN_NOTIFY) + fire_user_return_notifiers(); + + if (unlikely(ti_work & _TIF_IO_BITMAP)) + tss_update_io_bitmap(); + + fpregs_assert_state_consistent(); + if (unlikely(ti_work & _TIF_NEED_FPU_LOAD)) + switch_fpu_return(); + +#ifdef CONFIG_COMPAT + /* + * Compat syscalls set TS_COMPAT. Make sure we clear it before + * returning to user mode. We need to clear it *after* signal + * handling, because syscall restart has a fixup for compat + * syscalls. The fixup is exercised by the ptrace_syscall_32 + * selftest. + * + * We also need to clear TS_REGS_POKED_I386: the 32-bit tracer + * special case only applies after poking regs and before the + * very next return to user mode. + */ + current_thread_info()->status &= ~(TS_COMPAT | TS_I386_REGS_POKED); +#endif +} +#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare + +static __always_inline void arch_exit_to_user_mode(void) +{ + mds_user_clear_cpu_buffers(); +} +#define arch_exit_to_user_mode arch_exit_to_user_mode + +#endif diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index b9527a54db99..0f0dd645b594 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -26,9 +26,9 @@ #ifndef __ASSEMBLY__ #include <linux/kernel.h> -#include <asm/acpi.h> #include <asm/apicdef.h> #include <asm/page.h> +#include <asm/pgtable_types.h> #ifdef CONFIG_X86_32 #include <linux/threads.h> #include <asm/kmap_types.h> diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 845e7481ab77..0a460f2a3f90 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -34,7 +34,6 @@ extern int fpu__copy(struct task_struct *dst, struct task_struct *src); extern void fpu__clear_user_states(struct fpu *fpu); extern void fpu__clear_all(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); -extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate); /* * Boot time FPU initialization functions: @@ -274,7 +273,7 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) */ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) { - u64 mask = -1; + u64 mask = xfeatures_mask_all; u32 lmask = mask; u32 hmask = mask >> 32; int err; @@ -320,7 +319,7 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) */ static inline void copy_xregs_to_kernel(struct xregs_state *xstate) { - u64 mask = -1; + u64 mask = xfeatures_mask_all; u32 lmask = mask; u32 hmask = mask >> 32; int err; @@ -356,6 +355,9 @@ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) */ static inline int copy_xregs_to_user(struct xregs_state __user *buf) { + u64 mask = xfeatures_mask_user(); + u32 lmask = mask; + u32 hmask = mask >> 32; int err; /* @@ -367,7 +369,7 @@ static inline int copy_xregs_to_user(struct xregs_state __user *buf) return -EFAULT; stac(); - XSTATE_OP(XSAVE, buf, -1, -1, err); + XSTATE_OP(XSAVE, buf, lmask, hmask, err); clac(); return err; @@ -408,43 +410,7 @@ static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask) return err; } -/* - * These must be called with preempt disabled. 
Returns - * 'true' if the FPU state is still intact and we can - * keep registers active. - * - * The legacy FNSAVE instruction cleared all FPU state - * unconditionally, so registers are essentially destroyed. - * Modern FPU state can be kept in registers, if there are - * no pending FP exceptions. - */ -static inline int copy_fpregs_to_fpstate(struct fpu *fpu) -{ - if (likely(use_xsave())) { - copy_xregs_to_kernel(&fpu->state.xsave); - - /* - * AVX512 state is tracked here because its use is - * known to slow the max clock speed of the core. - */ - if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) - fpu->avx512_timestamp = jiffies; - return 1; - } - - if (likely(use_fxsr())) { - copy_fxregs_to_kernel(fpu); - return 1; - } - - /* - * Legacy FPU register saving, FNSAVE always clears FPU registers, - * so we have to mark them inactive: - */ - asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); - - return 0; -} +extern int copy_fpregs_to_fpstate(struct fpu *fpu); static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) { diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h index d5bdffb9d27f..4f928d6a367b 100644 --- a/arch/x86/include/asm/fpu/regset.h +++ b/arch/x86/include/asm/fpu/regset.h @@ -8,8 +8,8 @@ #include <linux/regset.h> extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active; -extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, - xstateregs_get; +extern user_regset_get2_fn fpregs_get, xfpregs_get, fpregs_soft_get, + xstateregs_get; extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, xstateregs_set; diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index f098f6cab94b..c87364ea6446 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -114,6 +114,12 @@ enum xfeature { XFEATURE_Hi16_ZMM, XFEATURE_PT_UNIMPLEMENTED_SO_FAR, XFEATURE_PKRU, + XFEATURE_RSRVD_COMP_10, + XFEATURE_RSRVD_COMP_11, + XFEATURE_RSRVD_COMP_12, + XFEATURE_RSRVD_COMP_13, + XFEATURE_RSRVD_COMP_14, + XFEATURE_LBR, XFEATURE_MAX, }; @@ -128,6 +134,7 @@ enum xfeature { #define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM) #define XFEATURE_MASK_PT (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR) #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU) +#define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) #define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) #define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \ @@ -229,6 +236,26 @@ struct pkru_state { u32 pad; } __packed; +/* + * State component 15: Architectural LBR configuration state. + * The size of Arch LBR state depends on the number of LBRs (lbr_depth). 
+ */ + +struct lbr_entry { + u64 from; + u64 to; + u64 info; +}; + +struct arch_lbr_state { + u64 lbr_ctl; + u64 lbr_depth; + u64 ler_from; + u64 ler_to; + u64 ler_info; + struct lbr_entry entries[]; +} __packed; + struct xstate_header { u64 xfeatures; u64 xcomp_bv; diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 422d8369012a..14ab815132d4 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -21,6 +21,8 @@ #define XSAVE_YMM_SIZE 256 #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) +#define XSAVE_ALIGNMENT 64 + /* All currently supported user features */ #define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \ XFEATURE_MASK_SSE | \ @@ -36,6 +38,27 @@ #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (0) /* + * A supervisor state component may not always contain valuable information, + * and its size may be huge. Saving/restoring such supervisor state components + * at each context switch can cause high CPU and space overhead, which should + * be avoided. Such supervisor state components should only be saved/restored + * on demand. The on-demand dynamic supervisor features are set in this mask. + * + * Unlike the existing supported supervisor features, a dynamic supervisor + * feature does not allocate a buffer in task->fpu, and the corresponding + * supervisor state component cannot be saved/restored at each context switch. + * + * To support a dynamic supervisor feature, a developer should follow the + * dos and don'ts as below: + * - Do dynamically allocate a buffer for the supervisor state component. + * - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the + * state component to/from the buffer. + * - Don't set the bit corresponding to the dynamic supervisor feature in + * IA32_XSS at run time, since it has been set at boot time. + */ +#define XFEATURE_MASK_DYNAMIC (XFEATURE_MASK_LBR) + +/* * Unsupported supervisor features. When a supervisor feature in this mask is * supported in the future, move it to the supported supervisor feature mask. */ @@ -43,6 +66,7 @@ /* All supervisor states including supported and unsupported states. 
*/ #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ + XFEATURE_MASK_DYNAMIC | \ XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) #ifdef CONFIG_X86_64 @@ -63,6 +87,14 @@ static inline u64 xfeatures_mask_user(void) return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED; } +static inline u64 xfeatures_mask_dynamic(void) +{ + if (!boot_cpu_has(X86_FEATURE_ARCH_LBR)) + return XFEATURE_MASK_DYNAMIC & ~XFEATURE_MASK_LBR; + + return XFEATURE_MASK_DYNAMIC; +} + extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, @@ -71,11 +103,15 @@ extern void __init update_regset_xstate_info(unsigned int size, void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); const void *get_xsave_field_ptr(int xfeature_nr); int using_compacted_format(void); -int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); -int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); +int xfeature_size(int xfeature_nr); +struct membuf; +void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave); int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); void copy_supervisor_to_kernel(struct xregs_state *xsave); +void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask); +void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask); + /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ int validate_user_xstate_header(const struct xstate_header *hdr); diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h index bca4c743de77..d552646411a9 100644 --- a/arch/x86/include/asm/fsgsbase.h +++ b/arch/x86/include/asm/fsgsbase.h @@ -19,36 +19,65 @@ extern unsigned long x86_gsbase_read_task(struct task_struct *task); extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase); extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase); -/* Helper functions for reading/writing FS/GS base */ +/* Must be protected by X86_FEATURE_FSGSBASE check. 
*/ -static inline unsigned long x86_fsbase_read_cpu(void) +static __always_inline unsigned long rdfsbase(void) { unsigned long fsbase; - rdmsrl(MSR_FS_BASE, fsbase); + asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory"); return fsbase; } -static inline unsigned long x86_gsbase_read_cpu_inactive(void) +static __always_inline unsigned long rdgsbase(void) { unsigned long gsbase; - rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory"); return gsbase; } -static inline void x86_fsbase_write_cpu(unsigned long fsbase) +static __always_inline void wrfsbase(unsigned long fsbase) { - wrmsrl(MSR_FS_BASE, fsbase); + asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory"); } -static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase) +static __always_inline void wrgsbase(unsigned long gsbase) { - wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory"); } +#include <asm/cpufeature.h> + +/* Helper functions for reading/writing FS/GS base */ + +static inline unsigned long x86_fsbase_read_cpu(void) +{ + unsigned long fsbase; + + if (static_cpu_has(X86_FEATURE_FSGSBASE)) + fsbase = rdfsbase(); + else + rdmsrl(MSR_FS_BASE, fsbase); + + return fsbase; +} + +static inline void x86_fsbase_write_cpu(unsigned long fsbase) +{ + if (static_cpu_has(X86_FEATURE_FSGSBASE)) + wrfsbase(fsbase); + else + wrmsrl(MSR_FS_BASE, fsbase); +} + +extern unsigned long x86_gsbase_read_cpu_inactive(void); +extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase); +extern unsigned long x86_fsgsbase_read_task(struct task_struct *task, + unsigned short selector); + #endif /* CONFIG_X86_64 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 07533795b8d2..275e7fd20310 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -67,12 +67,12 @@ static inline void kvm_set_cpu_l1tf_flush_l1d(void) __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1); } -static inline void kvm_clear_cpu_l1tf_flush_l1d(void) +static __always_inline void kvm_clear_cpu_l1tf_flush_l1d(void) { __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0); } -static inline bool kvm_get_cpu_l1tf_flush_l1d(void) +static __always_inline bool kvm_get_cpu_l1tf_flush_l1d(void) { return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d); } diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 80d3b30d3ee3..a43366191212 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -6,15 +6,13 @@ #include <asm/trapnr.h> #ifndef __ASSEMBLY__ +#include <linux/entry-common.h> #include <linux/hardirq.h> #include <asm/irq_stack.h> -void idtentry_enter_user(struct pt_regs *regs); -void idtentry_exit_user(struct pt_regs *regs); - -bool idtentry_enter_cond_rcu(struct pt_regs *regs); -void idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit); +bool idtentry_enter_nmi(struct pt_regs *regs); +void idtentry_exit_nmi(struct pt_regs *regs, bool irq_state); /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points @@ -45,8 +43,8 @@ void idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit); * The macro is written so it acts as function definition. Append the * body with a pair of curly brackets. * - * idtentry_enter() contains common code which has to be invoked before - * arbitrary code in the body. idtentry_exit() contains common code + * irqentry_enter() contains common code which has to be invoked before + * arbitrary code in the body. 
irqentry_exit() contains common code * which has to run before returning to the low level assembly code. */ #define DEFINE_IDTENTRY(func) \ @@ -54,12 +52,12 @@ static __always_inline void __##func(struct pt_regs *regs); \ \ __visible noinstr void func(struct pt_regs *regs) \ { \ - bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + irqentry_state_t state = irqentry_enter(regs); \ \ instrumentation_begin(); \ __##func (regs); \ instrumentation_end(); \ - idtentry_exit_cond_rcu(regs, rcu_exit); \ + irqentry_exit(regs, state); \ } \ \ static __always_inline void __##func(struct pt_regs *regs) @@ -101,12 +99,12 @@ static __always_inline void __##func(struct pt_regs *regs, \ __visible noinstr void func(struct pt_regs *regs, \ unsigned long error_code) \ { \ - bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + irqentry_state_t state = irqentry_enter(regs); \ \ instrumentation_begin(); \ __##func (regs, error_code); \ instrumentation_end(); \ - idtentry_exit_cond_rcu(regs, rcu_exit); \ + irqentry_exit(regs, state); \ } \ \ static __always_inline void __##func(struct pt_regs *regs, \ @@ -161,7 +159,7 @@ __visible noinstr void func(struct pt_regs *regs) * body with a pair of curly brackets. * * Contrary to DEFINE_IDTENTRY_ERRORCODE() this does not invoke the - * idtentry_enter/exit() helpers before and after the body invocation. This + * irqentry_enter/exit() helpers before and after the body invocation. This * needs to be done in the body itself if applicable. Use if extra work * is required before the enter/exit() helpers are invoked. */ @@ -187,11 +185,9 @@ __visible noinstr void func(struct pt_regs *regs, unsigned long error_code) * to the function as error_code argument which needs to be truncated * to an u8 because the push is sign extending. * - * On 64-bit idtentry_enter/exit() are invoked in the ASM entry code before - * and after switching to the interrupt stack. On 32-bit this happens in C. - * * irq_enter/exit_rcu() are invoked before the function body and the - * KVM L1D flush request is set. + * KVM L1D flush request is set. Stack switching to the interrupt stack + * has to be done in the function body if necessary. */ #define DEFINE_IDTENTRY_IRQ(func) \ static __always_inline void __##func(struct pt_regs *regs, u8 vector); \ @@ -199,7 +195,7 @@ static __always_inline void __##func(struct pt_regs *regs, u8 vector); \ __visible noinstr void func(struct pt_regs *regs, \ unsigned long error_code) \ { \ - bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + irqentry_state_t state = irqentry_enter(regs); \ \ instrumentation_begin(); \ irq_enter_rcu(); \ @@ -207,7 +203,7 @@ __visible noinstr void func(struct pt_regs *regs, \ __##func (regs, (u8)error_code); \ irq_exit_rcu(); \ instrumentation_end(); \ - idtentry_exit_cond_rcu(regs, rcu_exit); \ + irqentry_exit(regs, state); \ } \ \ static __always_inline void __##func(struct pt_regs *regs, u8 vector) @@ -231,7 +227,7 @@ static __always_inline void __##func(struct pt_regs *regs, u8 vector) * DEFINE_IDTENTRY_SYSVEC - Emit code for system vector IDT entry points * @func: Function name of the entry point * - * idtentry_enter/exit() and irq_enter/exit_rcu() are invoked before the + * irqentry_enter/exit() and irq_enter/exit_rcu() are invoked before the * function body. KVM L1D flush request is set. 
* * Runs the function on the interrupt stack if the entry hit kernel mode @@ -241,7 +237,7 @@ static void __##func(struct pt_regs *regs); \ \ __visible noinstr void func(struct pt_regs *regs) \ { \ - bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + irqentry_state_t state = irqentry_enter(regs); \ \ instrumentation_begin(); \ irq_enter_rcu(); \ @@ -249,7 +245,7 @@ __visible noinstr void func(struct pt_regs *regs) \ run_on_irqstack_cond(__##func, regs, regs); \ irq_exit_rcu(); \ instrumentation_end(); \ - idtentry_exit_cond_rcu(regs, rcu_exit); \ + irqentry_exit(regs, state); \ } \ \ static noinline void __##func(struct pt_regs *regs) @@ -270,7 +266,7 @@ static __always_inline void __##func(struct pt_regs *regs); \ \ __visible noinstr void func(struct pt_regs *regs) \ { \ - bool rcu_exit = idtentry_enter_cond_rcu(regs); \ + irqentry_state_t state = irqentry_enter(regs); \ \ instrumentation_begin(); \ __irq_enter_raw(); \ @@ -278,7 +274,7 @@ __visible noinstr void func(struct pt_regs *regs) \ __##func (regs); \ __irq_exit_raw(); \ instrumentation_end(); \ - idtentry_exit_cond_rcu(regs, rcu_exit); \ + irqentry_exit(regs, state); \ } \ \ static __always_inline void __##func(struct pt_regs *regs) @@ -636,6 +632,10 @@ DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback); DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_xen_hvm_callback); #endif +#ifdef CONFIG_KVM_GUEST +DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt); +#endif + #undef X86_TRAP_OTHER #endif diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index f5a796da07f8..bd7f02480ca1 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -12,7 +12,6 @@ #define REG_TYPE_R32 0 #define REG_TYPE_R64 1 -#define REG_TYPE_XMM 2 #define REG_TYPE_INVALID 100 .macro R32_NUM opd r32 @@ -123,77 +122,18 @@ #endif .endm - .macro XMM_NUM opd xmm - \opd = REG_NUM_INVALID - .ifc \xmm,%xmm0 - \opd = 0 - .endif - .ifc \xmm,%xmm1 - \opd = 1 - .endif - .ifc \xmm,%xmm2 - \opd = 2 - .endif - .ifc \xmm,%xmm3 - \opd = 3 - .endif - .ifc \xmm,%xmm4 - \opd = 4 - .endif - .ifc \xmm,%xmm5 - \opd = 5 - .endif - .ifc \xmm,%xmm6 - \opd = 6 - .endif - .ifc \xmm,%xmm7 - \opd = 7 - .endif - .ifc \xmm,%xmm8 - \opd = 8 - .endif - .ifc \xmm,%xmm9 - \opd = 9 - .endif - .ifc \xmm,%xmm10 - \opd = 10 - .endif - .ifc \xmm,%xmm11 - \opd = 11 - .endif - .ifc \xmm,%xmm12 - \opd = 12 - .endif - .ifc \xmm,%xmm13 - \opd = 13 - .endif - .ifc \xmm,%xmm14 - \opd = 14 - .endif - .ifc \xmm,%xmm15 - \opd = 15 - .endif - .endm - .macro REG_TYPE type reg R32_NUM reg_type_r32 \reg R64_NUM reg_type_r64 \reg - XMM_NUM reg_type_xmm \reg .if reg_type_r64 <> REG_NUM_INVALID \type = REG_TYPE_R64 .elseif reg_type_r32 <> REG_NUM_INVALID \type = REG_TYPE_R32 - .elseif reg_type_xmm <> REG_NUM_INVALID - \type = REG_TYPE_XMM .else \type = REG_TYPE_INVALID .endif .endm - .macro PFX_OPD_SIZE - .byte 0x66 - .endm - .macro PFX_REX opd1 opd2 W=0 .if ((\opd1 | \opd2) & 8) || \W .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3) @@ -204,108 +144,20 @@ .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3) .endm - .macro PSHUFB_XMM xmm1 xmm2 - XMM_NUM pshufb_opd1 \xmm1 - XMM_NUM pshufb_opd2 \xmm2 - PFX_OPD_SIZE - PFX_REX pshufb_opd1 pshufb_opd2 - .byte 0x0f, 0x38, 0x00 - MODRM 0xc0 pshufb_opd1 pshufb_opd2 - .endm - - .macro PCLMULQDQ imm8 xmm1 xmm2 - XMM_NUM clmul_opd1 \xmm1 - XMM_NUM clmul_opd2 \xmm2 - PFX_OPD_SIZE - PFX_REX clmul_opd1 clmul_opd2 - .byte 0x0f, 0x3a, 0x44 - MODRM 0xc0 clmul_opd1 
clmul_opd2 - .byte \imm8 - .endm - - .macro PEXTRD imm8 xmm gpr - R32_NUM extrd_opd1 \gpr - XMM_NUM extrd_opd2 \xmm - PFX_OPD_SIZE - PFX_REX extrd_opd1 extrd_opd2 - .byte 0x0f, 0x3a, 0x16 - MODRM 0xc0 extrd_opd1 extrd_opd2 - .byte \imm8 - .endm - - .macro AESKEYGENASSIST rcon xmm1 xmm2 - XMM_NUM aeskeygen_opd1 \xmm1 - XMM_NUM aeskeygen_opd2 \xmm2 - PFX_OPD_SIZE - PFX_REX aeskeygen_opd1 aeskeygen_opd2 - .byte 0x0f, 0x3a, 0xdf - MODRM 0xc0 aeskeygen_opd1 aeskeygen_opd2 - .byte \rcon - .endm - - .macro AESIMC xmm1 xmm2 - XMM_NUM aesimc_opd1 \xmm1 - XMM_NUM aesimc_opd2 \xmm2 - PFX_OPD_SIZE - PFX_REX aesimc_opd1 aesimc_opd2 - .byte 0x0f, 0x38, 0xdb - MODRM 0xc0 aesimc_opd1 aesimc_opd2 - .endm - - .macro AESENC xmm1 xmm2 - XMM_NUM aesenc_opd1 \xmm1 - XMM_NUM aesenc_opd2 \xmm2 - PFX_OPD_SIZE - PFX_REX aesenc_opd1 aesenc_opd2 - .byte 0x0f, 0x38, 0xdc - MODRM 0xc0 aesenc_opd1 aesenc_opd2 - .endm - - .macro AESENCLAST xmm1 xmm2 - XMM_NUM aesenclast_opd1 \xmm1 - XMM_NUM aesenclast_opd2 \xmm2 - PFX_OPD_SIZE - PFX_REX aesenclast_opd1 aesenclast_opd2 - .byte 0x0f, 0x38, 0xdd - MODRM 0xc0 aesenclast_opd1 aesenclast_opd2 - .endm - - .macro AESDEC xmm1 xmm2 - XMM_NUM aesdec_opd1 \xmm1 - XMM_NUM aesdec_opd2 \xmm2 - PFX_OPD_SIZE - PFX_REX aesdec_opd1 aesdec_opd2 - .byte 0x0f, 0x38, 0xde - MODRM 0xc0 aesdec_opd1 aesdec_opd2 - .endm - - .macro AESDECLAST xmm1 xmm2 - XMM_NUM aesdeclast_opd1 \xmm1 - XMM_NUM aesdeclast_opd2 \xmm2 - PFX_OPD_SIZE - PFX_REX aesdeclast_opd1 aesdeclast_opd2 - .byte 0x0f, 0x38, 0xdf - MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2 - .endm - - .macro MOVQ_R64_XMM opd1 opd2 - REG_TYPE movq_r64_xmm_opd1_type \opd1 - .if movq_r64_xmm_opd1_type == REG_TYPE_XMM - XMM_NUM movq_r64_xmm_opd1 \opd1 - R64_NUM movq_r64_xmm_opd2 \opd2 +.macro RDPID opd + REG_TYPE rdpid_opd_type \opd + .if rdpid_opd_type == REG_TYPE_R64 + R64_NUM rdpid_opd \opd .else - R64_NUM movq_r64_xmm_opd1 \opd1 - XMM_NUM movq_r64_xmm_opd2 \opd2 + R32_NUM rdpid_opd \opd .endif - PFX_OPD_SIZE - PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1 - .if movq_r64_xmm_opd1_type == REG_TYPE_XMM - .byte 0x0f, 0x7e - .else - .byte 0x0f, 0x6e + .byte 0xf3 + .if rdpid_opd > 7 + PFX_REX rdpid_opd 0 .endif - MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2 - .endm + .byte 0x0f, 0xc7 + MODRM 0xc0 rdpid_opd 0x7 +.endm #endif #endif diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index a338a6deb950..5e658ba2654a 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -89,8 +89,15 @@ #define INTEL_FAM6_COMETLAKE 0xA5 #define INTEL_FAM6_COMETLAKE_L 0xA6 +#define INTEL_FAM6_ROCKETLAKE 0xA7 + #define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F +/* Hybrid Core/Atom Processors */ + +#define INTEL_FAM6_LAKEFIELD 0x8A +#define INTEL_FAM6_ALDERLAKE 0x97 + /* "Small Core" Processors (Atom) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index fd20a2334885..a1a26f6d3aa4 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -99,7 +99,6 @@ struct IR_IO_APIC_route_entry { struct irq_alloc_info; struct ioapic_domain_cfg; -#define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 247ab14c6309..d1514e70477b 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -36,8 +36,9 @@ extern void die(const char *, struct pt_regs *,long); void die_addr(const char *str, 
struct pt_regs *regs, long err, long gp_addr); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_stack_regs(struct pt_regs *regs); -extern void __show_regs(struct pt_regs *regs, enum show_regs_mode); -extern void show_iret_regs(struct pt_regs *regs); +extern void __show_regs(struct pt_regs *regs, enum show_regs_mode, + const char *log_lvl); +extern void show_iret_regs(struct pt_regs *regs, const char *log_lvl); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 073eb7ad2f56..143bc9abe99c 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -66,6 +66,8 @@ struct arch_specific_insn { */ bool boostable; bool if_modifier; + /* Number of bytes of text poked */ + int tp_len; }; struct arch_optimized_insn { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index be5363b21540..5ab3af7275d8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -193,8 +193,6 @@ struct x86_exception; enum x86_intercept; enum x86_intercept_stage; -#define KVM_NR_MEM_OBJS 40 - #define KVM_NR_DB_REGS 4 #define DR6_BD (1 << 13) @@ -246,15 +244,6 @@ enum x86_intercept_stage; struct kvm_kernel_irq_routing_entry; /* - * We don't want allocation failures within the mmu code, so we preallocate - * enough memory for a single page fault in a cache. - */ -struct kvm_mmu_memory_cache { - int nobjs; - void *objects[KVM_NR_MEM_OBJS]; -}; - -/* * the pages used as guest page table on soft mmu are tracked by * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used * by indirect shadow page can not be more than 15 bits. @@ -322,43 +311,6 @@ struct kvm_rmap_head { unsigned long val; }; -struct kvm_mmu_page { - struct list_head link; - struct hlist_node hash_link; - struct list_head lpage_disallowed_link; - - bool unsync; - u8 mmu_valid_gen; - bool mmio_cached; - bool lpage_disallowed; /* Can't be replaced by an equiv large page */ - - /* - * The following two entries are used to key the shadow page in the - * hash table. - */ - union kvm_mmu_page_role role; - gfn_t gfn; - - u64 *spt; - /* hold the gfn of each spte inside spt */ - gfn_t *gfns; - int root_count; /* Currently serving as active root */ - unsigned int unsync_children; - struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ - DECLARE_BITMAP(unsync_child_bitmap, 512); - -#ifdef CONFIG_X86_32 - /* - * Used out of the mmu-lock to avoid reading spte values while an - * update is in progress; see the comments in __get_spte_lockless(). - */ - int clear_spte_count; -#endif - - /* Number of writes since the last time traversal visited this page. */ - atomic_t write_flooding_count; -}; - struct kvm_pio_request { unsigned long linear_rip; unsigned long count; @@ -384,6 +336,8 @@ struct kvm_mmu_root_info { #define KVM_MMU_NUM_PREV_ROOTS 3 +struct kvm_mmu_page; + /* * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, * and 2-level 32-bit). 
The kvm_mmu structure abstracts the details of the @@ -580,6 +534,7 @@ struct kvm_vcpu_arch { unsigned long cr3; unsigned long cr4; unsigned long cr4_guest_owned_bits; + unsigned long cr4_guest_rsvd_bits; unsigned long cr8; u32 host_pkru; u32 pkru; @@ -635,7 +590,8 @@ struct kvm_vcpu_arch { struct kvm_mmu *walk_mmu; struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; - struct kvm_mmu_memory_cache mmu_page_cache; + struct kvm_mmu_memory_cache mmu_shadow_page_cache; + struct kvm_mmu_memory_cache mmu_gfn_array_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; /* @@ -683,7 +639,7 @@ struct kvm_vcpu_arch { struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; int maxphyaddr; - int tdp_level; + int max_tdp_level; /* emulate context */ @@ -827,6 +783,9 @@ struct kvm_vcpu_arch { /* Flush the L1 Data cache for L1TF mitigation on VMENTER */ bool l1tf_flush_l1d; + /* Host CPU on which VM-entry was most recently attempted */ + unsigned int last_vmentry_cpu; + /* AMD MSRC001_0015 Hardware Configuration */ u64 msr_hwcr; }; @@ -1083,7 +1042,7 @@ struct kvm_x86_ops { void (*hardware_unsetup)(void); bool (*cpu_has_accelerated_tpr)(void); bool (*has_emulated_msr)(u32 index); - void (*cpuid_update)(struct kvm_vcpu *vcpu); + void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); unsigned int vm_size; int (*vm_init)(struct kvm *kvm); @@ -1098,7 +1057,7 @@ struct kvm_x86_ops { void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); - void (*update_bp_intercept)(struct kvm_vcpu *vcpu); + void (*update_exception_bitmap)(struct kvm_vcpu *vcpu); int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); @@ -1174,10 +1133,10 @@ struct kvm_x86_ops { int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); - int (*get_tdp_level)(struct kvm_vcpu *vcpu); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); - void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long cr3); + void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd, + int pgd_level); bool (*has_wbinvd_exit)(void); @@ -1220,7 +1179,6 @@ struct kvm_x86_ops { void (*enable_log_dirty_pt_masked)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t offset, unsigned long mask); - int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa); /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; @@ -1281,6 +1239,7 @@ struct kvm_x86_nested_ops { struct kvm_nested_state __user *user_kvm_nested_state, struct kvm_nested_state *kvm_state); bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa); int (*enable_evmcs)(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); @@ -1304,7 +1263,7 @@ struct kvm_arch_async_pf { }; extern u64 __read_mostly host_efer; - +extern bool __read_mostly allow_smaller_maxphyaddr; extern struct kvm_x86_ops kvm_x86_ops; #define __KVM_HAVE_ARCH_VM_ALLOC @@ -1549,20 +1508,8 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, bool skip_mmu_sync); -void kvm_configure_mmu(bool enable_tdp, int tdp_page_level); - -static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, - struct x86_exception *exception) -{ - return gpa; -} - -static inline struct kvm_mmu_page 
*page_header(hpa_t shadow_page) -{ - struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); - - return (struct kvm_mmu_page *)page_private(page); -} +void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, + int tdp_huge_page_level); static inline u16 kvm_read_ldt(void) { @@ -1636,7 +1583,15 @@ asmlinkage void kvm_spurious_fault(void); insn "\n\t" \ "jmp 668f \n\t" \ "667: \n\t" \ + "1: \n\t" \ + ".pushsection .discard.instr_begin \n\t" \ + ".long 1b - . \n\t" \ + ".popsection \n\t" \ "call kvm_spurious_fault \n\t" \ + "1: \n\t" \ + ".pushsection .discard.instr_end \n\t" \ + ".long 1b - . \n\t" \ + ".popsection \n\t" \ "668: \n\t" \ _ASM_EXTABLE(666b, 667b) diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 49d3a9edb06f..338119852512 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -4,6 +4,7 @@ #include <asm/processor.h> #include <asm/alternative.h> +#include <linux/interrupt.h> #include <uapi/asm/kvm_para.h> extern void kvmclock_init(void); @@ -18,7 +19,7 @@ static inline bool kvm_check_and_clear_guest_paused(void) #endif /* CONFIG_KVM_GUEST */ #define KVM_HYPERCALL \ - ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL) + ALTERNATIVE("vmcall", "vmmcall", X86_FEATURE_VMMCALL) /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall * instruction. The hypervisor may replace it with something else but only the diff --git a/arch/x86/include/asm/kvm_types.h b/arch/x86/include/asm/kvm_types.h new file mode 100644 index 000000000000..08f1b57d3b62 --- /dev/null +++ b/arch/x86/include/asm/kvm_types.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_KVM_TYPES_H +#define _ASM_X86_KVM_TYPES_H + +#define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40 + +#endif /* _ASM_X86_KVM_TYPES_H */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 848ce43b9040..5049f6c22683 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -43,9 +43,10 @@ void __init sme_enable(struct boot_params *bp); int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size); int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); +void __init mem_encrypt_free_decrypted_mem(void); + /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void); -void __init mem_encrypt_free_decrypted_mem(void); bool sme_active(void); bool sev_active(void); @@ -77,6 +78,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; static inline int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } +static inline void mem_encrypt_free_decrypted_mem(void) { } + #define __bss_decrypted #endif /* CONFIG_AMD_MEM_ENCRYPT */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 47562147e70b..d98016b83755 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -9,7 +9,6 @@ #include <trace/events/tlb.h> -#include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/paravirt.h> #include <asm/debugreg.h> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index e8370e64a155..2859ee4f39a8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -149,6 +149,10 @@ #define MSR_LBR_SELECT 0x000001c8 #define MSR_LBR_TOS 0x000001c9 + +#define MSR_IA32_POWER_CTL 0x000001fc +#define 
MSR_IA32_POWER_CTL_BIT_EE 19 + #define MSR_LBR_NHM_FROM 0x00000680 #define MSR_LBR_NHM_TO 0x000006c0 #define MSR_LBR_CORE_FROM 0x00000040 @@ -158,7 +162,23 @@ #define LBR_INFO_MISPRED BIT_ULL(63) #define LBR_INFO_IN_TX BIT_ULL(62) #define LBR_INFO_ABORT BIT_ULL(61) +#define LBR_INFO_CYC_CNT_VALID BIT_ULL(60) #define LBR_INFO_CYCLES 0xffff +#define LBR_INFO_BR_TYPE_OFFSET 56 +#define LBR_INFO_BR_TYPE (0xfull << LBR_INFO_BR_TYPE_OFFSET) + +#define MSR_ARCH_LBR_CTL 0x000014ce +#define ARCH_LBR_CTL_LBREN BIT(0) +#define ARCH_LBR_CTL_CPL_OFFSET 1 +#define ARCH_LBR_CTL_CPL (0x3ull << ARCH_LBR_CTL_CPL_OFFSET) +#define ARCH_LBR_CTL_STACK_OFFSET 3 +#define ARCH_LBR_CTL_STACK (0x1ull << ARCH_LBR_CTL_STACK_OFFSET) +#define ARCH_LBR_CTL_FILTER_OFFSET 16 +#define ARCH_LBR_CTL_FILTER (0x7full << ARCH_LBR_CTL_FILTER_OFFSET) +#define MSR_ARCH_LBR_DEPTH 0x000014cf +#define MSR_ARCH_LBR_FROM_0 0x00001500 +#define MSR_ARCH_LBR_TO_0 0x00001600 +#define MSR_ARCH_LBR_INFO_0 0x00001200 #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_PEBS_DATA_CFG 0x000003f2 @@ -253,8 +273,6 @@ #define MSR_PEBS_FRONTEND 0x000003f7 -#define MSR_IA32_POWER_CTL 0x000001fc - #define MSR_IA32_MC0_CTL 0x00000400 #define MSR_IA32_MC0_STATUS 0x00000401 #define MSR_IA32_MC0_ADDR 0x00000402 @@ -418,7 +436,6 @@ #define MSR_AMD64_PATCH_LEVEL 0x0000008b #define MSR_AMD64_TSC_RATIO 0xc0000104 #define MSR_AMD64_NB_CFG 0xc001001f -#define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_PATCH_LOADER 0xc0010020 #define MSR_AMD_PERF_CTL 0xc0010062 #define MSR_AMD_PERF_STATUS 0xc0010063 @@ -427,6 +444,7 @@ #define MSR_AMD64_OSVW_STATUS 0xc0010141 #define MSR_AMD_PPIN_CTL 0xc00102f0 #define MSR_AMD_PPIN 0xc00102f1 +#define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 #define MSR_AMD64_BU_CFG2 0xc001102a @@ -466,6 +484,8 @@ #define MSR_F16H_DR0_ADDR_MASK 0xc0011027 /* Fam 15h MSRs */ +#define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a +#define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b #define MSR_F15H_PERF_CTL 0xc0010200 #define MSR_F15H_PERF_CTL0 MSR_F15H_PERF_CTL #define MSR_F15H_PERF_CTL1 (MSR_F15H_PERF_CTL + 2) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 2278797c769d..a3c33b79fb86 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -4,33 +4,15 @@ #ifdef CONFIG_X86_64 #define __percpu_seg gs -#define __percpu_mov_op movq #else #define __percpu_seg fs -#define __percpu_mov_op movl #endif #ifdef __ASSEMBLY__ -/* - * PER_CPU finds an address of a per-cpu variable. - * - * Args: - * var - variable name - * reg - 32bit register - * - * The resulting address is stored in the "reg" argument. - * - * Example: - * PER_CPU(cpu_gdt_descr, %ebx) - */ #ifdef CONFIG_SMP -#define PER_CPU(var, reg) \ - __percpu_mov_op %__percpu_seg:this_cpu_off, reg; \ - lea var(reg), reg #define PER_CPU_VAR(var) %__percpu_seg:var #else /* ! SMP */ -#define PER_CPU(var, reg) __percpu_mov_op $var, reg #define PER_CPU_VAR(var) var #endif /* SMP */ @@ -85,213 +67,108 @@ /* For arch-specific code, we can use direct single-insn ops (they * don't give an lvalue though). 
*/ -extern void __bad_percpu_size(void); - -#define percpu_to_op(qual, op, var, val) \ -do { \ - typedef typeof(var) pto_T__; \ - if (0) { \ - pto_T__ pto_tmp__; \ - pto_tmp__ = (val); \ - (void)pto_tmp__; \ - } \ - switch (sizeof(var)) { \ - case 1: \ - asm qual (op "b %1,"__percpu_arg(0) \ - : "+m" (var) \ - : "qi" ((pto_T__)(val))); \ - break; \ - case 2: \ - asm qual (op "w %1,"__percpu_arg(0) \ - : "+m" (var) \ - : "ri" ((pto_T__)(val))); \ - break; \ - case 4: \ - asm qual (op "l %1,"__percpu_arg(0) \ - : "+m" (var) \ - : "ri" ((pto_T__)(val))); \ - break; \ - case 8: \ - asm qual (op "q %1,"__percpu_arg(0) \ - : "+m" (var) \ - : "re" ((pto_T__)(val))); \ - break; \ - default: __bad_percpu_size(); \ - } \ + +#define __pcpu_type_1 u8 +#define __pcpu_type_2 u16 +#define __pcpu_type_4 u32 +#define __pcpu_type_8 u64 + +#define __pcpu_cast_1(val) ((u8)(((unsigned long) val) & 0xff)) +#define __pcpu_cast_2(val) ((u16)(((unsigned long) val) & 0xffff)) +#define __pcpu_cast_4(val) ((u32)(((unsigned long) val) & 0xffffffff)) +#define __pcpu_cast_8(val) ((u64)(val)) + +#define __pcpu_op1_1(op, dst) op "b " dst +#define __pcpu_op1_2(op, dst) op "w " dst +#define __pcpu_op1_4(op, dst) op "l " dst +#define __pcpu_op1_8(op, dst) op "q " dst + +#define __pcpu_op2_1(op, src, dst) op "b " src ", " dst +#define __pcpu_op2_2(op, src, dst) op "w " src ", " dst +#define __pcpu_op2_4(op, src, dst) op "l " src ", " dst +#define __pcpu_op2_8(op, src, dst) op "q " src ", " dst + +#define __pcpu_reg_1(mod, x) mod "q" (x) +#define __pcpu_reg_2(mod, x) mod "r" (x) +#define __pcpu_reg_4(mod, x) mod "r" (x) +#define __pcpu_reg_8(mod, x) mod "r" (x) + +#define __pcpu_reg_imm_1(x) "qi" (x) +#define __pcpu_reg_imm_2(x) "ri" (x) +#define __pcpu_reg_imm_4(x) "ri" (x) +#define __pcpu_reg_imm_8(x) "re" (x) + +#define percpu_to_op(size, qual, op, _var, _val) \ +do { \ + __pcpu_type_##size pto_val__ = __pcpu_cast_##size(_val); \ + if (0) { \ + typeof(_var) pto_tmp__; \ + pto_tmp__ = (_val); \ + (void)pto_tmp__; \ + } \ + asm qual(__pcpu_op2_##size(op, "%[val]", __percpu_arg([var])) \ + : [var] "+m" (_var) \ + : [val] __pcpu_reg_imm_##size(pto_val__)); \ } while (0) +#define percpu_unary_op(size, qual, op, _var) \ +({ \ + asm qual (__pcpu_op1_##size(op, __percpu_arg([var])) \ + : [var] "+m" (_var)); \ +}) + /* * Generate a percpu add to memory instruction and optimize code * if one is added or subtracted. */ -#define percpu_add_op(qual, var, val) \ +#define percpu_add_op(size, qual, var, val) \ do { \ - typedef typeof(var) pao_T__; \ const int pao_ID__ = (__builtin_constant_p(val) && \ ((val) == 1 || (val) == -1)) ? 
\ (int)(val) : 0; \ if (0) { \ - pao_T__ pao_tmp__; \ + typeof(var) pao_tmp__; \ pao_tmp__ = (val); \ (void)pao_tmp__; \ } \ - switch (sizeof(var)) { \ - case 1: \ - if (pao_ID__ == 1) \ - asm qual ("incb "__percpu_arg(0) : "+m" (var)); \ - else if (pao_ID__ == -1) \ - asm qual ("decb "__percpu_arg(0) : "+m" (var)); \ - else \ - asm qual ("addb %1, "__percpu_arg(0) \ - : "+m" (var) \ - : "qi" ((pao_T__)(val))); \ - break; \ - case 2: \ - if (pao_ID__ == 1) \ - asm qual ("incw "__percpu_arg(0) : "+m" (var)); \ - else if (pao_ID__ == -1) \ - asm qual ("decw "__percpu_arg(0) : "+m" (var)); \ - else \ - asm qual ("addw %1, "__percpu_arg(0) \ - : "+m" (var) \ - : "ri" ((pao_T__)(val))); \ - break; \ - case 4: \ - if (pao_ID__ == 1) \ - asm qual ("incl "__percpu_arg(0) : "+m" (var)); \ - else if (pao_ID__ == -1) \ - asm qual ("decl "__percpu_arg(0) : "+m" (var)); \ - else \ - asm qual ("addl %1, "__percpu_arg(0) \ - : "+m" (var) \ - : "ri" ((pao_T__)(val))); \ - break; \ - case 8: \ - if (pao_ID__ == 1) \ - asm qual ("incq "__percpu_arg(0) : "+m" (var)); \ - else if (pao_ID__ == -1) \ - asm qual ("decq "__percpu_arg(0) : "+m" (var)); \ - else \ - asm qual ("addq %1, "__percpu_arg(0) \ - : "+m" (var) \ - : "re" ((pao_T__)(val))); \ - break; \ - default: __bad_percpu_size(); \ - } \ + if (pao_ID__ == 1) \ + percpu_unary_op(size, qual, "inc", var); \ + else if (pao_ID__ == -1) \ + percpu_unary_op(size, qual, "dec", var); \ + else \ + percpu_to_op(size, qual, "add", var, val); \ } while (0) -#define percpu_from_op(qual, op, var) \ -({ \ - typeof(var) pfo_ret__; \ - switch (sizeof(var)) { \ - case 1: \ - asm qual (op "b "__percpu_arg(1)",%0" \ - : "=q" (pfo_ret__) \ - : "m" (var)); \ - break; \ - case 2: \ - asm qual (op "w "__percpu_arg(1)",%0" \ - : "=r" (pfo_ret__) \ - : "m" (var)); \ - break; \ - case 4: \ - asm qual (op "l "__percpu_arg(1)",%0" \ - : "=r" (pfo_ret__) \ - : "m" (var)); \ - break; \ - case 8: \ - asm qual (op "q "__percpu_arg(1)",%0" \ - : "=r" (pfo_ret__) \ - : "m" (var)); \ - break; \ - default: __bad_percpu_size(); \ - } \ - pfo_ret__; \ -}) - -#define percpu_stable_op(op, var) \ -({ \ - typeof(var) pfo_ret__; \ - switch (sizeof(var)) { \ - case 1: \ - asm(op "b "__percpu_arg(P1)",%0" \ - : "=q" (pfo_ret__) \ - : "p" (&(var))); \ - break; \ - case 2: \ - asm(op "w "__percpu_arg(P1)",%0" \ - : "=r" (pfo_ret__) \ - : "p" (&(var))); \ - break; \ - case 4: \ - asm(op "l "__percpu_arg(P1)",%0" \ - : "=r" (pfo_ret__) \ - : "p" (&(var))); \ - break; \ - case 8: \ - asm(op "q "__percpu_arg(P1)",%0" \ - : "=r" (pfo_ret__) \ - : "p" (&(var))); \ - break; \ - default: __bad_percpu_size(); \ - } \ - pfo_ret__; \ +#define percpu_from_op(size, qual, op, _var) \ +({ \ + __pcpu_type_##size pfo_val__; \ + asm qual (__pcpu_op2_##size(op, __percpu_arg([var]), "%[val]") \ + : [val] __pcpu_reg_##size("=", pfo_val__) \ + : [var] "m" (_var)); \ + (typeof(_var))(unsigned long) pfo_val__; \ }) -#define percpu_unary_op(qual, op, var) \ -({ \ - switch (sizeof(var)) { \ - case 1: \ - asm qual (op "b "__percpu_arg(0) \ - : "+m" (var)); \ - break; \ - case 2: \ - asm qual (op "w "__percpu_arg(0) \ - : "+m" (var)); \ - break; \ - case 4: \ - asm qual (op "l "__percpu_arg(0) \ - : "+m" (var)); \ - break; \ - case 8: \ - asm qual (op "q "__percpu_arg(0) \ - : "+m" (var)); \ - break; \ - default: __bad_percpu_size(); \ - } \ +#define percpu_stable_op(size, op, _var) \ +({ \ + __pcpu_type_##size pfo_val__; \ + asm(__pcpu_op2_##size(op, __percpu_arg(P[var]), "%[val]") \ + : [val] __pcpu_reg_##size("=", pfo_val__) 
\ + : [var] "p" (&(_var))); \ + (typeof(_var))(unsigned long) pfo_val__; \ }) /* * Add return operation */ -#define percpu_add_return_op(qual, var, val) \ +#define percpu_add_return_op(size, qual, _var, _val) \ ({ \ - typeof(var) paro_ret__ = val; \ - switch (sizeof(var)) { \ - case 1: \ - asm qual ("xaddb %0, "__percpu_arg(1) \ - : "+q" (paro_ret__), "+m" (var) \ - : : "memory"); \ - break; \ - case 2: \ - asm qual ("xaddw %0, "__percpu_arg(1) \ - : "+r" (paro_ret__), "+m" (var) \ - : : "memory"); \ - break; \ - case 4: \ - asm qual ("xaddl %0, "__percpu_arg(1) \ - : "+r" (paro_ret__), "+m" (var) \ - : : "memory"); \ - break; \ - case 8: \ - asm qual ("xaddq %0, "__percpu_arg(1) \ - : "+re" (paro_ret__), "+m" (var) \ - : : "memory"); \ - break; \ - default: __bad_percpu_size(); \ - } \ - paro_ret__ += val; \ - paro_ret__; \ + __pcpu_type_##size paro_tmp__ = __pcpu_cast_##size(_val); \ + asm qual (__pcpu_op2_##size("xadd", "%[tmp]", \ + __percpu_arg([var])) \ + : [tmp] __pcpu_reg_##size("+", paro_tmp__), \ + [var] "+m" (_var) \ + : : "memory"); \ + (typeof(_var))(unsigned long) (paro_tmp__ + _val); \ }) /* @@ -299,85 +176,38 @@ do { \ * expensive due to the implied lock prefix. The processor cannot prefetch * cachelines if xchg is used. */ -#define percpu_xchg_op(qual, var, nval) \ +#define percpu_xchg_op(size, qual, _var, _nval) \ ({ \ - typeof(var) pxo_ret__; \ - typeof(var) pxo_new__ = (nval); \ - switch (sizeof(var)) { \ - case 1: \ - asm qual ("\n\tmov "__percpu_arg(1)",%%al" \ - "\n1:\tcmpxchgb %2, "__percpu_arg(1) \ - "\n\tjnz 1b" \ - : "=&a" (pxo_ret__), "+m" (var) \ - : "q" (pxo_new__) \ - : "memory"); \ - break; \ - case 2: \ - asm qual ("\n\tmov "__percpu_arg(1)",%%ax" \ - "\n1:\tcmpxchgw %2, "__percpu_arg(1) \ - "\n\tjnz 1b" \ - : "=&a" (pxo_ret__), "+m" (var) \ - : "r" (pxo_new__) \ - : "memory"); \ - break; \ - case 4: \ - asm qual ("\n\tmov "__percpu_arg(1)",%%eax" \ - "\n1:\tcmpxchgl %2, "__percpu_arg(1) \ - "\n\tjnz 1b" \ - : "=&a" (pxo_ret__), "+m" (var) \ - : "r" (pxo_new__) \ - : "memory"); \ - break; \ - case 8: \ - asm qual ("\n\tmov "__percpu_arg(1)",%%rax" \ - "\n1:\tcmpxchgq %2, "__percpu_arg(1) \ - "\n\tjnz 1b" \ - : "=&a" (pxo_ret__), "+m" (var) \ - : "r" (pxo_new__) \ - : "memory"); \ - break; \ - default: __bad_percpu_size(); \ - } \ - pxo_ret__; \ + __pcpu_type_##size pxo_old__; \ + __pcpu_type_##size pxo_new__ = __pcpu_cast_##size(_nval); \ + asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]), \ + "%[oval]") \ + "\n1:\t" \ + __pcpu_op2_##size("cmpxchg", "%[nval]", \ + __percpu_arg([var])) \ + "\n\tjnz 1b" \ + : [oval] "=&a" (pxo_old__), \ + [var] "+m" (_var) \ + : [nval] __pcpu_reg_##size(, pxo_new__) \ + : "memory"); \ + (typeof(_var))(unsigned long) pxo_old__; \ }) /* * cmpxchg has no such implied lock semantics as a result it is much * more efficient for cpu local operations. 
*/ -#define percpu_cmpxchg_op(qual, var, oval, nval) \ +#define percpu_cmpxchg_op(size, qual, _var, _oval, _nval) \ ({ \ - typeof(var) pco_ret__; \ - typeof(var) pco_old__ = (oval); \ - typeof(var) pco_new__ = (nval); \ - switch (sizeof(var)) { \ - case 1: \ - asm qual ("cmpxchgb %2, "__percpu_arg(1) \ - : "=a" (pco_ret__), "+m" (var) \ - : "q" (pco_new__), "0" (pco_old__) \ - : "memory"); \ - break; \ - case 2: \ - asm qual ("cmpxchgw %2, "__percpu_arg(1) \ - : "=a" (pco_ret__), "+m" (var) \ - : "r" (pco_new__), "0" (pco_old__) \ - : "memory"); \ - break; \ - case 4: \ - asm qual ("cmpxchgl %2, "__percpu_arg(1) \ - : "=a" (pco_ret__), "+m" (var) \ - : "r" (pco_new__), "0" (pco_old__) \ - : "memory"); \ - break; \ - case 8: \ - asm qual ("cmpxchgq %2, "__percpu_arg(1) \ - : "=a" (pco_ret__), "+m" (var) \ - : "r" (pco_new__), "0" (pco_old__) \ - : "memory"); \ - break; \ - default: __bad_percpu_size(); \ - } \ - pco_ret__; \ + __pcpu_type_##size pco_old__ = __pcpu_cast_##size(_oval); \ + __pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \ + asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \ + __percpu_arg([var])) \ + : [oval] "+a" (pco_old__), \ + [var] "+m" (_var) \ + : [nval] __pcpu_reg_##size(, pco_new__) \ + : "memory"); \ + (typeof(_var))(unsigned long) pco_old__; \ }) /* @@ -389,24 +219,28 @@ do { \ * per-thread variables implemented as per-cpu variables and thus * stable for the duration of the respective task. */ -#define this_cpu_read_stable(var) percpu_stable_op("mov", var) - -#define raw_cpu_read_1(pcp) percpu_from_op(, "mov", pcp) -#define raw_cpu_read_2(pcp) percpu_from_op(, "mov", pcp) -#define raw_cpu_read_4(pcp) percpu_from_op(, "mov", pcp) - -#define raw_cpu_write_1(pcp, val) percpu_to_op(, "mov", (pcp), val) -#define raw_cpu_write_2(pcp, val) percpu_to_op(, "mov", (pcp), val) -#define raw_cpu_write_4(pcp, val) percpu_to_op(, "mov", (pcp), val) -#define raw_cpu_add_1(pcp, val) percpu_add_op(, (pcp), val) -#define raw_cpu_add_2(pcp, val) percpu_add_op(, (pcp), val) -#define raw_cpu_add_4(pcp, val) percpu_add_op(, (pcp), val) -#define raw_cpu_and_1(pcp, val) percpu_to_op(, "and", (pcp), val) -#define raw_cpu_and_2(pcp, val) percpu_to_op(, "and", (pcp), val) -#define raw_cpu_and_4(pcp, val) percpu_to_op(, "and", (pcp), val) -#define raw_cpu_or_1(pcp, val) percpu_to_op(, "or", (pcp), val) -#define raw_cpu_or_2(pcp, val) percpu_to_op(, "or", (pcp), val) -#define raw_cpu_or_4(pcp, val) percpu_to_op(, "or", (pcp), val) +#define this_cpu_read_stable_1(pcp) percpu_stable_op(1, "mov", pcp) +#define this_cpu_read_stable_2(pcp) percpu_stable_op(2, "mov", pcp) +#define this_cpu_read_stable_4(pcp) percpu_stable_op(4, "mov", pcp) +#define this_cpu_read_stable_8(pcp) percpu_stable_op(8, "mov", pcp) +#define this_cpu_read_stable(pcp) __pcpu_size_call_return(this_cpu_read_stable_, pcp) + +#define raw_cpu_read_1(pcp) percpu_from_op(1, , "mov", pcp) +#define raw_cpu_read_2(pcp) percpu_from_op(2, , "mov", pcp) +#define raw_cpu_read_4(pcp) percpu_from_op(4, , "mov", pcp) + +#define raw_cpu_write_1(pcp, val) percpu_to_op(1, , "mov", (pcp), val) +#define raw_cpu_write_2(pcp, val) percpu_to_op(2, , "mov", (pcp), val) +#define raw_cpu_write_4(pcp, val) percpu_to_op(4, , "mov", (pcp), val) +#define raw_cpu_add_1(pcp, val) percpu_add_op(1, , (pcp), val) +#define raw_cpu_add_2(pcp, val) percpu_add_op(2, , (pcp), val) +#define raw_cpu_add_4(pcp, val) percpu_add_op(4, , (pcp), val) +#define raw_cpu_and_1(pcp, val) percpu_to_op(1, , "and", (pcp), val) +#define raw_cpu_and_2(pcp, val) 
percpu_to_op(2, , "and", (pcp), val) +#define raw_cpu_and_4(pcp, val) percpu_to_op(4, , "and", (pcp), val) +#define raw_cpu_or_1(pcp, val) percpu_to_op(1, , "or", (pcp), val) +#define raw_cpu_or_2(pcp, val) percpu_to_op(2, , "or", (pcp), val) +#define raw_cpu_or_4(pcp, val) percpu_to_op(4, , "or", (pcp), val) /* * raw_cpu_xchg() can use a load-store since it is not required to be @@ -423,38 +257,38 @@ do { \ #define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val) #define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val) -#define this_cpu_read_1(pcp) percpu_from_op(volatile, "mov", pcp) -#define this_cpu_read_2(pcp) percpu_from_op(volatile, "mov", pcp) -#define this_cpu_read_4(pcp) percpu_from_op(volatile, "mov", pcp) -#define this_cpu_write_1(pcp, val) percpu_to_op(volatile, "mov", (pcp), val) -#define this_cpu_write_2(pcp, val) percpu_to_op(volatile, "mov", (pcp), val) -#define this_cpu_write_4(pcp, val) percpu_to_op(volatile, "mov", (pcp), val) -#define this_cpu_add_1(pcp, val) percpu_add_op(volatile, (pcp), val) -#define this_cpu_add_2(pcp, val) percpu_add_op(volatile, (pcp), val) -#define this_cpu_add_4(pcp, val) percpu_add_op(volatile, (pcp), val) -#define this_cpu_and_1(pcp, val) percpu_to_op(volatile, "and", (pcp), val) -#define this_cpu_and_2(pcp, val) percpu_to_op(volatile, "and", (pcp), val) -#define this_cpu_and_4(pcp, val) percpu_to_op(volatile, "and", (pcp), val) -#define this_cpu_or_1(pcp, val) percpu_to_op(volatile, "or", (pcp), val) -#define this_cpu_or_2(pcp, val) percpu_to_op(volatile, "or", (pcp), val) -#define this_cpu_or_4(pcp, val) percpu_to_op(volatile, "or", (pcp), val) -#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(volatile, pcp, nval) -#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(volatile, pcp, nval) -#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(volatile, pcp, nval) - -#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(, pcp, val) -#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(, pcp, val) -#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(, pcp, val) -#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval) -#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval) -#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval) - -#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(volatile, pcp, val) -#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(volatile, pcp, val) -#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(volatile, pcp, val) -#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval) -#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval) -#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval) +#define this_cpu_read_1(pcp) percpu_from_op(1, volatile, "mov", pcp) +#define this_cpu_read_2(pcp) percpu_from_op(2, volatile, "mov", pcp) +#define this_cpu_read_4(pcp) percpu_from_op(4, volatile, "mov", pcp) +#define this_cpu_write_1(pcp, val) percpu_to_op(1, volatile, "mov", (pcp), val) +#define this_cpu_write_2(pcp, val) percpu_to_op(2, volatile, "mov", (pcp), val) +#define this_cpu_write_4(pcp, val) percpu_to_op(4, volatile, "mov", (pcp), val) +#define this_cpu_add_1(pcp, val) percpu_add_op(1, volatile, (pcp), val) +#define this_cpu_add_2(pcp, val) percpu_add_op(2, volatile, (pcp), val) +#define this_cpu_add_4(pcp, val) percpu_add_op(4, volatile, (pcp), val) +#define this_cpu_and_1(pcp, val) percpu_to_op(1, 
volatile, "and", (pcp), val) +#define this_cpu_and_2(pcp, val) percpu_to_op(2, volatile, "and", (pcp), val) +#define this_cpu_and_4(pcp, val) percpu_to_op(4, volatile, "and", (pcp), val) +#define this_cpu_or_1(pcp, val) percpu_to_op(1, volatile, "or", (pcp), val) +#define this_cpu_or_2(pcp, val) percpu_to_op(2, volatile, "or", (pcp), val) +#define this_cpu_or_4(pcp, val) percpu_to_op(4, volatile, "or", (pcp), val) +#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(1, volatile, pcp, nval) +#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(2, volatile, pcp, nval) +#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(4, volatile, pcp, nval) + +#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(1, , pcp, val) +#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(2, , pcp, val) +#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(4, , pcp, val) +#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, , pcp, oval, nval) +#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, , pcp, oval, nval) +#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, , pcp, oval, nval) + +#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(1, volatile, pcp, val) +#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(2, volatile, pcp, val) +#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(4, volatile, pcp, val) +#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, volatile, pcp, oval, nval) +#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, volatile, pcp, oval, nval) +#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, volatile, pcp, oval, nval) #ifdef CONFIG_X86_CMPXCHG64 #define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2) \ @@ -478,23 +312,23 @@ do { \ * 32 bit must fall back to generic operations. 
*/ #ifdef CONFIG_X86_64 -#define raw_cpu_read_8(pcp) percpu_from_op(, "mov", pcp) -#define raw_cpu_write_8(pcp, val) percpu_to_op(, "mov", (pcp), val) -#define raw_cpu_add_8(pcp, val) percpu_add_op(, (pcp), val) -#define raw_cpu_and_8(pcp, val) percpu_to_op(, "and", (pcp), val) -#define raw_cpu_or_8(pcp, val) percpu_to_op(, "or", (pcp), val) -#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(, pcp, val) +#define raw_cpu_read_8(pcp) percpu_from_op(8, , "mov", pcp) +#define raw_cpu_write_8(pcp, val) percpu_to_op(8, , "mov", (pcp), val) +#define raw_cpu_add_8(pcp, val) percpu_add_op(8, , (pcp), val) +#define raw_cpu_and_8(pcp, val) percpu_to_op(8, , "and", (pcp), val) +#define raw_cpu_or_8(pcp, val) percpu_to_op(8, , "or", (pcp), val) +#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(8, , pcp, val) #define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval) -#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval) - -#define this_cpu_read_8(pcp) percpu_from_op(volatile, "mov", pcp) -#define this_cpu_write_8(pcp, val) percpu_to_op(volatile, "mov", (pcp), val) -#define this_cpu_add_8(pcp, val) percpu_add_op(volatile, (pcp), val) -#define this_cpu_and_8(pcp, val) percpu_to_op(volatile, "and", (pcp), val) -#define this_cpu_or_8(pcp, val) percpu_to_op(volatile, "or", (pcp), val) -#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(volatile, pcp, val) -#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(volatile, pcp, nval) -#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval) +#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval) + +#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp) +#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val) +#define this_cpu_add_8(pcp, val) percpu_add_op(8, volatile, (pcp), val) +#define this_cpu_and_8(pcp, val) percpu_to_op(8, volatile, "and", (pcp), val) +#define this_cpu_or_8(pcp, val) percpu_to_op(8, volatile, "or", (pcp), val) +#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val) +#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval) +#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval) /* * Pretty complex macro to generate cmpxchg16 instruction. 
The instruction diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index e855e9cf2c37..0c1b13720525 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -142,6 +142,46 @@ union cpuid10_edx { unsigned int full; }; +/* + * Intel Architectural LBR CPUID detection/enumeration details: + */ +union cpuid28_eax { + struct { + /* Supported LBR depth values */ + unsigned int lbr_depth_mask:8; + unsigned int reserved:22; + /* Deep C-state Reset */ + unsigned int lbr_deep_c_reset:1; + /* IP values contain LIP */ + unsigned int lbr_lip:1; + } split; + unsigned int full; +}; + +union cpuid28_ebx { + struct { + /* CPL Filtering Supported */ + unsigned int lbr_cpl:1; + /* Branch Filtering Supported */ + unsigned int lbr_filter:1; + /* Call-stack Mode Supported */ + unsigned int lbr_call_stack:1; + } split; + unsigned int full; +}; + +union cpuid28_ecx { + struct { + /* Mispredict Bit Supported */ + unsigned int lbr_mispred:1; + /* Timed LBRs Supported */ + unsigned int lbr_timed_lbr:1; + /* Branch Type Field Supported */ + unsigned int lbr_br_type:1; + } split; + unsigned int full; +}; + struct x86_pmu_capability { int version; int num_counters_gp; @@ -192,10 +232,30 @@ struct x86_pmu_capability { #define GLOBAL_STATUS_UNC_OVF BIT_ULL(61) #define GLOBAL_STATUS_ASIF BIT_ULL(60) #define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59) -#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58) +#define GLOBAL_STATUS_LBRS_FROZEN_BIT 58 +#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT) #define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(55) /* + * We model guest LBR event tracing as another fixed-mode PMC like BTS. + * + * We choose bit 58 because it's used to indicate LBR stack frozen state + * for architectural perfmon v4, also we unconditionally mask that bit in + * the handle_pmi_common(), so it'll never be set in the overflow handling. + * + * With this fake counter assigned, the guest LBR event user (such as KVM), + * can program the LBR registers on its own, and we don't actually do anything + * with then in the host context. + */ +#define INTEL_PMC_IDX_FIXED_VLBR (GLOBAL_STATUS_LBRS_FROZEN_BIT) + +/* + * Pseudo-encoding the guest LBR event as event=0x00,umask=0x1b, + * since it would claim bit 58 which is effectively Fixed26. 
+ */ +#define INTEL_FIXED_VLBR_EVENT 0x1b00 + +/* * Adaptive PEBS v4 */ @@ -222,14 +282,6 @@ struct pebs_xmm { u64 xmm[16*2]; /* two entries for each register */ }; -struct pebs_lbr_entry { - u64 from, to, info; -}; - -struct pebs_lbr { - struct pebs_lbr_entry lbr[0]; /* Variable length */ -}; - /* * IBS cpuid feature detection */ @@ -333,6 +385,13 @@ struct perf_guest_switch_msr { u64 host, guest; }; +struct x86_pmu_lbr { + unsigned int nr; + unsigned int from; + unsigned int to; + unsigned int info; +}; + extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); extern void perf_check_microcode(void); extern int x86_perf_rdpmc_index(struct perf_event *event); @@ -348,12 +407,17 @@ static inline void perf_check_microcode(void) { } #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); +extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr); #else static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) { *nr = 0; return NULL; } +static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) +{ + return -1; +} #endif #ifdef CONFIG_CPU_SUP_INTEL diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index 29aa7859bdee..62ad61d6fefc 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -7,7 +7,8 @@ #include <linux/pagemap.h> #define __HAVE_ARCH_PTE_ALLOC_ONE -#include <asm-generic/pgalloc.h> /* for pte_{alloc,free}_one */ +#define __HAVE_ARCH_PGD_FREE +#include <asm-generic/pgalloc.h> static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; } @@ -86,30 +87,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, #define pmd_pgtable(pmd) pmd_page(pmd) #if CONFIG_PGTABLE_LEVELS > 2 -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - struct page *page; - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO; - - if (mm == &init_mm) - gfp &= ~__GFP_ACCOUNT; - page = alloc_pages(gfp, 0); - if (!page) - return NULL; - if (!pgtable_pmd_page_ctor(page)) { - __free_pages(page, 0); - return NULL; - } - return (pmd_t *)page_address(page); -} - -static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) -{ - BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); - pgtable_pmd_page_dtor(virt_to_page(pmd)); - free_page((unsigned long)pmd); -} - extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, @@ -147,21 +124,6 @@ static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, pud_t *pu set_p4d_safe(p4d, __p4d(_PAGE_TABLE | __pa(pud))); } -static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) -{ - gfp_t gfp = GFP_KERNEL_ACCOUNT; - - if (mm == &init_mm) - gfp &= ~__GFP_ACCOUNT; - return (pud_t *)get_zeroed_page(gfp); -} - -static inline void pud_free(struct mm_struct *mm, pud_t *pud) -{ - BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); - free_page((unsigned long)pud); -} - extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 76aa21e8128d..b836138ce852 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -999,15 +999,12 @@ extern int direct_gbpages; void init_mem_mapping(void); void early_alloc_pgt_buf(void); extern void memblock_find_dma_reserve(void); - - -#ifdef CONFIG_X86_64 -extern pgd_t 
trampoline_pgd_entry; - void __init poking_init(void); - unsigned long init_memory_mapping(unsigned long start, unsigned long end, pgprot_t prot); + +#ifdef CONFIG_X86_64 +extern pgd_t trampoline_pgd_entry; #endif /* local pte updates need not use xchg for locking */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 1b68d24dc6a0..56d0399a0cd1 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -168,23 +168,18 @@ static inline void native_pgd_clear(pgd_t *pgd) native_set_pgd(pgd, native_make_pgd(0)); } -extern void sync_global_pgds(unsigned long start, unsigned long end); - /* * Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. */ -/* - * Level 4 access. - */ -#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) +/* PGD - Level 4 access */ -/* PUD - Level3 access */ +/* PUD - Level 3 access */ -/* PMD - Level 2 access */ +/* PMD - Level 2 access */ -/* PTE - Level 1 access. */ +/* PTE - Level 1 access */ /* * Encode and de-code a swap entry diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 03b7c4ca425a..97143d87994c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -457,10 +457,8 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu) DECLARE_PER_CPU(unsigned int, irq_count); extern asmlinkage void ignore_sysret(void); -#if IS_ENABLED(CONFIG_KVM) /* Save actual FS/GS selectors and bases to current->thread */ -void save_fsgs_for_kvm(void); -#endif +void current_save_fsgs(void); #else /* X86_64 */ #ifdef CONFIG_STACKPROTECTOR /* @@ -575,7 +573,7 @@ native_load_sp0(unsigned long sp0) this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } -static inline void native_swapgs(void) +static __always_inline void native_swapgs(void) { #ifdef CONFIG_X86_64 asm volatile("swapgs" ::: "memory"); @@ -678,70 +676,6 @@ static inline unsigned int cpuid_edx(unsigned int op) return edx; } -/* - * This function forces the icache and prefetched instruction stream to - * catch up with reality in two very specific cases: - * - * a) Text was modified using one virtual address and is about to be executed - * from the same physical page at a different virtual address. - * - * b) Text was modified on a different CPU, may subsequently be - * executed on this CPU, and you want to make sure the new version - * gets executed. This generally means you're calling this in a IPI. - * - * If you're calling this for a different reason, you're probably doing - * it wrong. - */ -static inline void sync_core(void) -{ - /* - * There are quite a few ways to do this. IRET-to-self is nice - * because it works on every CPU, at any CPL (so it's compatible - * with paravirtualization), and it never exits to a hypervisor. - * The only down sides are that it's a bit slow (it seems to be - * a bit more than 2x slower than the fastest options) and that - * it unmasks NMIs. The "push %cs" is needed because, in - * paravirtual environments, __KERNEL_CS may not be a valid CS - * value when we do IRET directly. - * - * In case NMI unmasking or performance ever becomes a problem, - * the next best option appears to be MOV-to-CR2 and an - * unconditional jump. That sequence also works on all CPUs, - * but it will fault at CPL3 (i.e. Xen PV). - * - * CPUID is the conventional way, but it's nasty: it doesn't - * exist on some 486-like CPUs, and it usually exits to a - * hypervisor. 
- * - * Like all of Linux's memory ordering operations, this is a - * compiler barrier as well. - */ -#ifdef CONFIG_X86_32 - asm volatile ( - "pushfl\n\t" - "pushl %%cs\n\t" - "pushl $1f\n\t" - "iret\n\t" - "1:" - : ASM_CALL_CONSTRAINT : : "memory"); -#else - unsigned int tmp; - - asm volatile ( - "mov %%ss, %0\n\t" - "pushq %q0\n\t" - "pushq %%rsp\n\t" - "addq $8, (%%rsp)\n\t" - "pushfq\n\t" - "mov %%cs, %0\n\t" - "pushq %q0\n\t" - "pushq $1f\n\t" - "iretq\n\t" - "1:" - : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory"); -#endif -} - extern void select_idle_routine(const struct cpuinfo_x86 *c); extern void amd_e400_c1e_apic_setup(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 255b2dde2c1b..40aa69d04862 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -209,6 +209,11 @@ static inline void user_stack_pointer_set(struct pt_regs *regs, regs->sp = val; } +static __always_inline bool regs_irqs_disabled(struct pt_regs *regs) +{ + return !(regs->flags & X86_EFLAGS_IF); +} + /* Query offset/name of register from its name/offset */ extern int regs_query_register_offset(const char *name); extern const char *regs_query_register_name(unsigned int offset); diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index 444d6fd9a6d8..d86ab942219c 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -32,6 +32,7 @@ extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); extern void __pv_init_lock_hash(void); extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock); +extern bool nopvspin; #define queued_spin_unlock queued_spin_unlock /** diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 33d3c88a7225..6fd8410a3910 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -35,7 +35,6 @@ typedef sigset_t compat_sigset_t; #endif /* __ASSEMBLY__ */ #include <uapi/asm/signal.h> #ifndef __ASSEMBLY__ -extern void do_signal(struct pt_regs *regs); #define __ARCH_HAS_SA_RESTORER diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index e15f364efbcc..c0538f82c9a2 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -5,16 +5,6 @@ #include <linux/cpumask.h> #include <asm/percpu.h> -/* - * We need the APIC definitions automatically as part of 'smp.h' - */ -#ifdef CONFIG_X86_LOCAL_APIC -# include <asm/mpspec.h> -# include <asm/apic.h> -# ifdef CONFIG_X86_IO_APIC -# include <asm/io_apic.h> -# endif -#endif #include <asm/thread_info.h> #include <asm/cpumask.h> diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 199218719a86..6bfc878f6771 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -10,24 +10,20 @@ * field of the struct page * * SECTION_SIZE_BITS 2^n: size of each section - * MAX_PHYSADDR_BITS 2^n: max size of physical address space - * MAX_PHYSMEM_BITS 2^n: how much memory we can have in that space + * MAX_PHYSMEM_BITS 2^n: max size of physical address space * */ #ifdef CONFIG_X86_32 # ifdef CONFIG_X86_PAE # define SECTION_SIZE_BITS 29 -# define MAX_PHYSADDR_BITS 36 # define MAX_PHYSMEM_BITS 36 # else # define SECTION_SIZE_BITS 26 -# define MAX_PHYSADDR_BITS 32 # define MAX_PHYSMEM_BITS 32 # endif #else /* CONFIG_X86_32 */ # define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ -# define 
MAX_PHYSADDR_BITS (pgtable_l5_enabled() ? 52 : 44) # define MAX_PHYSMEM_BITS (pgtable_l5_enabled() ? 52 : 46) #endif diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index eb8e781c4353..59a3e13204c3 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -234,7 +234,6 @@ static inline void clwb(volatile void *__p) #define nop() asm volatile ("nop") - #endif /* __KERNEL__ */ #endif /* _ASM_X86_SPECIAL_INSNS_H */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 9804a7957f4e..7fb482f0f25b 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -90,6 +90,15 @@ static __always_inline void boot_init_stack_canary(void) #endif } +static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle) +{ +#ifdef CONFIG_X86_64 + per_cpu(fixed_percpu_data.stack_canary, cpu) = idle->stack_canary; +#else + per_cpu(stack_canary.canary, cpu) = idle->stack_canary; +#endif +} + static inline void setup_stack_canary_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -119,6 +128,9 @@ static inline void load_stack_canary_segment(void) static inline void setup_stack_canary_segment(int cpu) { } +static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle) +{ } + static inline void load_stack_canary_segment(void) { #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h index c67caafd3381..fdb5b356e59b 100644 --- a/arch/x86/include/asm/sync_core.h +++ b/arch/x86/include/asm/sync_core.h @@ -6,6 +6,78 @@ #include <asm/processor.h> #include <asm/cpufeature.h> +#ifdef CONFIG_X86_32 +static inline void iret_to_self(void) +{ + asm volatile ( + "pushfl\n\t" + "pushl %%cs\n\t" + "pushl $1f\n\t" + "iret\n\t" + "1:" + : ASM_CALL_CONSTRAINT : : "memory"); +} +#else +static inline void iret_to_self(void) +{ + unsigned int tmp; + + asm volatile ( + "mov %%ss, %0\n\t" + "pushq %q0\n\t" + "pushq %%rsp\n\t" + "addq $8, (%%rsp)\n\t" + "pushfq\n\t" + "mov %%cs, %0\n\t" + "pushq %q0\n\t" + "pushq $1f\n\t" + "iretq\n\t" + "1:" + : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory"); +} +#endif /* CONFIG_X86_32 */ + +/* + * This function forces the icache and prefetched instruction stream to + * catch up with reality in two very specific cases: + * + * a) Text was modified using one virtual address and is about to be executed + * from the same physical page at a different virtual address. + * + * b) Text was modified on a different CPU, may subsequently be + * executed on this CPU, and you want to make sure the new version + * gets executed. This generally means you're calling this in a IPI. + * + * If you're calling this for a different reason, you're probably doing + * it wrong. + */ +static inline void sync_core(void) +{ + /* + * There are quite a few ways to do this. IRET-to-self is nice + * because it works on every CPU, at any CPL (so it's compatible + * with paravirtualization), and it never exits to a hypervisor. + * The only down sides are that it's a bit slow (it seems to be + * a bit more than 2x slower than the fastest options) and that + * it unmasks NMIs. The "push %cs" is needed because, in + * paravirtual environments, __KERNEL_CS may not be a valid CS + * value when we do IRET directly. + * + * In case NMI unmasking or performance ever becomes a problem, + * the next best option appears to be MOV-to-CR2 and an + * unconditional jump. 
That sequence also works on all CPUs, + * but it will fault at CPL3 (i.e. Xen PV). + * + * CPUID is the conventional way, but it's nasty: it doesn't + * exist on some 486-like CPUs, and it usually exits to a + * hypervisor. + * + * Like all of Linux's memory ordering operations, this is a + * compiler barrier as well. + */ + iret_to_self(); +} + /* * Ensure that a core serializing instruction is issued before returning * to user-mode. x86 implements return to user-space through sysexit, diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 8de8ceccb8bc..267701ae3d86 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -133,11 +133,6 @@ struct thread_info { #define _TIF_X32 (1 << TIF_X32) #define _TIF_FSCHECK (1 << TIF_FSCHECK) -/* Work to do before invoking the actual syscall. */ -#define _TIF_WORK_SYSCALL_ENTRY \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ - _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT) - /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW_BASE \ (_TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP | \ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 79d8d5496330..f4234575f3fd 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -193,7 +193,7 @@ static inline void sched_clear_itmt_support(void) } #endif /* CONFIG_SCHED_MC_PRIO */ -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_X86_64) #include <asm/cpufeature.h> DECLARE_STATIC_KEY_FALSE(arch_scale_freq_key); diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 8a0c25c6bf09..01a300a9700b 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -6,9 +6,7 @@ #define _ASM_X86_TSC_H #include <asm/processor.h> - -#define NS_SCALE 10 /* 2^10, carefully chosen */ -#define US_SCALE 32 /* 2^32, arbitralrily chosen */ +#include <asm/cpufeature.h> /* * Standard way to access the cycle counter. 
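[Editor's note — illustrative sketch, not part of this patch.] The sync_core() comment moved into <asm/sync_core.h> above explains that serializing after cross-CPU text modification "generally means you're calling this in an IPI". As a minimal, hedged sketch of that usage pattern (the wrapper names below are hypothetical; only sync_core() from this header and the generic on_each_cpu() SMP helper are assumed), a caller that has just patched kernel text could broadcast the serialization like this:

#include <linux/smp.h>
#include <asm/sync_core.h>

/* IPI callback: serialize the instruction stream on the local CPU. */
static void do_sync_core_ipi(void *unused)
{
	sync_core();
}

/* After modifying kernel text, make every CPU resync its icache/prefetch. */
static void sync_core_all_cpus(void)
{
	/* Run the callback on all CPUs (including this one) and wait for completion. */
	on_each_cpu(do_sync_core_ipi, NULL, 1);
}

This mirrors the intent described in the comment: the IRET-to-self in sync_core() only serializes the CPU it runs on, so reaching the other CPUs requires an IPI broadcast such as on_each_cpu().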
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 18dfa07d3ef0..ecefaffd15d4 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -33,7 +33,7 @@ static inline void set_fs(mm_segment_t fs) set_thread_flag(TIF_FSCHECK); } -#define segment_eq(a, b) ((a).seg == (b).seg) +#define uaccess_kernel() (get_fs().seg == KERNEL_DS.seg) #define user_addr_max() (current->thread.addr_limit.seg) /* @@ -314,11 +314,14 @@ do { \ #define __get_user_size(x, ptr, size, retval) \ do { \ + unsigned char x_u8__; \ + \ retval = 0; \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ - __get_user_asm(x, ptr, retval, "b", "=q"); \ + __get_user_asm(x_u8__, ptr, retval, "b", "=q"); \ + (x) = x_u8__; \ break; \ case 2: \ __get_user_asm(x, ptr, retval, "w", "=r"); \ diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 2fcc3ac12e76..70050d0136c3 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -72,7 +72,7 @@ struct uv_gam_range_entry { }; #define UV_SYSTAB_SIG "UVST" -#define UV_SYSTAB_VERSION_1 1 /* UV1/2/3 BIOS version */ +#define UV_SYSTAB_VERSION_1 1 /* UV2/3 BIOS version */ #define UV_SYSTAB_VERSION_UV4 0x400 /* UV4 BIOS base version */ #define UV_SYSTAB_VERSION_UV4_1 0x401 /* + gpa_shift */ #define UV_SYSTAB_VERSION_UV4_2 0x402 /* + TYPE_NVRAM/WINDOW/MBOX */ diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index 3db85626048f..e48aea9ba47d 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -4,7 +4,7 @@ #include <asm/tlbflush.h> -enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; +enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC}; struct cpumask; struct mm_struct; diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index f1188bd47658..cd24804955d7 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -46,10 +46,7 @@ #define UV_ACT_STATUS_SIZE 2 #define UV_DISTRIBUTION_SIZE 256 #define UV_SW_ACK_NPENDING 8 -#define UV1_NET_ENDPOINT_INTD 0x38 -#define UV2_NET_ENDPOINT_INTD 0x28 -#define UV_NET_ENDPOINT_INTD (is_uv1_hub() ? \ - UV1_NET_ENDPOINT_INTD : UV2_NET_ENDPOINT_INTD) +#define UV_NET_ENDPOINT_INTD 0x28 #define UV_PAYLOADQ_GNODE_SHIFT 49 #define UV_PTC_BASENAME "sgi_uv/ptc_statistics" #define UV_BAU_BASENAME "sgi_uv/bau_tunables" @@ -64,14 +61,9 @@ * UV2: Bit 19 selects between * (0): 10 microsecond timebase and * (1): 80 microseconds - * we're using 560us, similar to UV1: 65 units of 10us + * we're using 560us */ -#define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL) -#define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (15UL) - -#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (is_uv1_hub() ? 
\ - UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD : \ - UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD) +#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (15UL) /* assuming UV3 is the same */ #define BAU_MISC_CONTROL_MULT_MASK 3 @@ -148,7 +140,6 @@ #define UV_LB_SUBNODEID 0x10 -/* these two are the same for UV1 and UV2: */ #define UV_SA_SHFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT #define UV_SA_MASK UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK /* 4 bits of software ack period */ @@ -189,8 +180,7 @@ #define BAU_DESC_QUALIFIER 0x534749 enum uv_bau_version { - UV_BAU_V1 = 1, - UV_BAU_V2, + UV_BAU_V2 = 2, UV_BAU_V3, UV_BAU_V4, }; @@ -233,12 +223,12 @@ struct bau_local_cpumask { */ /** - * struct uv1_2_3_bau_msg_payload - defines payload for INTD transactions + * struct uv2_3_bau_msg_payload - defines payload for INTD transactions * @address: Signifies a page or all TLB's of the cpu * @sending_cpu: CPU from which the message originates * @acknowledge_count: CPUs on the destination Hub that received the interrupt */ -struct uv1_2_3_bau_msg_payload { +struct uv2_3_bau_msg_payload { u64 address; u16 sending_cpu; u16 acknowledge_count; @@ -260,89 +250,6 @@ struct uv4_bau_msg_payload { }; /* - * UV1 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) - * see table 4.2.3.0.1 in broacast_assist spec. - */ -struct uv1_bau_msg_header { - unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ - /* bits 5:0 */ - unsigned int base_dest_nasid:15; /* nasid of the first bit */ - /* bits 20:6 */ /* in uvhub map */ - unsigned int command:8; /* message type */ - /* bits 28:21 */ - /* 0x38: SN3net EndPoint Message */ - unsigned int rsvd_1:3; /* must be zero */ - /* bits 31:29 */ - /* int will align on 32 bits */ - unsigned int rsvd_2:9; /* must be zero */ - /* bits 40:32 */ - /* Suppl_A is 56-41 */ - unsigned int sequence:16; /* message sequence number */ - /* bits 56:41 */ /* becomes bytes 16-17 of msg */ - /* Address field (96:57) is - never used as an address - (these are address bits - 42:3) */ - - unsigned int rsvd_3:1; /* must be zero */ - /* bit 57 */ - /* address bits 27:4 are payload */ - /* these next 24 (58-81) bits become bytes 12-14 of msg */ - /* bits 65:58 land in byte 12 */ - unsigned int replied_to:1; /* sent as 0 by the source to - byte 12 */ - /* bit 58 */ - unsigned int msg_type:3; /* software type of the - message */ - /* bits 61:59 */ - unsigned int canceled:1; /* message canceled, resource - is to be freed*/ - /* bit 62 */ - unsigned int payload_1a:1; /* not currently used */ - /* bit 63 */ - unsigned int payload_1b:2; /* not currently used */ - /* bits 65:64 */ - - /* bits 73:66 land in byte 13 */ - unsigned int payload_1ca:6; /* not currently used */ - /* bits 71:66 */ - unsigned int payload_1c:2; /* not currently used */ - /* bits 73:72 */ - - /* bits 81:74 land in byte 14 */ - unsigned int payload_1d:6; /* not currently used */ - /* bits 79:74 */ - unsigned int payload_1e:2; /* not currently used */ - /* bits 81:80 */ - - unsigned int rsvd_4:7; /* must be zero */ - /* bits 88:82 */ - unsigned int swack_flag:1; /* software acknowledge flag */ - /* bit 89 */ - /* INTD trasactions at - destination are to wait for - software acknowledge */ - unsigned int rsvd_5:6; /* must be zero */ - /* bits 95:90 */ - unsigned int rsvd_6:5; /* must be zero */ - /* bits 100:96 */ - unsigned int int_both:1; /* if 1, interrupt both sockets - on the uvhub */ - /* bit 101*/ - unsigned int fairness:3; /* usually zero */ - /* bits 104:102 */ - unsigned int multilevel:1; /* multi-level multicast 
- format */ - /* bit 105 */ - /* 0 for TLB: endpoint multi-unicast messages */ - unsigned int chaining:1; /* next descriptor is part of - this activation*/ - /* bit 106 */ - unsigned int rsvd_7:21; /* must be zero */ - /* bits 127:107 */ -}; - -/* * UV2 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) * see figure 9-2 of harp_sys.pdf * assuming UV3 is the same @@ -418,25 +325,14 @@ struct bau_desc { * message template, consisting of header and payload: */ union bau_msg_header { - struct uv1_bau_msg_header uv1_hdr; struct uv2_3_bau_msg_header uv2_3_hdr; } header; union bau_payload_header { - struct uv1_2_3_bau_msg_payload uv1_2_3; + struct uv2_3_bau_msg_payload uv2_3; struct uv4_bau_msg_payload uv4; } payload; }; -/* UV1: - * -payload-- ---------header------ - * bytes 0-11 bits 41-56 bits 58-81 - * A B (2) C (3) - * - * A/B/C are moved to: - * A C B - * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector) - * ------------payload queue----------- - */ /* UV2: * -payload-- ---------header------ * bytes 0-11 bits 70-78 bits 21-44 diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 60ca0afdeaf9..100d66806503 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -224,17 +224,11 @@ static inline struct uv_hub_info_s *uv_cpu_hub_info(int cpu) * This is a software convention - NOT the hardware revision numbers in * the hub chip. */ -#define UV1_HUB_REVISION_BASE 1 #define UV2_HUB_REVISION_BASE 3 #define UV3_HUB_REVISION_BASE 5 #define UV4_HUB_REVISION_BASE 7 #define UV4A_HUB_REVISION_BASE 8 /* UV4 (fixed) rev 2 */ -static inline int is_uv1_hub(void) -{ - return is_uv_hubbed(uv(1)); -} - static inline int is_uv2_hub(void) { return is_uv_hubbed(uv(2)); @@ -265,7 +259,7 @@ static inline int is_uvx_hub(void) static inline int is_uv_hub(void) { - return is_uv1_hub() || is_uvx_hub(); + return is_uvx_hub(); } union uvh_apicid { @@ -292,11 +286,6 @@ union uvh_apicid { #define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra) #define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1) -#define UV1_LOCAL_MMR_BASE 0xf4000000UL -#define UV1_GLOBAL_MMR32_BASE 0xf8000000UL -#define UV1_LOCAL_MMR_SIZE (64UL * 1024 * 1024) -#define UV1_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024) - #define UV2_LOCAL_MMR_BASE 0xfa000000UL #define UV2_GLOBAL_MMR32_BASE 0xfc000000UL #define UV2_LOCAL_MMR_SIZE (32UL * 1024 * 1024) @@ -313,25 +302,21 @@ union uvh_apicid { #define UV4_GLOBAL_MMR32_SIZE (16UL * 1024 * 1024) #define UV_LOCAL_MMR_BASE ( \ - is_uv1_hub() ? UV1_LOCAL_MMR_BASE : \ is_uv2_hub() ? UV2_LOCAL_MMR_BASE : \ is_uv3_hub() ? UV3_LOCAL_MMR_BASE : \ /*is_uv4_hub*/ UV4_LOCAL_MMR_BASE) #define UV_GLOBAL_MMR32_BASE ( \ - is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE : \ is_uv2_hub() ? UV2_GLOBAL_MMR32_BASE : \ is_uv3_hub() ? UV3_GLOBAL_MMR32_BASE : \ /*is_uv4_hub*/ UV4_GLOBAL_MMR32_BASE) #define UV_LOCAL_MMR_SIZE ( \ - is_uv1_hub() ? UV1_LOCAL_MMR_SIZE : \ is_uv2_hub() ? UV2_LOCAL_MMR_SIZE : \ is_uv3_hub() ? UV3_LOCAL_MMR_SIZE : \ /*is_uv4_hub*/ UV4_LOCAL_MMR_SIZE) #define UV_GLOBAL_MMR32_SIZE ( \ - is_uv1_hub() ? UV1_GLOBAL_MMR32_SIZE : \ is_uv2_hub() ? UV2_GLOBAL_MMR32_SIZE : \ is_uv3_hub() ? 
UV3_GLOBAL_MMR32_SIZE : \ /*is_uv4_hub*/ UV4_GLOBAL_MMR32_SIZE) @@ -352,8 +337,6 @@ union uvh_apicid { #define UVH_APICID 0x002D0E00L #define UV_APIC_PNODE_SHIFT 6 -#define UV_APICID_HIBIT_MASK 0xffff0000 - /* Local Bus from cpu's perspective */ #define LOCAL_BUS_BASE 0x1c00000 #define LOCAL_BUS_SIZE (4 * 1024 * 1024) @@ -560,15 +543,6 @@ static inline int uv_apicid_to_pnode(int apicid) return s2pn ? s2pn[pnode - uv_hub_info->min_socket] : pnode; } -/* Convert an apicid to the socket number on the blade */ -static inline int uv_apicid_to_socket(int apicid) -{ - if (is_uv1_hub()) - return (apicid >> (uv_hub_info->apic_pnode_shift - 1)) & 1; - else - return 0; -} - /* * Access global MMRs using the low memory MMR32 space. This region supports * faster MMR access but not all MMRs are accessible in this space. @@ -660,7 +634,7 @@ static inline int uv_cpu_blade_processor_id(int cpu) return uv_cpu_info_per(cpu)->blade_cpu_id; } -/* Blade number to Node number (UV1..UV4 is 1:1) */ +/* Blade number to Node number (UV2..UV4 is 1:1) */ static inline int uv_blade_to_node(int blade) { return blade; @@ -674,7 +648,7 @@ static inline int uv_numa_blade_id(void) /* * Convert linux node number to the UV blade number. - * .. Currently for UV1 thru UV4 the node and the blade are identical. + * .. Currently for UV2 thru UV4 the node and the blade are identical. * .. If this changes then you MUST check references to this function! */ static inline int uv_node_to_blade_id(int nid) @@ -682,7 +656,7 @@ static inline int uv_node_to_blade_id(int nid) return nid; } -/* Convert a cpu number to the the UV blade number */ +/* Convert a CPU number to the UV blade number */ static inline int uv_cpu_to_blade_id(int cpu) { return uv_node_to_blade_id(cpu_to_node(cpu)); @@ -821,8 +795,6 @@ static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) } } -extern unsigned int uv_apicid_hibits; - /* * Get the minimum revision number of the hub chips within the partition. * (See UVx_HUB_REVISION_BASE above for specific values.) diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index 9ee5ed6e8b34..775bf143a072 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -19,7 +19,6 @@ * * UVH - definitions common to all UV hub types. * UVXH - definitions common to all UV eXtended hub types (currently 2, 3, 4). - * UV1H - definitions specific to UV type 1 hub. * UV2H - definitions specific to UV type 2 hub. * UV3H - definitions specific to UV type 3 hub. * UV4H - definitions specific to UV type 4 hub. @@ -35,19 +34,6 @@ * * If the MMR exists on all hub types but have different addresses, * use a conditional operator to define the value at runtime. - * #define UV1Hxxx a - * #define UV2Hxxx b - * #define UV3Hxxx c - * #define UV4Hxxx d - * #define UV4AHxxx e - * #define UVHxxx (is_uv1_hub() ? UV1Hxxx : - * (is_uv2_hub() ? UV2Hxxx : - * (is_uv3_hub() ? UV3Hxxx : - * (is_uv4a_hub() ? UV4AHxxx : - * UV4Hxxx)) - * - * If the MMR exists on all hub types > 1 but have different addresses, the - * variation using "UVX" as the prefix exists. 
* #define UV2Hxxx b * #define UV3Hxxx c * #define UV4Hxxx d @@ -61,8 +47,6 @@ * unsigned long v; * struct uvh_xxx_s { # Common fields only * } s; - * struct uv1h_xxx_s { # Full UV1 definition (*) - * } s1; * struct uv2h_xxx_s { # Full UV2 definition (*) * } s2; * struct uv3h_xxx_s { # Full UV3 definition (*) @@ -92,7 +76,6 @@ #define UV_MMR_ENABLE (1UL << 63) -#define UV1_HUB_PART_NUMBER 0x88a5 #define UV2_HUB_PART_NUMBER 0x8eb8 #define UV2_HUB_PART_NUMBER_X 0x1111 #define UV3_HUB_PART_NUMBER 0x9578 @@ -107,12 +90,10 @@ extern unsigned long uv_undefined(char *str); /* ========================================================================= */ #define UVH_BAU_DATA_BROADCAST 0x61688UL -#define UV1H_BAU_DATA_BROADCAST_32 0x440 #define UV2H_BAU_DATA_BROADCAST_32 0x440 #define UV3H_BAU_DATA_BROADCAST_32 0x440 #define UV4H_BAU_DATA_BROADCAST_32 0x360 #define UVH_BAU_DATA_BROADCAST_32 ( \ - is_uv1_hub() ? UV1H_BAU_DATA_BROADCAST_32 : \ is_uv2_hub() ? UV2H_BAU_DATA_BROADCAST_32 : \ is_uv3_hub() ? UV3H_BAU_DATA_BROADCAST_32 : \ /*is_uv4_hub*/ UV4H_BAU_DATA_BROADCAST_32) @@ -134,12 +115,10 @@ union uvh_bau_data_broadcast_u { /* ========================================================================= */ #define UVH_BAU_DATA_CONFIG 0x61680UL -#define UV1H_BAU_DATA_CONFIG_32 0x438 #define UV2H_BAU_DATA_CONFIG_32 0x438 #define UV3H_BAU_DATA_CONFIG_32 0x438 #define UV4H_BAU_DATA_CONFIG_32 0x358 #define UVH_BAU_DATA_CONFIG_32 ( \ - is_uv1_hub() ? UV1H_BAU_DATA_CONFIG_32 : \ is_uv2_hub() ? UV2H_BAU_DATA_CONFIG_32 : \ is_uv3_hub() ? UV3H_BAU_DATA_CONFIG_32 : \ /*is_uv4_hub*/ UV4H_BAU_DATA_CONFIG_32) @@ -189,117 +168,6 @@ union uvh_bau_data_config_u { #define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL #define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL -#define UV1H_EVENT_OCCURRED0_GR0_HCERR_SHFT 1 -#define UV1H_EVENT_OCCURRED0_GR1_HCERR_SHFT 2 -#define UV1H_EVENT_OCCURRED0_LH_HCERR_SHFT 3 -#define UV1H_EVENT_OCCURRED0_RH_HCERR_SHFT 4 -#define UV1H_EVENT_OCCURRED0_XN_HCERR_SHFT 5 -#define UV1H_EVENT_OCCURRED0_SI_HCERR_SHFT 6 -#define UV1H_EVENT_OCCURRED0_LB_AOERR0_SHFT 7 -#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8 -#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9 -#define UV1H_EVENT_OCCURRED0_LH_AOERR0_SHFT 10 -#define UV1H_EVENT_OCCURRED0_XN_AOERR0_SHFT 12 -#define UV1H_EVENT_OCCURRED0_SI_AOERR0_SHFT 13 -#define UV1H_EVENT_OCCURRED0_LB_AOERR1_SHFT 14 -#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15 -#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16 -#define UV1H_EVENT_OCCURRED0_LH_AOERR1_SHFT 17 -#define UV1H_EVENT_OCCURRED0_RH_AOERR1_SHFT 18 -#define UV1H_EVENT_OCCURRED0_XN_AOERR1_SHFT 19 -#define UV1H_EVENT_OCCURRED0_SI_AOERR1_SHFT 20 -#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21 -#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36 -#define 
UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37 -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38 -#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39 -#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40 -#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41 -#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42 -#define UV1H_EVENT_OCCURRED0_LTC_INT_SHFT 43 -#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44 -#define UV1H_EVENT_OCCURRED0_IPI_INT_SHFT 45 -#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46 -#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47 -#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48 -#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49 -#define UV1H_EVENT_OCCURRED0_PROFILE_INT_SHFT 50 -#define UV1H_EVENT_OCCURRED0_RTC0_SHFT 51 -#define UV1H_EVENT_OCCURRED0_RTC1_SHFT 52 -#define UV1H_EVENT_OCCURRED0_RTC2_SHFT 53 -#define UV1H_EVENT_OCCURRED0_RTC3_SHFT 54 -#define UV1H_EVENT_OCCURRED0_BAU_DATA_SHFT 55 -#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56 -#define UV1H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL -#define UV1H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL -#define UV1H_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL -#define UV1H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL -#define UV1H_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL -#define UV1H_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL -#define UV1H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL -#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL -#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL -#define UV1H_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL -#define UV1H_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL -#define UV1H_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL -#define UV1H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL -#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL -#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL -#define UV1H_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL -#define UV1H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL -#define UV1H_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL -#define UV1H_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL -#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL -#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL -#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL -#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL -#define 
UV1H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL -#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL -#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL -#define UV1H_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL -#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL -#define UV1H_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL -#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL -#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL -#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL -#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL -#define UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL -#define UV1H_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL -#define UV1H_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL -#define UV1H_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL -#define UV1H_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL -#define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL -#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL - #define UVXH_EVENT_OCCURRED0_RH_HCERR_SHFT 2 #define UVXH_EVENT_OCCURRED0_LH0_HCERR_SHFT 3 #define UVXH_EVENT_OCCURRED0_LH1_HCERR_SHFT 4 @@ -605,7 +473,6 @@ union uvh_bau_data_config_u { #define UV4H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x8000000000000000UL #define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT ( \ - is_uv1_hub() ? UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT : \ is_uv2_hub() ? UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT : \ is_uv3_hub() ? UV3H_EVENT_OCCURRED0_EXTIO_INT0_SHFT : \ /*is_uv4_hub*/ UV4H_EVENT_OCCURRED0_EXTIO_INT0_SHFT) @@ -718,12 +585,10 @@ union uvh_event_occurred0_u { /* ========================================================================= */ #define UVH_EXTIO_INT0_BROADCAST 0x61448UL -#define UV1H_EXTIO_INT0_BROADCAST_32 0x3f0 #define UV2H_EXTIO_INT0_BROADCAST_32 0x3f0 #define UV3H_EXTIO_INT0_BROADCAST_32 0x3f0 #define UV4H_EXTIO_INT0_BROADCAST_32 0x310 #define UVH_EXTIO_INT0_BROADCAST_32 ( \ - is_uv1_hub() ? UV1H_EXTIO_INT0_BROADCAST_32 : \ is_uv2_hub() ? UV2H_EXTIO_INT0_BROADCAST_32 : \ is_uv3_hub() ? UV3H_EXTIO_INT0_BROADCAST_32 : \ /*is_uv4_hub*/ UV4H_EXTIO_INT0_BROADCAST_32) @@ -821,12 +686,10 @@ union uvh_gr0_tlb_int1_config_u { /* ========================================================================= */ /* UVH_GR0_TLB_MMR_CONTROL */ /* ========================================================================= */ -#define UV1H_GR0_TLB_MMR_CONTROL 0x401080UL #define UV2H_GR0_TLB_MMR_CONTROL 0xc01080UL #define UV3H_GR0_TLB_MMR_CONTROL 0xc01080UL #define UV4H_GR0_TLB_MMR_CONTROL 0x601080UL #define UVH_GR0_TLB_MMR_CONTROL ( \ - is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL : \ is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL : \ is_uv3_hub() ? 
UV3H_GR0_TLB_MMR_CONTROL : \ /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL) @@ -841,29 +704,6 @@ union uvh_gr0_tlb_int1_config_u { #define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL #define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL -#define UV1H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 -#define UV1H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 -#define UV1H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 -#define UV1H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 -#define UV1H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 -#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48 -#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52 -#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_SHFT 54 -#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_SHFT 56 -#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_SHFT 60 -#define UV1H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL -#define UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL -#define UV1H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL -#define UV1H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL -#define UV1H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL -#define UV1H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL -#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL -#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL -#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_MASK 0x0040000000000000UL -#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK 0x0100000000000000UL -#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL - #define UVXH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 #define UVXH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 #define UVXH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 @@ -932,17 +772,14 @@ union uvh_gr0_tlb_int1_config_u { #define UV4H_GR0_TLB_MMR_CONTROL_PAGE_SIZE_MASK 0xf800000000000000UL #define UVH_GR0_TLB_MMR_CONTROL_INDEX_MASK ( \ - is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL_INDEX_MASK : \ is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL_INDEX_MASK : \ is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL_INDEX_MASK : \ /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL_INDEX_MASK) #define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK ( \ - is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK : \ is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK : \ is_uv3_hub() ? UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK : \ /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK) #define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT ( \ - is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT : \ is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT : \ is_uv3_hub() ? 
UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT : \ /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT) @@ -961,28 +798,6 @@ union uvh_gr0_tlb_mmr_control_u { unsigned long rsvd_49_51:3; unsigned long rsvd_52_63:12; } s; - struct uv1h_gr0_tlb_mmr_control_s { - unsigned long index:12; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_14_15:2; - unsigned long auto_valid_en:1; /* RW */ - unsigned long rsvd_17_19:3; - unsigned long mmr_hash_index_en:1; /* RW */ - unsigned long rsvd_21_29:9; - unsigned long mmr_write:1; /* WP */ - unsigned long mmr_read:1; /* WP */ - unsigned long rsvd_32_47:16; - unsigned long mmr_inj_con:1; /* RW */ - unsigned long rsvd_49_51:3; - unsigned long mmr_inj_tlbram:1; /* RW */ - unsigned long rsvd_53:1; - unsigned long mmr_inj_tlbpgsize:1; /* RW */ - unsigned long rsvd_55:1; - unsigned long mmr_inj_tlbrreg:1; /* RW */ - unsigned long rsvd_57_59:3; - unsigned long mmr_inj_tlblruv:1; /* RW */ - unsigned long rsvd_61_63:3; - } s1; struct uvxh_gr0_tlb_mmr_control_s { unsigned long rsvd_0_15:16; unsigned long auto_valid_en:1; /* RW */ @@ -1055,27 +870,16 @@ union uvh_gr0_tlb_mmr_control_u { /* ========================================================================= */ /* UVH_GR0_TLB_MMR_READ_DATA_HI */ /* ========================================================================= */ -#define UV1H_GR0_TLB_MMR_READ_DATA_HI 0x4010a0UL #define UV2H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL #define UV3H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL #define UV4H_GR0_TLB_MMR_READ_DATA_HI 0x6010a0UL #define UVH_GR0_TLB_MMR_READ_DATA_HI ( \ - is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_HI : \ is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_HI : \ is_uv3_hub() ? UV3H_GR0_TLB_MMR_READ_DATA_HI : \ /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_READ_DATA_HI) #define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UV1H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UV1H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 -#define UV1H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 -#define UV1H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 -#define UV1H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL -#define UV1H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL -#define UV1H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL -#define UV1H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL - #define UVXH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 #define UV2H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 @@ -1118,13 +922,6 @@ union uvh_gr0_tlb_mmr_control_u { union uvh_gr0_tlb_mmr_read_data_hi_u { unsigned long v; - struct uv1h_gr0_tlb_mmr_read_data_hi_s { - unsigned long pfn:41; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long rsvd_45_63:19; - } s1; struct uv2h_gr0_tlb_mmr_read_data_hi_s { unsigned long pfn:41; /* RO */ unsigned long gaa:2; /* RO */ @@ -1156,12 +953,10 @@ union uvh_gr0_tlb_mmr_read_data_hi_u { /* ========================================================================= */ /* UVH_GR0_TLB_MMR_READ_DATA_LO */ /* ========================================================================= */ -#define UV1H_GR0_TLB_MMR_READ_DATA_LO 0x4010a8UL #define UV2H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL #define UV3H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL #define UV4H_GR0_TLB_MMR_READ_DATA_LO 0x6010a8UL #define UVH_GR0_TLB_MMR_READ_DATA_LO ( \ - is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_LO : \ is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_LO : \ is_uv3_hub() ? 
UV3H_GR0_TLB_MMR_READ_DATA_LO : \ /*is_uv4_hub*/ UV4H_GR0_TLB_MMR_READ_DATA_LO) @@ -1173,13 +968,6 @@ union uvh_gr0_tlb_mmr_read_data_hi_u { #define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL #define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL -#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 -#define UV1H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 -#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 -#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL -#define UV1H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL -#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL - #define UVXH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 #define UVXH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 #define UVXH_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 @@ -1216,11 +1004,6 @@ union uvh_gr0_tlb_mmr_read_data_lo_u { unsigned long asid:24; /* RO */ unsigned long valid:1; /* RO */ } s; - struct uv1h_gr0_tlb_mmr_read_data_lo_s { - unsigned long vpn:39; /* RO */ - unsigned long asid:24; /* RO */ - unsigned long valid:1; /* RO */ - } s1; struct uvxh_gr0_tlb_mmr_read_data_lo_s { unsigned long vpn:39; /* RO */ unsigned long asid:24; /* RO */ @@ -1246,12 +1029,10 @@ union uvh_gr0_tlb_mmr_read_data_lo_u { /* ========================================================================= */ /* UVH_GR1_TLB_INT0_CONFIG */ /* ========================================================================= */ -#define UV1H_GR1_TLB_INT0_CONFIG 0x61f00UL #define UV2H_GR1_TLB_INT0_CONFIG 0x61f00UL #define UV3H_GR1_TLB_INT0_CONFIG 0x61f00UL #define UV4H_GR1_TLB_INT0_CONFIG 0x62100UL #define UVH_GR1_TLB_INT0_CONFIG ( \ - is_uv1_hub() ? UV1H_GR1_TLB_INT0_CONFIG : \ is_uv2_hub() ? UV2H_GR1_TLB_INT0_CONFIG : \ is_uv3_hub() ? UV3H_GR1_TLB_INT0_CONFIG : \ /*is_uv4_hub*/ UV4H_GR1_TLB_INT0_CONFIG) @@ -1293,12 +1074,10 @@ union uvh_gr1_tlb_int0_config_u { /* ========================================================================= */ /* UVH_GR1_TLB_INT1_CONFIG */ /* ========================================================================= */ -#define UV1H_GR1_TLB_INT1_CONFIG 0x61f40UL #define UV2H_GR1_TLB_INT1_CONFIG 0x61f40UL #define UV3H_GR1_TLB_INT1_CONFIG 0x61f40UL #define UV4H_GR1_TLB_INT1_CONFIG 0x62140UL #define UVH_GR1_TLB_INT1_CONFIG ( \ - is_uv1_hub() ? UV1H_GR1_TLB_INT1_CONFIG : \ is_uv2_hub() ? UV2H_GR1_TLB_INT1_CONFIG : \ is_uv3_hub() ? UV3H_GR1_TLB_INT1_CONFIG : \ /*is_uv4_hub*/ UV4H_GR1_TLB_INT1_CONFIG) @@ -1340,12 +1119,10 @@ union uvh_gr1_tlb_int1_config_u { /* ========================================================================= */ /* UVH_GR1_TLB_MMR_CONTROL */ /* ========================================================================= */ -#define UV1H_GR1_TLB_MMR_CONTROL 0x801080UL #define UV2H_GR1_TLB_MMR_CONTROL 0x1001080UL #define UV3H_GR1_TLB_MMR_CONTROL 0x1001080UL #define UV4H_GR1_TLB_MMR_CONTROL 0x701080UL #define UVH_GR1_TLB_MMR_CONTROL ( \ - is_uv1_hub() ? UV1H_GR1_TLB_MMR_CONTROL : \ is_uv2_hub() ? UV2H_GR1_TLB_MMR_CONTROL : \ is_uv3_hub() ? 
UV3H_GR1_TLB_MMR_CONTROL : \ /*is_uv4_hub*/ UV4H_GR1_TLB_MMR_CONTROL) @@ -1360,29 +1137,6 @@ union uvh_gr1_tlb_int1_config_u { #define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL #define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL -#define UV1H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 -#define UV1H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 -#define UV1H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 -#define UV1H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 -#define UV1H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 -#define UV1H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 -#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48 -#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52 -#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_SHFT 54 -#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_SHFT 56 -#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_SHFT 60 -#define UV1H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL -#define UV1H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL -#define UV1H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL -#define UV1H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL -#define UV1H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL -#define UV1H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL -#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL -#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL -#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_MASK 0x0040000000000000UL -#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK 0x0100000000000000UL -#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL - #define UVXH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 #define UVXH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 #define UVXH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 @@ -1465,28 +1219,6 @@ union uvh_gr1_tlb_mmr_control_u { unsigned long rsvd_49_51:3; unsigned long rsvd_52_63:12; } s; - struct uv1h_gr1_tlb_mmr_control_s { - unsigned long index:12; /* RW */ - unsigned long mem_sel:2; /* RW */ - unsigned long rsvd_14_15:2; - unsigned long auto_valid_en:1; /* RW */ - unsigned long rsvd_17_19:3; - unsigned long mmr_hash_index_en:1; /* RW */ - unsigned long rsvd_21_29:9; - unsigned long mmr_write:1; /* WP */ - unsigned long mmr_read:1; /* WP */ - unsigned long rsvd_32_47:16; - unsigned long mmr_inj_con:1; /* RW */ - unsigned long rsvd_49_51:3; - unsigned long mmr_inj_tlbram:1; /* RW */ - unsigned long rsvd_53:1; - unsigned long mmr_inj_tlbpgsize:1; /* RW */ - unsigned long rsvd_55:1; - unsigned long mmr_inj_tlbrreg:1; /* RW */ - unsigned long rsvd_57_59:3; - unsigned long mmr_inj_tlblruv:1; /* RW */ - unsigned long rsvd_61_63:3; - } s1; struct uvxh_gr1_tlb_mmr_control_s { unsigned long rsvd_0_15:16; unsigned long auto_valid_en:1; /* RW */ @@ -1559,27 +1291,16 @@ union uvh_gr1_tlb_mmr_control_u { /* ========================================================================= */ /* UVH_GR1_TLB_MMR_READ_DATA_HI */ /* ========================================================================= */ -#define UV1H_GR1_TLB_MMR_READ_DATA_HI 0x8010a0UL #define UV2H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL #define UV3H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL #define UV4H_GR1_TLB_MMR_READ_DATA_HI 0x7010a0UL #define UVH_GR1_TLB_MMR_READ_DATA_HI ( \ - is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_HI : \ is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_HI : \ is_uv3_hub() ? 
UV3H_GR1_TLB_MMR_READ_DATA_HI : \ /*is_uv4_hub*/ UV4H_GR1_TLB_MMR_READ_DATA_HI) #define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UV1H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 -#define UV1H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 -#define UV1H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 -#define UV1H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 -#define UV1H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL -#define UV1H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL -#define UV1H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL -#define UV1H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL - #define UVXH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 #define UV2H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 @@ -1622,13 +1343,6 @@ union uvh_gr1_tlb_mmr_control_u { union uvh_gr1_tlb_mmr_read_data_hi_u { unsigned long v; - struct uv1h_gr1_tlb_mmr_read_data_hi_s { - unsigned long pfn:41; /* RO */ - unsigned long gaa:2; /* RO */ - unsigned long dirty:1; /* RO */ - unsigned long larger:1; /* RO */ - unsigned long rsvd_45_63:19; - } s1; struct uv2h_gr1_tlb_mmr_read_data_hi_s { unsigned long pfn:41; /* RO */ unsigned long gaa:2; /* RO */ @@ -1660,12 +1374,10 @@ union uvh_gr1_tlb_mmr_read_data_hi_u { /* ========================================================================= */ /* UVH_GR1_TLB_MMR_READ_DATA_LO */ /* ========================================================================= */ -#define UV1H_GR1_TLB_MMR_READ_DATA_LO 0x8010a8UL #define UV2H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL #define UV3H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL #define UV4H_GR1_TLB_MMR_READ_DATA_LO 0x7010a8UL #define UVH_GR1_TLB_MMR_READ_DATA_LO ( \ - is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_LO : \ is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_LO : \ is_uv3_hub() ? UV3H_GR1_TLB_MMR_READ_DATA_LO : \ /*is_uv4_hub*/ UV4H_GR1_TLB_MMR_READ_DATA_LO) @@ -1677,13 +1389,6 @@ union uvh_gr1_tlb_mmr_read_data_hi_u { #define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL #define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL -#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 -#define UV1H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 -#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 -#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL -#define UV1H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL -#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL - #define UVXH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 #define UVXH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 #define UVXH_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 @@ -1720,11 +1425,6 @@ union uvh_gr1_tlb_mmr_read_data_lo_u { unsigned long asid:24; /* RO */ unsigned long valid:1; /* RO */ } s; - struct uv1h_gr1_tlb_mmr_read_data_lo_s { - unsigned long vpn:39; /* RO */ - unsigned long asid:24; /* RO */ - unsigned long valid:1; /* RO */ - } s1; struct uvxh_gr1_tlb_mmr_read_data_lo_s { unsigned long vpn:39; /* RO */ unsigned long asid:24; /* RO */ @@ -1770,9 +1470,6 @@ union uvh_int_cmpb_u { #define UVH_INT_CMPC 0x22100UL -#define UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT 0 -#define UV1H_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL - #define UVXH_INT_CMPC_REAL_TIME_CMP_2_SHFT 0 #define UVXH_INT_CMPC_REAL_TIME_CMP_2_MASK 0x00ffffffffffffffUL @@ -1791,9 +1488,6 @@ union uvh_int_cmpc_u { #define UVH_INT_CMPD 0x22180UL -#define UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT 0 -#define UV1H_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL - #define UVXH_INT_CMPD_REAL_TIME_CMP_3_SHFT 0 #define UVXH_INT_CMPD_REAL_TIME_CMP_3_MASK 0x00ffffffffffffffUL @@ -1811,12 
+1505,10 @@ union uvh_int_cmpd_u { /* ========================================================================= */ #define UVH_IPI_INT 0x60500UL -#define UV1H_IPI_INT_32 0x348 #define UV2H_IPI_INT_32 0x348 #define UV3H_IPI_INT_32 0x348 #define UV4H_IPI_INT_32 0x268 #define UVH_IPI_INT_32 ( \ - is_uv1_hub() ? UV1H_IPI_INT_32 : \ is_uv2_hub() ? UV2H_IPI_INT_32 : \ is_uv3_hub() ? UV3H_IPI_INT_32 : \ /*is_uv4_hub*/ UV4H_IPI_INT_32) @@ -1849,24 +1541,16 @@ union uvh_ipi_int_u { /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */ /* ========================================================================= */ -#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL #define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL #define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL #define UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST uv_undefined("UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST") #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST ( \ - is_uv1_hub() ? UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST : \ is_uv2_hub() ? UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST : \ is_uv3_hub() ? UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST : \ /*is_uv4_hub*/ UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST) #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0 -#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 -#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 -#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL -#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL - - #define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 #define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 #define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL @@ -1880,13 +1564,6 @@ union uvh_ipi_int_u { union uvh_lb_bau_intd_payload_queue_first_u { unsigned long v; - struct uv1h_lb_bau_intd_payload_queue_first_s { - unsigned long rsvd_0_3:4; - unsigned long address:39; /* RW */ - unsigned long rsvd_43_48:6; - unsigned long node_id:14; /* RW */ - unsigned long rsvd_63:1; - } s1; struct uv2h_lb_bau_intd_payload_queue_first_s { unsigned long rsvd_0_3:4; unsigned long address:39; /* RW */ @@ -1906,22 +1583,16 @@ union uvh_lb_bau_intd_payload_queue_first_u { /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */ /* ========================================================================= */ -#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL #define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL #define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL #define UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST uv_undefined("UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST") #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST ( \ - is_uv1_hub() ? UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST : \ is_uv2_hub() ? UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST : \ is_uv3_hub() ? 
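/*
 * Illustrative aside, not part of this patch: the *_SHFT/*_MASK pairs in
 * this header are normally used together to place a field into, or pull
 * it out of, a raw 64-bit MMR word.  The EX_* layout below mirrors the
 * payload-queue "address"/"node_id" fields listed nearby, but the names
 * and the values passed in main() are made up for the example.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_ADDRESS_SHFT 4
#define EX_ADDRESS_MASK 0x000007fffffffff0UL
#define EX_NODE_ID_SHFT 49
#define EX_NODE_ID_MASK 0x7ffe000000000000UL

static uint64_t ex_compose(uint64_t address_field, uint64_t node_id)
{
        uint64_t v = 0;

        v |= (address_field << EX_ADDRESS_SHFT) & EX_ADDRESS_MASK;
        v |= (node_id << EX_NODE_ID_SHFT) & EX_NODE_ID_MASK;
        return v;
}

int main(void)
{
        printf("composed MMR value: 0x%016llx\n",
               (unsigned long long)ex_compose(0x12345, 3));
        return 0;
}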
UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST : \ /*is_uv4_hub*/ UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST) #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8 -#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 -#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL - - #define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 #define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL @@ -1931,11 +1602,6 @@ union uvh_lb_bau_intd_payload_queue_first_u { union uvh_lb_bau_intd_payload_queue_last_u { unsigned long v; - struct uv1h_lb_bau_intd_payload_queue_last_s { - unsigned long rsvd_0_3:4; - unsigned long address:39; /* RW */ - unsigned long rsvd_43_63:21; - } s1; struct uv2h_lb_bau_intd_payload_queue_last_s { unsigned long rsvd_0_3:4; unsigned long address:39; /* RW */ @@ -1951,22 +1617,16 @@ union uvh_lb_bau_intd_payload_queue_last_u { /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */ /* ========================================================================= */ -#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL #define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL #define UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL #define UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL uv_undefined("UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL") #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL ( \ - is_uv1_hub() ? UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL : \ is_uv2_hub() ? UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL : \ is_uv3_hub() ? UV3H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL : \ /*is_uv4_hub*/ UV4H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL) #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0 -#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 -#define UV1H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL - - #define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 #define UV2H_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL @@ -1976,11 +1636,6 @@ union uvh_lb_bau_intd_payload_queue_last_u { union uvh_lb_bau_intd_payload_queue_tail_u { unsigned long v; - struct uv1h_lb_bau_intd_payload_queue_tail_s { - unsigned long rsvd_0_3:4; - unsigned long address:39; /* RW */ - unsigned long rsvd_43_63:21; - } s1; struct uv2h_lb_bau_intd_payload_queue_tail_s { unsigned long rsvd_0_3:4; unsigned long address:39; /* RW */ @@ -1996,52 +1651,16 @@ union uvh_lb_bau_intd_payload_queue_tail_u { /* ========================================================================= */ /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */ /* ========================================================================= */ -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL #define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL #define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL #define UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE uv_undefined("UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE") #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE ( \ - is_uv1_hub() ? UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE : \ is_uv2_hub() ? UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE : \ is_uv3_hub() ? 
UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE : \ /*is_uv4_hub*/ UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE) #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68 -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL - - #define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 #define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 #define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 @@ -2111,25 +1730,6 @@ union uvh_lb_bau_intd_payload_queue_tail_u { union uvh_lb_bau_intd_software_acknowledge_u { unsigned long v; - struct uv1h_lb_bau_intd_software_acknowledge_s { - unsigned long pending_0:1; /* RW, W1C */ - unsigned long pending_1:1; /* RW, W1C */ - unsigned long pending_2:1; /* RW, W1C */ - unsigned long pending_3:1; /* RW, W1C */ - unsigned long pending_4:1; /* RW, W1C */ - unsigned long pending_5:1; /* RW, W1C */ - unsigned long pending_6:1; /* RW, W1C */ - unsigned long pending_7:1; /* RW, W1C */ - unsigned long timeout_0:1; /* RW, W1C */ - unsigned long timeout_1:1; /* RW, W1C */ - unsigned long timeout_2:1; /* RW, W1C */ - unsigned long timeout_3:1; /* RW, W1C */ - unsigned long timeout_4:1; /* RW, W1C */ - unsigned long timeout_5:1; /* RW, W1C */ - unsigned long timeout_6:1; /* RW, W1C */ - unsigned long 
timeout_7:1; /* RW, W1C */ - unsigned long rsvd_16_63:48; - } s1; struct uv2h_lb_bau_intd_software_acknowledge_s { unsigned long pending_0:1; /* RW */ unsigned long pending_1:1; /* RW */ @@ -2173,12 +1773,10 @@ union uvh_lb_bau_intd_software_acknowledge_u { /* ========================================================================= */ /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */ /* ========================================================================= */ -#define UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL #define UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL #define UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL #define UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS uv_undefined("UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS") #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS ( \ - is_uv1_hub() ? UV1H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS : \ is_uv2_hub() ? UV2H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS : \ is_uv3_hub() ? UV3H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS : \ /*is_uv4_hub*/ UV4H_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS) @@ -2188,22 +1786,18 @@ union uvh_lb_bau_intd_software_acknowledge_u { /* ========================================================================= */ /* UVH_LB_BAU_MISC_CONTROL */ /* ========================================================================= */ -#define UV1H_LB_BAU_MISC_CONTROL 0x320170UL #define UV2H_LB_BAU_MISC_CONTROL 0x320170UL #define UV3H_LB_BAU_MISC_CONTROL 0x320170UL #define UV4H_LB_BAU_MISC_CONTROL 0xc8170UL #define UVH_LB_BAU_MISC_CONTROL ( \ - is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL : \ is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL : \ is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL : \ /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL) -#define UV1H_LB_BAU_MISC_CONTROL_32 0xa10 #define UV2H_LB_BAU_MISC_CONTROL_32 0xa10 #define UV3H_LB_BAU_MISC_CONTROL_32 0xa10 #define UV4H_LB_BAU_MISC_CONTROL_32 0xa18 #define UVH_LB_BAU_MISC_CONTROL_32 ( \ - is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_32 : \ is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_32 : \ is_uv3_hub() ? 
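/*
 * Illustrative aside, not part of this patch: the "W1C" annotation on
 * the software_acknowledge pending/timeout bits means write-1-to-clear:
 * storing a 1 to a bit position clears it, storing 0 leaves it alone.
 * This is a plain-C model of that hardware behaviour, not real MMR
 * access code; the register image and bit numbers are made up.
 */
#include <stdint.h>
#include <stdio.h>

struct w1c_reg {
        uint64_t value;
};

static void w1c_write(struct w1c_reg *r, uint64_t bits)
{
        r->value &= ~bits;              /* every 1 written clears that bit */
}

int main(void)
{
        struct w1c_reg ack = { .value = 0x5 }; /* pending_0 and pending_2 set */

        w1c_write(&ack, 0x1);           /* acknowledge pending_0 only */
        printf("still pending: 0x%llx\n", (unsigned long long)ack.value);
        return 0;
}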
UV3H_LB_BAU_MISC_CONTROL_32 : \ /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_32) @@ -2237,39 +1831,6 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL #define UVH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL -#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 -#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 -#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 -#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 -#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 -#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 -#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 -#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 -#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 -#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 -#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 -#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 -#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 -#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 -#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 -#define UV1H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 -#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL -#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL -#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL -#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL -#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL -#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL -#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL -#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL -#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL -#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL -#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL -#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL -#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL -#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL -#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL -#define UV1H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL - #define UVXH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 #define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 #define UVXH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 @@ -2469,28 +2030,24 @@ union uvh_lb_bau_intd_software_acknowledge_u { #define UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK \ uv_undefined("UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK") #define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK ( \ - is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK : \ is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK : \ is_uv3_hub() ? 
UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK : \ /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK) #define UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT \ uv_undefined("UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT") #define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT ( \ - is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT : \ is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT : \ is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT : \ /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT) #define UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK \ uv_undefined("UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK") #define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK ( \ - is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK : \ is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK : \ is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK : \ /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK) #define UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT \ uv_undefined("UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT") #define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT ( \ - is_uv1_hub() ? UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT : \ is_uv2_hub() ? UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT : \ is_uv3_hub() ? UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT : \ /*is_uv4_hub*/ UV4H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT) @@ -2515,25 +2072,6 @@ union uvh_lb_bau_misc_control_u { unsigned long rsvd_29_47:19; unsigned long fun:16; /* RW */ } s; - struct uv1h_lb_bau_misc_control_s { - unsigned long rejection_delay:8; /* RW */ - unsigned long apic_mode:1; /* RW */ - unsigned long force_broadcast:1; /* RW */ - unsigned long force_lock_nop:1; /* RW */ - unsigned long qpi_agent_presence_vector:3; /* RW */ - unsigned long descriptor_fetch_mode:1; /* RW */ - unsigned long enable_intd_soft_ack_mode:1; /* RW */ - unsigned long intd_soft_ack_timeout_period:4; /* RW */ - unsigned long enable_dual_mapping_mode:1; /* RW */ - unsigned long vga_io_port_decode_enable:1; /* RW */ - unsigned long vga_io_port_16_bit_decode:1; /* RW */ - unsigned long suppress_dest_registration:1; /* RW */ - unsigned long programmed_initial_priority:3; /* RW */ - unsigned long use_incoming_priority:1; /* RW */ - unsigned long enable_programmed_initial_priority:1;/* RW */ - unsigned long rsvd_29_47:19; - unsigned long fun:16; /* RW */ - } s1; struct uvxh_lb_bau_misc_control_s { unsigned long rejection_delay:8; /* RW */ unsigned long apic_mode:1; /* RW */ @@ -2648,22 +2186,18 @@ union uvh_lb_bau_misc_control_u { /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_CONTROL */ /* ========================================================================= */ -#define UV1H_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL #define UV2H_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL #define UV3H_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL #define UV4H_LB_BAU_SB_ACTIVATION_CONTROL 0xc8020UL #define UVH_LB_BAU_SB_ACTIVATION_CONTROL ( \ - is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_CONTROL : \ is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_CONTROL : \ is_uv3_hub() ? 
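/*
 * Illustrative aside, not part of this patch: the decode direction of
 * the *_SHFT/*_MASK idiom, modelled on the 4-bit soft-ack timeout
 * period field (bits 19:16) listed above.  The EX_* names and the
 * sample register value are local to this sketch.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_TIMEOUT_PERIOD_SHFT 16
#define EX_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL

static unsigned int ex_timeout_period(uint64_t misc_control)
{
        return (unsigned int)((misc_control & EX_TIMEOUT_PERIOD_MASK) >>
                              EX_TIMEOUT_PERIOD_SHFT);
}

int main(void)
{
        uint64_t reg = 0xa0000UL;       /* period = 0xa, everything else clear */

        printf("soft-ack timeout period: %u\n", ex_timeout_period(reg));
        return 0;
}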
UV3H_LB_BAU_SB_ACTIVATION_CONTROL : \ /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_CONTROL) -#define UV1H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 #define UV2H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 #define UV3H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 #define UV4H_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9c8 #define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 ( \ - is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_CONTROL_32 : \ is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_CONTROL_32 : \ is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_CONTROL_32 : \ /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_CONTROL_32) @@ -2689,22 +2223,18 @@ union uvh_lb_bau_sb_activation_control_u { /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */ /* ========================================================================= */ -#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL #define UV2H_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL #define UV3H_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL #define UV4H_LB_BAU_SB_ACTIVATION_STATUS_0 0xc8030UL #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 ( \ - is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_0 : \ is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_0 : \ is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_0 : \ /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_0) -#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 #define UV2H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 #define UV3H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 #define UV4H_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9d0 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 ( \ - is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_0_32 : \ is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_0_32 : \ is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_0_32 : \ /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_0_32) @@ -2723,22 +2253,18 @@ union uvh_lb_bau_sb_activation_status_0_u { /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */ /* ========================================================================= */ -#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL #define UV2H_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL #define UV3H_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL #define UV4H_LB_BAU_SB_ACTIVATION_STATUS_1 0xc8040UL #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 ( \ - is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_1 : \ is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_1 : \ is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_1 : \ /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_1) -#define UV1H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 #define UV2H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 #define UV3H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 #define UV4H_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9d8 #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 ( \ - is_uv1_hub() ? UV1H_LB_BAU_SB_ACTIVATION_STATUS_1_32 : \ is_uv2_hub() ? UV2H_LB_BAU_SB_ACTIVATION_STATUS_1_32 : \ is_uv3_hub() ? UV3H_LB_BAU_SB_ACTIVATION_STATUS_1_32 : \ /*is_uv4_hub*/ UV4H_LB_BAU_SB_ACTIVATION_STATUS_1_32) @@ -2757,32 +2283,24 @@ union uvh_lb_bau_sb_activation_status_1_u { /* ========================================================================= */ /* UVH_LB_BAU_SB_DESCRIPTOR_BASE */ /* ========================================================================= */ -#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL #define UV2H_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL #define UV3H_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL #define UV4H_LB_BAU_SB_DESCRIPTOR_BASE 0xc8010UL #define UVH_LB_BAU_SB_DESCRIPTOR_BASE ( \ - is_uv1_hub() ? 
UV1H_LB_BAU_SB_DESCRIPTOR_BASE : \ is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE : \ is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_BASE : \ /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE) -#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 #define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 #define UV3H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 #define UV4H_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9c0 #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 ( \ - is_uv1_hub() ? UV1H_LB_BAU_SB_DESCRIPTOR_BASE_32 : \ is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE_32 : \ is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_BASE_32 : \ /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE_32) #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12 -#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 -#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL -#define UV1H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL - #define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 #define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL #define UV2H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL @@ -2800,21 +2318,18 @@ union uvh_lb_bau_sb_activation_status_1_u { #define UV4AH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0xffe0000000000000UL #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT ( \ - is_uv1_hub() ? UV1H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT : \ is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT : \ is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT : \ is_uv4a_hub() ? UV4AH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT : \ /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT) #define UVH_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK ( \ - is_uv1_hub() ? UV1H_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK : \ is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK : \ is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK : \ is_uv4a_hub() ? UV4AH_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK : \ /*is_uv4_hub*/ UV4H_LB_BAU_SB_DESCRIPTOR_PAGE_ADDRESS_MASK) #define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK ( \ - is_uv1_hub() ? UV1H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK : \ is_uv2_hub() ? UV2H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK : \ is_uv3_hub() ? UV3H_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK : \ is_uv4a_hub() ? 
UV4AH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK : \ @@ -2824,7 +2339,6 @@ union uvh_lb_bau_sb_activation_status_1_u { /* UVH_NODE_ID */ /* ========================================================================= */ #define UVH_NODE_ID 0x0UL -#define UV1H_NODE_ID 0x0UL #define UV2H_NODE_ID 0x0UL #define UV3H_NODE_ID 0x0UL #define UV4H_NODE_ID 0x0UL @@ -2840,21 +2354,6 @@ union uvh_lb_bau_sb_activation_status_1_u { #define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL #define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL -#define UV1H_NODE_ID_FORCE1_SHFT 0 -#define UV1H_NODE_ID_MANUFACTURER_SHFT 1 -#define UV1H_NODE_ID_PART_NUMBER_SHFT 12 -#define UV1H_NODE_ID_REVISION_SHFT 28 -#define UV1H_NODE_ID_NODE_ID_SHFT 32 -#define UV1H_NODE_ID_NODES_PER_BIT_SHFT 48 -#define UV1H_NODE_ID_NI_PORT_SHFT 56 -#define UV1H_NODE_ID_FORCE1_MASK 0x0000000000000001UL -#define UV1H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL -#define UV1H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL -#define UV1H_NODE_ID_REVISION_MASK 0x00000000f0000000UL -#define UV1H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL -#define UV1H_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL -#define UV1H_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL - #define UVXH_NODE_ID_FORCE1_SHFT 0 #define UVXH_NODE_ID_MANUFACTURER_SHFT 1 #define UVXH_NODE_ID_PART_NUMBER_SHFT 12 @@ -2934,18 +2433,6 @@ union uvh_node_id_u { unsigned long node_id:15; /* RW */ unsigned long rsvd_47_63:17; } s; - struct uv1h_node_id_s { - unsigned long force1:1; /* RO */ - unsigned long manufacturer:11; /* RO */ - unsigned long part_number:16; /* RO */ - unsigned long revision:4; /* RO */ - unsigned long node_id:15; /* RW */ - unsigned long rsvd_47:1; - unsigned long nodes_per_bit:7; /* RW */ - unsigned long rsvd_55:1; - unsigned long ni_port:4; /* RO */ - unsigned long rsvd_60_63:4; - } s1; struct uvxh_node_id_s { unsigned long force1:1; /* RO */ unsigned long manufacturer:11; /* RO */ @@ -3001,12 +2488,10 @@ union uvh_node_id_u { /* ========================================================================= */ #define UVH_NODE_PRESENT_TABLE 0x1400UL -#define UV1H_NODE_PRESENT_TABLE_DEPTH 16 #define UV2H_NODE_PRESENT_TABLE_DEPTH 16 #define UV3H_NODE_PRESENT_TABLE_DEPTH 16 #define UV4H_NODE_PRESENT_TABLE_DEPTH 4 #define UVH_NODE_PRESENT_TABLE_DEPTH ( \ - is_uv1_hub() ? UV1H_NODE_PRESENT_TABLE_DEPTH : \ is_uv2_hub() ? UV2H_NODE_PRESENT_TABLE_DEPTH : \ is_uv3_hub() ? UV3H_NODE_PRESENT_TABLE_DEPTH : \ /*is_uv4_hub*/ UV4H_NODE_PRESENT_TABLE_DEPTH) @@ -3025,12 +2510,10 @@ union uvh_node_present_table_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */ /* ========================================================================= */ -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL #define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL #define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL #define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x4800c8UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR ( \ - is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR : \ is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR : \ is_uv3_hub() ? 
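/*
 * Illustrative aside, not part of this patch: every register in this
 * header gets a union with a raw "v" word plus per-generation bitfield
 * views, so callers can either move the whole MMR or name a field.  The
 * layout below loosely follows the node_id fields shown above; note
 * that C bitfield packing is ABI-specific, which is why such unions are
 * only meaningful for the one fixed target (x86_64) this header serves.
 */
#include <stdio.h>

union ex_node_id_u {
        unsigned long v;                        /* raw 64-bit MMR image */
        struct {
                unsigned long force1:1;
                unsigned long manufacturer:11;
                unsigned long part_number:16;
                unsigned long revision:4;
                unsigned long node_id:15;
                unsigned long rsvd_47_63:17;
        } s;
};

int main(void)
{
        union ex_node_id_u id;

        id.v = 0x0000000700000001UL;            /* made-up raw value */
        printf("node_id=%lu revision=%lu\n",
               (unsigned long)id.s.node_id, (unsigned long)id.s.revision);
        return 0;
}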
UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR : \ /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR) @@ -3042,13 +2525,6 @@ union uvh_node_present_table_u { #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63 -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL - #define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 #define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 #define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63 @@ -3088,14 +2564,6 @@ union uvh_rh_gam_alias210_overlay_config_0_mmr_u { unsigned long rsvd_53_62:10; unsigned long enable:1; /* RW */ } s; - struct uv1h_rh_gam_alias210_overlay_config_0_mmr_s { - unsigned long rsvd_0_23:24; - unsigned long base:8; /* RW */ - unsigned long rsvd_32_47:16; - unsigned long m_alias:5; /* RW */ - unsigned long rsvd_53_62:10; - unsigned long enable:1; /* RW */ - } s1; struct uvxh_rh_gam_alias210_overlay_config_0_mmr_s { unsigned long rsvd_0_23:24; unsigned long base:8; /* RW */ @@ -3133,12 +2601,10 @@ union uvh_rh_gam_alias210_overlay_config_0_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */ /* ========================================================================= */ -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL #define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL #define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL #define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x4800d8UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR ( \ - is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR : \ is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR : \ is_uv3_hub() ? 
UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR : \ /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR) @@ -3150,13 +2616,6 @@ union uvh_rh_gam_alias210_overlay_config_0_mmr_u { #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63 -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL - #define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 #define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 #define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63 @@ -3196,14 +2655,6 @@ union uvh_rh_gam_alias210_overlay_config_1_mmr_u { unsigned long rsvd_53_62:10; unsigned long enable:1; /* RW */ } s; - struct uv1h_rh_gam_alias210_overlay_config_1_mmr_s { - unsigned long rsvd_0_23:24; - unsigned long base:8; /* RW */ - unsigned long rsvd_32_47:16; - unsigned long m_alias:5; /* RW */ - unsigned long rsvd_53_62:10; - unsigned long enable:1; /* RW */ - } s1; struct uvxh_rh_gam_alias210_overlay_config_1_mmr_s { unsigned long rsvd_0_23:24; unsigned long base:8; /* RW */ @@ -3241,12 +2692,10 @@ union uvh_rh_gam_alias210_overlay_config_1_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */ /* ========================================================================= */ -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL #define UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL #define UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL #define UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x4800e8UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR ( \ - is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR : \ is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR : \ is_uv3_hub() ? 
UV3H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR : \ /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR) @@ -3258,13 +2707,6 @@ union uvh_rh_gam_alias210_overlay_config_1_mmr_u { #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63 -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL -#define UV1H_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL - #define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 #define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 #define UVXH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63 @@ -3304,14 +2746,6 @@ union uvh_rh_gam_alias210_overlay_config_2_mmr_u { unsigned long rsvd_53_62:10; unsigned long enable:1; /* RW */ } s; - struct uv1h_rh_gam_alias210_overlay_config_2_mmr_s { - unsigned long rsvd_0_23:24; - unsigned long base:8; /* RW */ - unsigned long rsvd_32_47:16; - unsigned long m_alias:5; /* RW */ - unsigned long rsvd_53_62:10; - unsigned long enable:1; /* RW */ - } s1; struct uvxh_rh_gam_alias210_overlay_config_2_mmr_s { unsigned long rsvd_0_23:24; unsigned long base:8; /* RW */ @@ -3349,12 +2783,10 @@ union uvh_rh_gam_alias210_overlay_config_2_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */ /* ========================================================================= */ -#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL #define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL #define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL #define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x4800d0UL #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR ( \ - is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR : \ is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR : \ is_uv3_hub() ? 
UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR : \ /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR) @@ -3362,9 +2794,6 @@ union uvh_rh_gam_alias210_overlay_config_2_mmr_u { #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL -#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 -#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL - #define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24 #define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL @@ -3385,11 +2814,6 @@ union uvh_rh_gam_alias210_redirect_config_0_mmr_u { unsigned long dest_base:22; /* RW */ unsigned long rsvd_46_63:18; } s; - struct uv1h_rh_gam_alias210_redirect_config_0_mmr_s { - unsigned long rsvd_0_23:24; - unsigned long dest_base:22; /* RW */ - unsigned long rsvd_46_63:18; - } s1; struct uvxh_rh_gam_alias210_redirect_config_0_mmr_s { unsigned long rsvd_0_23:24; unsigned long dest_base:22; /* RW */ @@ -3415,12 +2839,10 @@ union uvh_rh_gam_alias210_redirect_config_0_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */ /* ========================================================================= */ -#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL #define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL #define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL #define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x4800e0UL #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR ( \ - is_uv1_hub() ? UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR : \ is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR : \ is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR : \ /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR) @@ -3428,9 +2850,6 @@ union uvh_rh_gam_alias210_redirect_config_0_mmr_u { #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL -#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 -#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL - #define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24 #define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL @@ -3451,11 +2870,6 @@ union uvh_rh_gam_alias210_redirect_config_1_mmr_u { unsigned long dest_base:22; /* RW */ unsigned long rsvd_46_63:18; } s; - struct uv1h_rh_gam_alias210_redirect_config_1_mmr_s { - unsigned long rsvd_0_23:24; - unsigned long dest_base:22; /* RW */ - unsigned long rsvd_46_63:18; - } s1; struct uvxh_rh_gam_alias210_redirect_config_1_mmr_s { unsigned long rsvd_0_23:24; unsigned long dest_base:22; /* RW */ @@ -3481,12 +2895,10 @@ union uvh_rh_gam_alias210_redirect_config_1_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */ /* ========================================================================= */ -#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL #define UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL #define UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL #define UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x4800f0UL #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR ( \ - is_uv1_hub() ? 
UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR : \ is_uv2_hub() ? UV2H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR : \ is_uv3_hub() ? UV3H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR : \ /*is_uv4_hub*/ UV4H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR) @@ -3494,9 +2906,6 @@ union uvh_rh_gam_alias210_redirect_config_1_mmr_u { #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL -#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 -#define UV1H_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL - #define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24 #define UVXH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL @@ -3517,11 +2926,6 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { unsigned long dest_base:22; /* RW */ unsigned long rsvd_46_63:18; } s; - struct uv1h_rh_gam_alias210_redirect_config_2_mmr_s { - unsigned long rsvd_0_23:24; - unsigned long dest_base:22; /* RW */ - unsigned long rsvd_46_63:18; - } s1; struct uvxh_rh_gam_alias210_redirect_config_2_mmr_s { unsigned long rsvd_0_23:24; unsigned long dest_base:22; /* RW */ @@ -3547,12 +2951,10 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_CONFIG_MMR */ /* ========================================================================= */ -#define UV1H_RH_GAM_CONFIG_MMR 0x1600000UL #define UV2H_RH_GAM_CONFIG_MMR 0x1600000UL #define UV3H_RH_GAM_CONFIG_MMR 0x1600000UL #define UV4H_RH_GAM_CONFIG_MMR 0x480000UL #define UVH_RH_GAM_CONFIG_MMR ( \ - is_uv1_hub() ? UV1H_RH_GAM_CONFIG_MMR : \ is_uv2_hub() ? UV2H_RH_GAM_CONFIG_MMR : \ is_uv3_hub() ? UV3H_RH_GAM_CONFIG_MMR : \ /*is_uv4_hub*/ UV4H_RH_GAM_CONFIG_MMR) @@ -3560,13 +2962,6 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u { #define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 #define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL -#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 -#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 -#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12 -#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL -#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL -#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL - #define UVXH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 #define UVXH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL @@ -3591,13 +2986,6 @@ union uvh_rh_gam_config_mmr_u { unsigned long n_skt:4; /* RW */ unsigned long rsvd_10_63:54; } s; - struct uv1h_rh_gam_config_mmr_s { - unsigned long m_skt:6; /* RW */ - unsigned long n_skt:4; /* RW */ - unsigned long rsvd_10_11:2; - unsigned long mmiol_cfg:1; /* RW */ - unsigned long rsvd_13_63:51; - } s1; struct uvxh_rh_gam_config_mmr_s { unsigned long rsvd_0_5:6; unsigned long n_skt:4; /* RW */ @@ -3623,12 +3011,10 @@ union uvh_rh_gam_config_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ -#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL #define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL #define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL #define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x480010UL #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR ( \ - is_uv1_hub() ? UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR : \ is_uv2_hub() ? UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR : \ is_uv3_hub() ? 
UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR : \ /*is_uv4_hub*/ UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR) @@ -3638,15 +3024,6 @@ union uvh_rh_gam_config_mmr_u { #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL -#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 -#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48 -#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 -#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL -#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL -#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL -#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL - #define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 #define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 #define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL @@ -3676,12 +3053,10 @@ union uvh_rh_gam_config_mmr_u { #define UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK ( \ - is_uv1_hub() ? UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK : \ is_uv2_hub() ? UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK : \ is_uv3_hub() ? UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK : \ /*is_uv4_hub*/ UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK) #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT ( \ - is_uv1_hub() ? UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT : \ is_uv2_hub() ? UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT : \ is_uv3_hub() ? UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT : \ /*is_uv4_hub*/ UV4H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT) @@ -3694,16 +3069,6 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { unsigned long rsvd_56_62:7; unsigned long enable:1; /* RW */ } s; - struct uv1h_rh_gam_gru_overlay_config_mmr_s { - unsigned long rsvd_0_27:28; - unsigned long base:18; /* RW */ - unsigned long rsvd_46_47:2; - unsigned long gr4:1; /* RW */ - unsigned long rsvd_49_51:3; - unsigned long n_gru:4; /* RW */ - unsigned long rsvd_56_62:7; - unsigned long enable:1; /* RW */ - } s1; struct uvxh_rh_gam_gru_overlay_config_mmr_s { unsigned long rsvd_0_45:46; unsigned long rsvd_46_51:6; @@ -3742,12 +3107,10 @@ union uvh_rh_gam_gru_overlay_config_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR */ /* ========================================================================= */ -#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR uv_undefined("UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR") #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR uv_undefined("UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR") #define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR 0x1603000UL #define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR 0x483000UL #define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR ( \ - is_uv1_hub() ? UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR : \ is_uv2_hub() ? UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR : \ is_uv3_hub() ? 
UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR : \ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR) @@ -3823,12 +3186,10 @@ union uvh_rh_gam_mmioh_overlay_config0_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR */ /* ========================================================================= */ -#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR uv_undefined("UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR") #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR uv_undefined("UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR") #define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x1603000UL #define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x484000UL #define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR ( \ - is_uv1_hub() ? UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR : \ is_uv2_hub() ? UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR : \ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR : \ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR) @@ -3898,27 +3259,15 @@ union uvh_rh_gam_mmioh_overlay_config1_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ -#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL #define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR uv_undefined("UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR") #define UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR uv_undefined("UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR") #define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR ( \ - is_uv1_hub() ? UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR : \ is_uv2_hub() ? UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR : \ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR : \ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR) -#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30 -#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 -#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 -#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL -#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL -#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL -#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL - - #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 27 #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 @@ -3931,14 +3280,6 @@ union uvh_rh_gam_mmioh_overlay_config1_mmr_u { union uvh_rh_gam_mmioh_overlay_config_mmr_u { unsigned long v; - struct uv1h_rh_gam_mmioh_overlay_config_mmr_s { - unsigned long rsvd_0_29:30; - unsigned long base:16; /* RW */ - unsigned long m_io:6; /* RW */ - unsigned long n_io:4; /* RW */ - unsigned long rsvd_56_62:7; - unsigned long enable:1; /* RW */ - } s1; struct uv2h_rh_gam_mmioh_overlay_config_mmr_s { unsigned long rsvd_0_26:27; unsigned long base:19; /* RW */ @@ -3952,22 +3293,18 @@ union uvh_rh_gam_mmioh_overlay_config_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR */ /* ========================================================================= */ -#define UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR uv_undefined("UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR") #define UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR uv_undefined("UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR") #define 
UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR 0x1603800UL #define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR 0x483800UL #define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR ( \ - is_uv1_hub() ? UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR : \ is_uv2_hub() ? UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR : \ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR : \ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR) -#define UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH uv_undefined("UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH") #define UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH uv_undefined("UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH") #define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH 128 #define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH 128 #define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH ( \ - is_uv1_hub() ? UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH : \ is_uv2_hub() ? UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH : \ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH : \ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH) @@ -4005,22 +3342,18 @@ union uvh_rh_gam_mmioh_redirect_config0_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR */ /* ========================================================================= */ -#define UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR uv_undefined("UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR") #define UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR uv_undefined("UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR") #define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR 0x1604800UL #define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR 0x484800UL #define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR ( \ - is_uv1_hub() ? UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR : \ is_uv2_hub() ? UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR : \ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR : \ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR) -#define UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH uv_undefined("UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH") #define UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH uv_undefined("UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH") #define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH 128 #define UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH 128 #define UVH_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH ( \ - is_uv1_hub() ? UV1H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH : \ is_uv2_hub() ? UV2H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH : \ is_uv3_hub() ? UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH : \ /*is_uv4_hub*/ UV4H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH) @@ -4058,12 +3391,10 @@ union uvh_rh_gam_mmioh_redirect_config1_mmr_u { /* ========================================================================= */ /* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ -#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL #define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL #define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL #define UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x480028UL #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR ( \ - is_uv1_hub() ? UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR : \ is_uv2_hub() ? UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR : \ is_uv3_hub() ? 
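/*
 * Illustrative aside, not part of this patch: several per-generation
 * names in this area (for example the UV2H MMIOH redirect configs)
 * expand to uv_undefined("..."), marking registers that do not exist on
 * that hub generation.  The helper below is only one plausible shape
 * for such a guard (fail loudly at run time); the real uv_undefined()
 * in the UV headers may behave differently.
 */
#include <stdio.h>
#include <stdlib.h>

static unsigned long ex_uv_undefined(const char *name)
{
        fprintf(stderr, "undefined UV MMR referenced: %s\n", name);
        abort();
}

#define UV9H_EXAMPLE_MMR ex_uv_undefined("UV9H_EXAMPLE_MMR")

int main(void)
{
        int use_missing_mmr = 0;        /* flip to 1 to see the failure path */

        if (use_missing_mmr)
                (void)UV9H_EXAMPLE_MMR;
        printf("guard compiled; nothing referenced the missing MMR\n");
        return 0;
}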
UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR : \ /*is_uv4_hub*/ UV4H_RH_GAM_MMR_OVERLAY_CONFIG_MMR) @@ -4073,13 +3404,6 @@ union uvh_rh_gam_mmioh_redirect_config1_mmr_u { #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL -#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 -#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46 -#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL -#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL -#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL - #define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 #define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 #define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL @@ -4109,13 +3433,6 @@ union uvh_rh_gam_mmr_overlay_config_mmr_u { unsigned long rsvd_46_62:17; unsigned long enable:1; /* RW */ } s; - struct uv1h_rh_gam_mmr_overlay_config_mmr_s { - unsigned long rsvd_0_25:26; - unsigned long base:20; /* RW */ - unsigned long dual_hub:1; /* RW */ - unsigned long rsvd_47_62:16; - unsigned long enable:1; /* RW */ - } s1; struct uvxh_rh_gam_mmr_overlay_config_mmr_s { unsigned long rsvd_0_25:26; unsigned long base:20; /* RW */ @@ -4145,12 +3462,10 @@ union uvh_rh_gam_mmr_overlay_config_mmr_u { /* ========================================================================= */ /* UVH_RTC */ /* ========================================================================= */ -#define UV1H_RTC 0x340000UL #define UV2H_RTC 0x340000UL #define UV3H_RTC 0x340000UL #define UV4H_RTC 0xe0000UL #define UVH_RTC ( \ - is_uv1_hub() ? UV1H_RTC : \ is_uv2_hub() ? UV2H_RTC : \ is_uv3_hub() ? UV3H_RTC : \ /*is_uv4_hub*/ UV4H_RTC) @@ -4209,22 +3524,18 @@ union uvh_rtc1_int_config_u { /* ========================================================================= */ /* UVH_SCRATCH5 */ /* ========================================================================= */ -#define UV1H_SCRATCH5 0x2d0200UL #define UV2H_SCRATCH5 0x2d0200UL #define UV3H_SCRATCH5 0x2d0200UL #define UV4H_SCRATCH5 0xb0200UL #define UVH_SCRATCH5 ( \ - is_uv1_hub() ? UV1H_SCRATCH5 : \ is_uv2_hub() ? UV2H_SCRATCH5 : \ is_uv3_hub() ? UV3H_SCRATCH5 : \ /*is_uv4_hub*/ UV4H_SCRATCH5) -#define UV1H_SCRATCH5_32 0x778 #define UV2H_SCRATCH5_32 0x778 #define UV3H_SCRATCH5_32 0x778 #define UV4H_SCRATCH5_32 0x798 #define UVH_SCRATCH5_32 ( \ - is_uv1_hub() ? UV1H_SCRATCH5_32 : \ is_uv2_hub() ? UV2H_SCRATCH5_32 : \ is_uv3_hub() ? UV3H_SCRATCH5_32 : \ /*is_uv4_hub*/ UV4H_SCRATCH5_32) @@ -4243,22 +3554,18 @@ union uvh_scratch5_u { /* ========================================================================= */ /* UVH_SCRATCH5_ALIAS */ /* ========================================================================= */ -#define UV1H_SCRATCH5_ALIAS 0x2d0208UL #define UV2H_SCRATCH5_ALIAS 0x2d0208UL #define UV3H_SCRATCH5_ALIAS 0x2d0208UL #define UV4H_SCRATCH5_ALIAS 0xb0208UL #define UVH_SCRATCH5_ALIAS ( \ - is_uv1_hub() ? UV1H_SCRATCH5_ALIAS : \ is_uv2_hub() ? UV2H_SCRATCH5_ALIAS : \ is_uv3_hub() ? UV3H_SCRATCH5_ALIAS : \ /*is_uv4_hub*/ UV4H_SCRATCH5_ALIAS) -#define UV1H_SCRATCH5_ALIAS_32 0x780 #define UV2H_SCRATCH5_ALIAS_32 0x780 #define UV3H_SCRATCH5_ALIAS_32 0x780 #define UV4H_SCRATCH5_ALIAS_32 0x7a0 #define UVH_SCRATCH5_ALIAS_32 ( \ - is_uv1_hub() ? UV1H_SCRATCH5_ALIAS_32 : \ is_uv2_hub() ? UV2H_SCRATCH5_ALIAS_32 : \ is_uv3_hub() ? 
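The uv_mmrs.h hunks above all follow the same shape: the UV1 arm is dropped from each per-hub address selector, leaving a UV2/UV3/UV4 ternary chain. A minimal sketch of the surviving pattern, using a made-up EXAMPLE register (the addresses below are placeholders, not real MMR offsets):

/* Hypothetical selector mirroring the UVH_* macros after UV1 removal. */
#define UV2H_EXAMPLE_MMR	0x1600030UL	/* placeholder addresses */
#define UV3H_EXAMPLE_MMR	0x1603000UL
#define UV4H_EXAMPLE_MMR	0x0484000UL

#define UVH_EXAMPLE_MMR (				\
	is_uv2_hub() ? UV2H_EXAMPLE_MMR :		\
	is_uv3_hub() ? UV3H_EXAMPLE_MMR :		\
	/*is_uv4_hub*/ UV4H_EXAMPLE_MMR)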
UV3H_SCRATCH5_ALIAS_32 : \ /*is_uv4_hub*/ UV4H_SCRATCH5_ALIAS_32) @@ -4267,12 +3574,10 @@ union uvh_scratch5_u { /* ========================================================================= */ /* UVH_SCRATCH5_ALIAS_2 */ /* ========================================================================= */ -#define UV1H_SCRATCH5_ALIAS_2 0x2d0210UL #define UV2H_SCRATCH5_ALIAS_2 0x2d0210UL #define UV3H_SCRATCH5_ALIAS_2 0x2d0210UL #define UV4H_SCRATCH5_ALIAS_2 0xb0210UL #define UVH_SCRATCH5_ALIAS_2 ( \ - is_uv1_hub() ? UV1H_SCRATCH5_ALIAS_2 : \ is_uv2_hub() ? UV2H_SCRATCH5_ALIAS_2 : \ is_uv3_hub() ? UV3H_SCRATCH5_ALIAS_2 : \ /*is_uv4_hub*/ UV4H_SCRATCH5_ALIAS_2) @@ -4719,23 +4024,6 @@ union uvxh_lb_bau_sb_activation_status_2_u { }; /* ========================================================================= */ -/* UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK */ -/* ========================================================================= */ -#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL -#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x9f0 - -#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0 -#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL - -union uv1h_lb_target_physical_apic_id_mask_u { - unsigned long v; - struct uv1h_lb_target_physical_apic_id_mask_s { - unsigned long bit_enables:32; /* RW */ - unsigned long rsvd_32_63:32; - } s1; -}; - -/* ========================================================================= */ /* UV3H_GR0_GAM_GR_CONFIG */ /* ========================================================================= */ #define UV3H_GR0_GAM_GR_CONFIG 0xc00028UL diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index ba4c1b15908b..454b20815f35 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -82,7 +82,7 @@ struct xen_dm_op_buf; * - clobber the rest * * The result certainly isn't pretty, and it really shows up cpp's - * weakness as as macro language. Sorry. (But let's just give thanks + * weakness as a macro language. Sorry. (But let's just give thanks * there aren't more than 5 arguments...) */ diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 8669c6bdbb84..600a141c8805 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -255,7 +255,7 @@ struct boot_params { * currently supportd through this PV boot path. * @X86_SUBARCH_INTEL_MID: Used for Intel MID (Mobile Internet Device) platform * systems which do not have the PCI legacy interfaces. - * @X86_SUBARCH_CE4100: Used for Intel CE media processor (CE4100) SoC for + * @X86_SUBARCH_CE4100: Used for Intel CE media processor (CE4100) SoC * for settop boxes and media devices, the use of a subarch for CE4100 * is more of a hack... 
*/ diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h index 8b2effe6efb8..5fdfcb47000f 100644 --- a/arch/x86/include/uapi/asm/hwcap2.h +++ b/arch/x86/include/uapi/asm/hwcap2.h @@ -5,4 +5,7 @@ /* MONITOR/MWAIT enabled in Ring 3 */ #define HWCAP2_RING3MWAIT (1 << 0) +/* Kernel allows FSGSBASE instructions available in Ring 3 */ +#define HWCAP2_FSGSBASE BIT(1) + #endif diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 8fd39ff74a49..d1175533d125 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -3,9 +3,11 @@ #include <linux/module.h> #include <linux/sched.h> +#include <linux/perf_event.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/stringify.h> +#include <linux/highmem.h> #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/memory.h> @@ -15,6 +17,7 @@ #include <linux/kprobes.h> #include <linux/mmu_context.h> #include <linux/bsearch.h> +#include <linux/sync_core.h> #include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/sections.h> @@ -53,7 +56,7 @@ __setup("noreplace-smp", setup_noreplace_smp); #define DPRINTK(fmt, args...) \ do { \ if (debug_alternative) \ - printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \ + printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \ } while (0) #define DUMP_BYTES(buf, len, fmt, args...) \ @@ -64,7 +67,7 @@ do { \ if (!(len)) \ break; \ \ - printk(KERN_DEBUG fmt, ##args); \ + printk(KERN_DEBUG pr_fmt(fmt), ##args); \ for (j = 0; j < (len) - 1; j++) \ printk(KERN_CONT "%02hhx ", buf[j]); \ printk(KERN_CONT "%02hhx\n", buf[j]); \ @@ -1001,6 +1004,7 @@ struct text_poke_loc { s32 rel32; u8 opcode; const u8 text[POKE_MAX_OPCODE_SIZE]; + u8 old; }; struct bp_patching_desc { @@ -1044,7 +1048,7 @@ static __always_inline int patch_cmp(const void *key, const void *elt) return 0; } -int noinstr poke_int3_handler(struct pt_regs *regs) +noinstr int poke_int3_handler(struct pt_regs *regs) { struct bp_patching_desc *desc; struct text_poke_loc *tp; @@ -1168,8 +1172,10 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries /* * First step: add a int3 trap to the address that will be patched. */ - for (i = 0; i < nr_entries; i++) + for (i = 0; i < nr_entries; i++) { + tp[i].old = *(u8 *)text_poke_addr(&tp[i]); text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); + } text_poke_sync(); @@ -1177,14 +1183,45 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * Second step: update all but the first byte of the patched range. */ for (do_sync = 0, i = 0; i < nr_entries; i++) { + u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, }; int len = text_opcode_size(tp[i].opcode); if (len - INT3_INSN_SIZE > 0) { + memcpy(old + INT3_INSN_SIZE, + text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + len - INT3_INSN_SIZE); text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, (const char *)tp[i].text + INT3_INSN_SIZE, len - INT3_INSN_SIZE); do_sync++; } + + /* + * Emit a perf event to record the text poke, primarily to + * support Intel PT decoding which must walk the executable code + * to reconstruct the trace. The flow up to here is: + * - write INT3 byte + * - IPI-SYNC + * - write instruction tail + * At this point the actual control flow will be through the + * INT3 and handler and not hit the old or new instruction. + * Intel PT outputs FUP/TIP packets for the INT3, so the flow + * can still be decoded. 
Subsequently: + * - emit RECORD_TEXT_POKE with the new instruction + * - IPI-SYNC + * - write first byte + * - IPI-SYNC + * So before the text poke event timestamp, the decoder will see + * either the old instruction flow or FUP/TIP of INT3. After the + * text poke event timestamp, the decoder will see either the + * new instruction flow or FUP/TIP of INT3. Thus decoders can + * use the timestamp as the point at which to modify the + * executable code. + * The old instruction is recorded so that the event can be + * processed forwards or backwards. + */ + perf_event_text_poke(text_poke_addr(&tp[i]), old, len, + tp[i].text, len); } if (do_sync) { diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 17cb5b933dcf..e89031e9c847 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -6,7 +6,7 @@ * This allows to use PCI devices that only support 32bit addresses on systems * with more than 4GB. * - * See Documentation/DMA-API-HOWTO.txt for the interface specification. + * See Documentation/core-api/dma-api-howto.rst for the interface specification. * * Copyright 2002 Andi Kleen, SuSE Labs. */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e0e2f020ec02..5f943b938167 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -40,13 +40,13 @@ #include <asm/irq_remapping.h> #include <asm/perf_event.h> #include <asm/x86_init.h> -#include <asm/pgalloc.h> #include <linux/atomic.h> #include <asm/mpspec.h> #include <asm/i8259.h> #include <asm/proto.h> #include <asm/traps.h> #include <asm/apic.h> +#include <asm/acpi.h> #include <asm/io_apic.h> #include <asm/desc.h> #include <asm/hpet.h> diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 98c9bb75d185..780c702969b7 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -10,6 +10,7 @@ * like self-ipi, etc... 
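The comment added to text_poke_bp_batch() above describes a strict ordering between the byte writes, the IPI syncs, and the RECORD_TEXT_POKE event. A simplified single-site sketch of that ordering follows; the real function batches entries, reads the old bytes through text_poke_addr(), and handles the INT3-only case, all of which is omitted here:

/* Illustrative only: one patch site, ordering as described above. */
static void poke_one_site_sketch(u8 *addr, const u8 *new_insn, int len)
{
	u8 int3 = INT3_INSN_OPCODE;
	u8 old[POKE_MAX_OPCODE_SIZE];

	memcpy(old, addr, len);				/* keep old bytes for the event */

	text_poke(addr, &int3, INT3_INSN_SIZE);		/* write INT3 byte */
	text_poke_sync();				/* IPI-SYNC */

	text_poke(addr + INT3_INSN_SIZE,		/* write instruction tail */
		  new_insn + INT3_INSN_SIZE, len - INT3_INSN_SIZE);

	perf_event_text_poke(addr, old, len,		/* emit RECORD_TEXT_POKE */
			     new_insn, len);
	text_poke_sync();				/* IPI-SYNC */

	text_poke(addr, new_insn, INT3_INSN_SIZE);	/* write first byte */
	text_poke_sync();				/* IPI-SYNC */
}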
*/ #include <linux/cpumask.h> +#include <linux/thread_info.h> #include <asm/apic.h> diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 38b5b51d42f6..98d015a4405a 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -9,6 +9,7 @@ #include <linux/smp.h> #include <asm/apic.h> +#include <asm/io_apic.h> #include "local.h" diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index d1fc62a67320..34a992e275ef 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -9,6 +9,7 @@ * Bits copied from original nmi.c file * */ +#include <linux/thread_info.h> #include <asm/apic.h> #include <asm/nmi.h> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 81ffcfbfaef2..21325a4a78b9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2335,8 +2335,13 @@ static int mp_irqdomain_create(int ioapic) static void ioapic_destroy_irqdomain(int idx) { + struct ioapic_domain_cfg *cfg = &ioapics[idx].irqdomain_cfg; + struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode; + if (ioapics[idx].irqdomain) { irq_domain_remove(ioapics[idx].irqdomain); + if (!cfg->dev) + irq_domain_free_fwnode(fn); ioapics[idx].irqdomain = NULL; } } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 6ca0f91372fd..387154e39e08 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -2,6 +2,7 @@ #include <linux/cpumask.h> #include <linux/smp.h> +#include <asm/io_apic.h> #include "local.h" diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h index 04797f05ce94..a997d849509a 100644 --- a/arch/x86/kernel/apic/local.h +++ b/arch/x86/kernel/apic/local.h @@ -10,6 +10,7 @@ #include <linux/jump_label.h> +#include <asm/irq_vectors.h> #include <asm/apic.h> /* APIC flat 64 */ diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 67b33d67002f..7bda71def557 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -10,6 +10,7 @@ #include <linux/errno.h> #include <linux/smp.h> +#include <asm/io_apic.h> #include <asm/apic.h> #include <asm/acpi.h> diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 29f0e0984557..bd3835d6b535 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -8,6 +8,7 @@ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. */ +#include <linux/thread_info.h> #include <asm/apic.h> #include "local.h" diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 7649da2478d8..dae32d948bf2 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -560,6 +560,10 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, * as that can corrupt the affinity move state. */ irqd_set_handle_enforce_irqctx(irqd); + + /* Don't invoke affinity setter on deactivated interrupts */ + irqd_set_affinity_on_activate(irqd); + /* * Legacy vectors are already assigned when the IOAPIC * takes them over. They stay on the same vector. 
This is diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 69e70ed0f5e6..0b6eea3f54e6 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -24,8 +24,6 @@ #include <asm/uv/uv.h> #include <asm/apic.h> -static DEFINE_PER_CPU(int, x2apic_extra_bits); - static enum uv_system_type uv_system_type; static int uv_hubbed_system; static int uv_hubless_system; @@ -40,7 +38,7 @@ static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; static struct { unsigned int apicid_shift; unsigned int apicid_mask; - unsigned int socketid_shift; /* aka pnode_shift for UV1/2/3 */ + unsigned int socketid_shift; /* aka pnode_shift for UV2/3 */ unsigned int pnode_mask; unsigned int gpa_shift; unsigned int gnode_shift; @@ -48,8 +46,6 @@ static struct { static int uv_min_hub_revision_id; -unsigned int uv_apicid_hibits; - static struct apic apic_x2apic_uv_x; static struct uv_hub_info_s uv_hub_info_node0; @@ -139,12 +135,8 @@ static void __init uv_tsc_check_sync(void) /* Accommodate different UV arch BIOSes */ mmr = uv_early_read_mmr(UVH_TSC_SYNC_MMR); mmr_shift = - is_uv1_hub() ? 0 : is_uv2_hub() ? UVH_TSC_SYNC_SHIFT_UV2K : UVH_TSC_SYNC_SHIFT; - if (mmr_shift) - sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK; - else - sync_state = 0; + sync_state = (mmr >> mmr_shift) & UVH_TSC_SYNC_MASK; switch (sync_state) { case UVH_TSC_SYNC_VALID: @@ -223,21 +215,6 @@ static void __init early_get_apic_socketid_shift(void) pr_info("UV: socketid_shift:%d pnode_mask:0x%x\n", uv_cpuid.socketid_shift, uv_cpuid.pnode_mask); } -/* - * Add an extra bit as dictated by bios to the destination apicid of - * interrupts potentially passing through the UV HUB. This prevents - * a deadlock between interrupts and IO port operations. - */ -static void __init uv_set_apicid_hibit(void) -{ - union uv1h_lb_target_physical_apic_id_mask_u apicid_mask; - - if (is_uv1_hub()) { - apicid_mask.v = uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK); - uv_apicid_hibits = apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK; - } -} - static void __init uv_stringify(int len, char *to, char *from) { /* Relies on 'to' being NULL chars so result will be NULL terminated */ @@ -280,36 +257,25 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) /* * Determine UV arch type. - * SGI: UV100/1000 * SGI2: UV2000/3000 * SGI3: UV300 (truncated to 4 chars because of different varieties) * SGI4: UV400 (truncated to 4 chars because of different varieties) */ - uv_hub_info->hub_revision = - !strncmp(oem_id, "SGI4", 4) ? UV4_HUB_REVISION_BASE : - !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE : - !strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE : - !strcmp(oem_id, "SGI") ? 
UV1_HUB_REVISION_BASE : 0; - - if (uv_hub_info->hub_revision == 0) - goto badbios; - - switch (uv_hub_info->hub_revision) { - case UV4_HUB_REVISION_BASE: + if (!strncmp(oem_id, "SGI4", 4)) { + uv_hub_info->hub_revision = UV4_HUB_REVISION_BASE; uv_hubbed_system = 0x11; - break; - case UV3_HUB_REVISION_BASE: + } else if (!strncmp(oem_id, "SGI3", 4)) { + uv_hub_info->hub_revision = UV3_HUB_REVISION_BASE; uv_hubbed_system = 0x9; - break; - case UV2_HUB_REVISION_BASE: + } else if (!strcmp(oem_id, "SGI2")) { + uv_hub_info->hub_revision = UV2_HUB_REVISION_BASE; uv_hubbed_system = 0x5; - break; - case UV1_HUB_REVISION_BASE: - uv_hubbed_system = 0x3; - break; + } else { + uv_hub_info->hub_revision = 0; + goto badbios; } pnodeid = early_get_pnodeid(); @@ -323,14 +289,6 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) uv_system_type = UV_X2APIC; uv_apic = 0; - } else if (!strcmp(oem_table_id, "UVH")) { - /* Only UV1 systems: */ - uv_system_type = UV_NON_UNIQUE_APIC; - x86_platform.legacy.warm_reset = 0; - __this_cpu_write(x2apic_extra_bits, pnodeid << uvh_apicid.s.pnode_shift); - uv_set_apicid_hibit(); - uv_apic = 1; - } else if (!strcmp(oem_table_id, "UVL")) { /* Only used for very small systems: */ uv_system_type = UV_LEGACY_APIC; @@ -347,7 +305,7 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) badbios: pr_err("UV: OEM_ID:%s OEM_TABLE_ID:%s\n", oem_id, oem_table_id); - pr_err("Current BIOS not supported, update kernel and/or BIOS\n"); + pr_err("Current UV Type or BIOS not supported\n"); BUG(); } @@ -545,7 +503,6 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) int pnode; pnode = uv_apicid_to_pnode(phys_apicid); - phys_apicid |= uv_apicid_hibits; val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | @@ -576,7 +533,7 @@ static void uv_send_IPI_one(int cpu, int vector) dmode = dest_Fixed; val = (1UL << UVH_IPI_INT_SEND_SHFT) | - ((apicid | uv_apicid_hibits) << UVH_IPI_INT_APIC_ID_SHFT) | + (apicid << UVH_IPI_INT_APIC_ID_SHFT) | (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | (vector << UVH_IPI_INT_VECTOR_SHFT); @@ -634,22 +591,16 @@ static void uv_init_apic_ldr(void) static u32 apic_uv_calc_apicid(unsigned int cpu) { - return apic_default_calc_apicid(cpu) | uv_apicid_hibits; + return apic_default_calc_apicid(cpu); } -static unsigned int x2apic_get_apic_id(unsigned long x) +static unsigned int x2apic_get_apic_id(unsigned long id) { - unsigned int id; - - WARN_ON(preemptible() && num_online_cpus() > 1); - id = x | __this_cpu_read(x2apic_extra_bits); - return id; } static u32 set_apic_id(unsigned int id) { - /* CHECKME: Do we need to mask out the xapic extra bits? 
*/ return id; } @@ -721,11 +672,6 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; -static void set_x2apic_extra_bits(int pnode) -{ - __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift); -} - #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_LENGTH 3 #define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT @@ -920,15 +866,7 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode) return; } - if (is_uv1_hub()) { - mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; - shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; - mmioh.v = uv_read_local_mmr(mmr); - enable = !!mmioh.s1.enable; - base = mmioh.s1.base; - m_io = mmioh.s1.m_io; - n_io = mmioh.s1.n_io; - } else if (is_uv2_hub()) { + if (is_uv2_hub()) { mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; mmioh.v = uv_read_local_mmr(mmr); @@ -936,16 +874,15 @@ static __init void map_mmioh_high(int min_pnode, int max_pnode) base = mmioh.s2.base; m_io = mmioh.s2.m_io; n_io = mmioh.s2.n_io; - } else { - return; - } - if (enable) { - max_pnode &= (1 << n_io) - 1; - pr_info("UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", base, shift, m_io, n_io, max_pnode); - map_high("MMIOH", base, shift, m_io, max_pnode, map_uc); - } else { - pr_info("UV: MMIOH disabled\n"); + if (enable) { + max_pnode &= (1 << n_io) - 1; + pr_info("UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", + base, shift, m_io, n_io, max_pnode); + map_high("MMIOH", base, shift, m_io, max_pnode, map_uc); + } else { + pr_info("UV: MMIOH disabled\n"); + } } } @@ -1081,9 +1018,6 @@ void uv_cpu_init(void) return; uv_hub_info->nr_online_cpus++; - - if (get_uv_system_type() == UV_NON_UNIQUE_APIC) - set_x2apic_extra_bits(uv_hub_info->pnode); } struct mn { @@ -1114,9 +1048,6 @@ static void get_mn(struct mn *mnp) } else if (is_uv2_hub()) { mnp->m_val = m_n_config.s2.m_skt; mnp->n_lshift = mnp->m_val == 40 ? 40 : 39; - } else if (is_uv1_hub()) { - mnp->m_val = m_n_config.s1.m_skt; - mnp->n_lshift = mnp->m_val; } mnp->m_shift = mnp->m_val ? 64 - mnp->m_val : 0; } @@ -1318,7 +1249,7 @@ static void __init build_socket_tables(void) size_t bytes; if (!gre) { - if (is_uv1_hub() || is_uv2_hub() || is_uv3_hub()) { + if (is_uv2_hub() || is_uv3_hub()) { pr_info("UV: No UVsystab socket table, ignoring\n"); return; } @@ -1500,8 +1431,7 @@ static void __init uv_system_init_hub(void) unsigned short min_pnode = 9999, max_pnode = 0; char *hub = is_uv4_hub() ? "UV400" : is_uv3_hub() ? "UV300" : - is_uv2_hub() ? "UV2000/3000" : - is_uv1_hub() ? "UV100/1000" : NULL; + is_uv2_hub() ? 
"UV2000/3000" : NULL; if (!hub) { pr_err("UV: Unknown/unsupported UV hub\n"); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index dba6a83bc349..93792b457b81 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -17,8 +17,7 @@ KCOV_INSTRUMENT_perf_event.o := n KCSAN_SANITIZE_common.o := n # Make sure load_percpu_segment has no stackprotector -nostackp := $(call cc-option, -fno-stack-protector) -CFLAGS_common.o := $(nostackp) +CFLAGS_common.o := -fno-stack-protector obj-y := cacheinfo.o scattered.o topology.o obj-y += common.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index d4806eac9325..dcc3d943c68f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -15,6 +15,7 @@ #include <asm/cpu.h> #include <asm/spec-ctrl.h> #include <asm/smp.h> +#include <asm/numa.h> #include <asm/pci-direct.h> #include <asm/delay.h> #include <asm/debugreg.h> diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0b71970d2d3d..f0b743a2fe9c 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -543,14 +543,12 @@ static void __init spectre_v1_select_mitigation(void) * If FSGSBASE is enabled, the user can put a kernel address in * GS, in which case SMAP provides no protection. * - * [ NOTE: Don't check for X86_FEATURE_FSGSBASE until the - * FSGSBASE enablement patches have been merged. ] - * * If FSGSBASE is disabled, the user can only put a user space * address in GS. That makes an attack harder, but still * possible if there's no SMAP protection. */ - if (!smap_works_speculatively()) { + if (boot_cpu_has(X86_FEATURE_FSGSBASE) || + !smap_works_speculatively()) { /* * Mitigation can be provided from SWAPGS itself or * PTI as the CR3 write in the Meltdown mitigation @@ -763,10 +761,12 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) } /* - * If enhanced IBRS is enabled or SMT impossible, STIBP is not + * If no STIBP, enhanced IBRS is enabled or SMT impossible, STIBP is not * required. */ - if (!smt_possible || spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) + if (!boot_cpu_has(X86_FEATURE_STIBP) || + !smt_possible || + spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) return; /* @@ -778,12 +778,6 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd) boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) mode = SPECTRE_V2_USER_STRICT_PREFERRED; - /* - * If STIBP is not available, clear the STIBP mode. - */ - if (!boot_cpu_has(X86_FEATURE_STIBP)) - mode = SPECTRE_V2_USER_NONE; - spectre_v2_user_stibp = mode; set_mode: @@ -1270,7 +1264,6 @@ static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) * Indirect branch speculation is always disabled in strict * mode. It can neither be enabled if it was force-disabled * by a previous prctl call. - */ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 95c090a45b4b..c5d6f17d9b9d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -45,6 +45,7 @@ #include <asm/mtrr.h> #include <asm/hwcap2.h> #include <linux/numa.h> +#include <asm/numa.h> #include <asm/asm.h> #include <asm/bugs.h> #include <asm/cpu.h> @@ -441,6 +442,22 @@ static void __init setup_cr_pinning(void) static_key_enable(&cr_pinning.key); } +static __init int x86_nofsgsbase_setup(char *arg) +{ + /* Require an exact match without trailing characters. 
*/ + if (strlen(arg)) + return 0; + + /* Do not emit a message if the feature is not present. */ + if (!boot_cpu_has(X86_FEATURE_FSGSBASE)) + return 1; + + setup_clear_cpu_cap(X86_FEATURE_FSGSBASE); + pr_info("FSGSBASE disabled via kernel command line\n"); + return 1; +} +__setup("nofsgsbase", x86_nofsgsbase_setup); + /* * Protection Keys are not available in 32-bit mode. */ @@ -1495,6 +1512,12 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_smap(c); setup_umip(c); + /* Enable FSGSBASE instructions if available. */ + if (cpu_has(c, X86_FEATURE_FSGSBASE)) { + cr4_set_bits(X86_CR4_FSGSBASE); + elf_hwcap2 |= HWCAP2_FSGSBASE; + } + /* * The vendor-specific functions might have changed features. * Now we do "generic changes." diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 4e28c1fc8749..ac6c30e5801d 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -10,6 +10,7 @@ #include <asm/cpu.h> #include <asm/smp.h> +#include <asm/numa.h> #include <asm/cacheinfo.h> #include <asm/spec-ctrl.h> #include <asm/delay.h> diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 0ab48f1cdf84..59a1e3ce3f14 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -23,6 +23,7 @@ #include <asm/cmdline.h> #include <asm/traps.h> #include <asm/resctrl.h> +#include <asm/numa.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -1156,6 +1157,8 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, 1), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, 1), X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1), {} }; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 14e4b4d17ee5..f43a78bde670 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -42,6 +42,7 @@ #include <linux/export.h> #include <linux/jump_label.h> #include <linux/set_memory.h> +#include <linux/sync_core.h> #include <linux/task_work.h> #include <linux/hardirq.h> @@ -244,6 +245,8 @@ static void __print_mce(struct mce *m) pr_cont("ADDR %llx ", m->addr); if (m->misc) pr_cont("MISC %llx ", m->misc); + if (m->ppin) + pr_cont("PPIN %llx ", m->ppin); if (mce_flags.smca) { if (m->synd) @@ -1212,7 +1215,7 @@ static void kill_me_maybe(struct callback_head *cb) * backing the user stack, tracing that reads the user stack will cause * potentially infinite recursion. 
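Together, the hwcap2.h and common.c hunks above set CR4.FSGSBASE and advertise the feature to user space through AT_HWCAP2. A user-space probe for the bit might look like the following sketch; the fallback define matches the value used in the uapi header above:

#include <stdio.h>
#include <sys/auxv.h>

#ifndef HWCAP2_FSGSBASE
#define HWCAP2_FSGSBASE	(1 << 1)	/* same bit as the uapi definition above */
#endif

int main(void)
{
	unsigned long hwcap2 = getauxval(AT_HWCAP2);

	printf("FSGSBASE: %s\n",
	       (hwcap2 & HWCAP2_FSGSBASE) ? "usable from ring 3" : "not enabled");
	return 0;
}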
*/ -void noinstr do_machine_check(struct pt_regs *regs) +noinstr void do_machine_check(struct pt_regs *regs) { DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS); @@ -1927,11 +1930,11 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) static __always_inline void exc_machine_check_user(struct pt_regs *regs) { - idtentry_enter_user(regs); + irqentry_enter_from_user_mode(regs); instrumentation_begin(); machine_check_vector(regs); instrumentation_end(); - idtentry_exit_user(regs); + irqentry_exit_to_user_mode(regs); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index 43c466020ed5..03e51053592a 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -345,7 +345,7 @@ static __init int dev_mcelog_init_device(void) int err; mce_log_len = max(MCE_LOG_MIN_LEN, num_online_cpus()); - mcelog = kzalloc(sizeof(*mcelog) + mce_log_len * sizeof(struct mce), GFP_KERNEL); + mcelog = kzalloc(struct_size(mcelog, entry, mce_log_len), GFP_KERNEL); if (!mcelog) return -ENOMEM; diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 0593b192eb8f..7843ab3fde09 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -511,7 +511,7 @@ static void do_inject(void) */ if (inj_type == DFR_INT_INJ) { i_mce.status |= MCI_STATUS_DEFERRED; - i_mce.status |= (i_mce.status & ~MCI_STATUS_UC); + i_mce.status &= ~MCI_STATUS_UC; } /* diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index baec68b7e010..ec6f0415bc6d 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -145,7 +145,6 @@ extern struct builtin_fw __end_builtin_fw[]; bool get_builtin_firmware(struct cpio_data *cd, const char *name) { -#ifdef CONFIG_FW_LOADER struct builtin_fw *b_fw; for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) { @@ -155,7 +154,6 @@ bool get_builtin_firmware(struct cpio_data *cd, const char *name) return true; } } -#endif return false; } diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 8d85e00bb40a..a0e8fc7d85f1 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -20,6 +20,7 @@ #include <asm/irqdomain.h> #include <asm/hpet.h> #include <asm/apic.h> +#include <asm/io_apic.h> #include <asm/pci_x86.h> #include <asm/setup.h> #include <asm/i8259.h> diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 7401cc12c3cc..48ce44576947 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -133,15 +133,15 @@ void show_ip(struct pt_regs *regs, const char *loglvl) show_opcodes(regs, loglvl); } -void show_iret_regs(struct pt_regs *regs) +void show_iret_regs(struct pt_regs *regs, const char *log_lvl) { - show_ip(regs, KERN_DEFAULT); - printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, + show_ip(regs, log_lvl); + printk("%sRSP: %04x:%016lx EFLAGS: %08lx", log_lvl, (int)regs->ss, regs->sp, regs->flags); } static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, - bool partial) + bool partial, const char *log_lvl) { /* * These on_stack() checks aren't strictly necessary: the unwind code @@ -153,7 +153,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * they can be printed in the right context. 
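The dev-mcelog.c hunk above replaces the open-coded sizeof(*mcelog) + n * sizeof(struct mce) arithmetic with struct_size(), which computes the size of a structure with a trailing flexible array and saturates on overflow. A generic sketch of the idiom; flex_example and its members are made up for illustration:

#include <linux/overflow.h>
#include <linux/slab.h>

struct flex_example {
	unsigned int	nr;
	unsigned long	entry[];	/* flexible array member */
};

static struct flex_example *alloc_flex_example(unsigned int n)
{
	struct flex_example *p;

	/* sizeof(*p) + n * sizeof(p->entry[0]), with overflow checking */
	p = kzalloc(struct_size(p, entry, n), GFP_KERNEL);
	if (p)
		p->nr = n;
	return p;
}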
*/ if (!partial && on_stack(info, regs, sizeof(*regs))) { - __show_regs(regs, SHOW_REGS_SHORT); + __show_regs(regs, SHOW_REGS_SHORT, log_lvl); } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, IRET_FRAME_SIZE)) { @@ -162,7 +162,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, * full pt_regs might not have been saved yet. In that case * just print the iret frame. */ - show_iret_regs(regs); + show_iret_regs(regs, log_lvl); } } @@ -217,7 +217,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%s <%s>\n", log_lvl, stack_name); if (regs) - show_regs_if_on_stack(&stack_info, regs, partial); + show_regs_if_on_stack(&stack_info, regs, partial, log_lvl); /* * Scan the stack, printing any text addresses we find. At the @@ -278,7 +278,7 @@ next: /* if the frame has entry regs, print them */ regs = unwind_get_entry_regs(&state, &partial); if (regs) - show_regs_if_on_stack(&stack_info, regs, partial); + show_regs_if_on_stack(&stack_info, regs, partial, log_lvl); } if (stack_name) @@ -352,7 +352,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) oops_exit(); /* Executive summary in case the oops scrolled away */ - __show_regs(&exec_summary_regs, SHOW_REGS_ALL); + __show_regs(&exec_summary_regs, SHOW_REGS_ALL, KERN_DEFAULT); if (!signr) return; @@ -444,9 +444,12 @@ void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr) void show_regs(struct pt_regs *regs) { + enum show_regs_mode print_kernel_regs; + show_regs_print_info(KERN_DEFAULT); - __show_regs(regs, user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL); + print_kernel_regs = user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL; + __show_regs(regs, print_kernel_regs, KERN_DEFAULT); /* * When in-kernel, we also print out the stack at the time of the fault.. diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 2f9ec14be3b1..a4b5af03dcc1 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -550,6 +550,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = { INTEL_ICL_11_IDS(&gen11_early_ops), INTEL_EHL_IDS(&gen11_early_ops), INTEL_TGL_12_IDS(&gen11_early_ops), + INTEL_RKL_IDS(&gen11_early_ops), }; struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 15247b96c6ea..eb86a2b831b1 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -82,6 +82,45 @@ bool irq_fpu_usable(void) } EXPORT_SYMBOL(irq_fpu_usable); +/* + * These must be called with preempt disabled. Returns + * 'true' if the FPU state is still intact and we can + * keep registers active. + * + * The legacy FNSAVE instruction cleared all FPU state + * unconditionally, so registers are essentially destroyed. + * Modern FPU state can be kept in registers, if there are + * no pending FP exceptions. + */ +int copy_fpregs_to_fpstate(struct fpu *fpu) +{ + if (likely(use_xsave())) { + copy_xregs_to_kernel(&fpu->state.xsave); + + /* + * AVX512 state is tracked here because its use is + * known to slow the max clock speed of the core. 
+ */ + if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) + fpu->avx512_timestamp = jiffies; + return 1; + } + + if (likely(use_fxsr())) { + copy_fxregs_to_kernel(fpu); + return 1; + } + + /* + * Legacy FPU register saving, FNSAVE always clears FPU registers, + * so we have to mark them inactive: + */ + asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); + + return 0; +} +EXPORT_SYMBOL(copy_fpregs_to_fpstate); + void kernel_fpu_begin(void) { preempt_disable(); diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index bd1d0649f8ce..c413756ba89f 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -27,8 +27,7 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r } int xfpregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct fpu *fpu = &target->thread.fpu; @@ -38,8 +37,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, fpu__prepare_read(fpu); fpstate_sanitize_xstate(fpu); - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &fpu->state.fxsave, 0, -1); + return membuf_write(&to, &fpu->state.fxsave, sizeof(struct fxregs_state)); } int xfpregs_set(struct task_struct *target, const struct user_regset *regset, @@ -74,12 +72,10 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, } int xstateregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct fpu *fpu = &target->thread.fpu; struct xregs_state *xsave; - int ret; if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -ENODEV; @@ -89,10 +85,8 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, fpu__prepare_read(fpu); if (using_compacted_format()) { - if (kbuf) - ret = copy_xstate_to_kernel(kbuf, xsave, pos, count); - else - ret = copy_xstate_to_user(ubuf, xsave, pos, count); + copy_xstate_to_kernel(to, xsave); + return 0; } else { fpstate_sanitize_xstate(fpu); /* @@ -105,9 +99,8 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, /* * Copy the xstate memory layout. 
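The newly out-of-lined copy_fpregs_to_fpstate() above returns 1 when the register contents remain live (XSAVE/FXSAVE paths) and 0 when FNSAVE has destroyed them. A hypothetical caller honouring the preemption requirement stated in its comment; this is a sketch, not one of the kernel's actual call sites:

static void save_fpu_sketch(struct fpu *fpu)
{
	preempt_disable();			/* required, per the comment above */
	if (!copy_fpregs_to_fpstate(fpu)) {
		/*
		 * FNSAVE path: the register contents were destroyed by the
		 * save, so the caller must not assume they are still live.
		 */
	}
	preempt_enable();
}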
*/ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + return membuf_write(&to, xsave, fpu_user_xstate_size); } - return ret; } int xstateregs_set(struct task_struct *target, const struct user_regset *regset, @@ -293,8 +286,7 @@ void convert_to_fxsr(struct fxregs_state *fxsave, } int fpregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct fpu *fpu = &target->thread.fpu; struct user_i387_ia32_struct env; @@ -302,23 +294,22 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, fpu__prepare_read(fpu); if (!boot_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); + return fpregs_soft_get(target, regset, to); - if (!boot_cpu_has(X86_FEATURE_FXSR)) - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &fpu->state.fsave, 0, - -1); + if (!boot_cpu_has(X86_FEATURE_FXSR)) { + return membuf_write(&to, &fpu->state.fsave, + sizeof(struct fregs_state)); + } fpstate_sanitize_xstate(fpu); - if (kbuf && pos == 0 && count == sizeof(env)) { - convert_from_fxsr(kbuf, target); + if (to.left == sizeof(env)) { + convert_from_fxsr(to.p, target); return 0; } convert_from_fxsr(&env, target); - - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1); + return membuf_write(&to, &env, sizeof(env)); } int fpregs_set(struct task_struct *target, const struct user_regset *regset, @@ -356,20 +347,4 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, return ret; } -/* - * FPU state for core dumps. - * This is only used for a.out dumps now. - * It is declared generically using elf_fpregset_t (which is - * struct user_i387_struct) but is in fact only used for 32-bit - * dumps, so on 64-bit it is really struct user_i387_ia32_struct. - */ -int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) -{ - struct task_struct *tsk = current; - - return !fpregs_get(tsk, NULL, 0, sizeof(struct user_i387_ia32_struct), - ufpu, NULL); -} -EXPORT_SYMBOL(dump_fpu); - #endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 9393a445d73c..a4ec65317a7f 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -170,14 +170,15 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); + if (!static_cpu_has(X86_FEATURE_FPU)) { + struct user_i387_ia32_struct fp; + fpregs_soft_get(current, NULL, (struct membuf){.p = &fp, + .left = sizeof(fp)}); + return copy_to_user(buf, &fp, sizeof(fp)) ? -EFAULT : 0; + } + if (!access_ok(buf, size)) return -EACCES; - - if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_get(current, NULL, 0, - sizeof(struct user_i387_ia32_struct), NULL, - (struct _fpstate_32 __user *) buf) ? -1 : 1; - retry: /* * Load the FPU registers if they are not valid for the current task. diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index ad3a2b37927d..7a2bf884fede 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -233,8 +233,10 @@ void fpu__init_cpu_xstate(void) /* * MSR_IA32_XSS sets supervisor states managed by XSAVES. 
*/ - if (boot_cpu_has(X86_FEATURE_XSAVES)) - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + xfeatures_mask_dynamic()); + } } static bool xfeature_enabled(enum xfeature xfeature) @@ -486,7 +488,7 @@ static int xfeature_uncompacted_offset(int xfeature_nr) return ebx; } -static int xfeature_size(int xfeature_nr) +int xfeature_size(int xfeature_nr) { u32 eax, ebx, ecx, edx; @@ -598,7 +600,8 @@ static void check_xstate_against_struct(int nr) */ if ((nr < XFEATURE_YMM) || (nr >= XFEATURE_MAX) || - (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) { + (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) || + ((nr >= XFEATURE_RSRVD_COMP_10) && (nr <= XFEATURE_LBR))) { WARN_ONCE(1, "no structure for xstate: %d\n", nr); XSTATE_WARN_ON(1); } @@ -847,8 +850,10 @@ void fpu__resume_cpu(void) * Restore IA32_XSS. The same CPUID bit enumerates support * of XSAVES and MSR_IA32_XSS. */ - if (boot_cpu_has(X86_FEATURE_XSAVES)) - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + xfeatures_mask_dynamic()); + } } /* @@ -1009,32 +1014,20 @@ static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) return true; } -static void fill_gap(unsigned to, void **kbuf, unsigned *pos, unsigned *count) +static void fill_gap(struct membuf *to, unsigned *last, unsigned offset) { - if (*pos < to) { - unsigned size = to - *pos; - - if (size > *count) - size = *count; - memcpy(*kbuf, (void *)&init_fpstate.xsave + *pos, size); - *kbuf += size; - *pos += size; - *count -= size; - } + if (*last >= offset) + return; + membuf_write(to, (void *)&init_fpstate.xsave + *last, offset - *last); + *last = offset; } -static void copy_part(unsigned offset, unsigned size, void *from, - void **kbuf, unsigned *pos, unsigned *count) +static void copy_part(struct membuf *to, unsigned *last, unsigned offset, + unsigned size, void *from) { - fill_gap(offset, kbuf, pos, count); - if (size > *count) - size = *count; - if (size) { - memcpy(*kbuf, from, size); - *kbuf += size; - *pos += size; - *count -= size; - } + fill_gap(to, last, offset); + membuf_write(to, from, size); + *last = offset + size; } /* @@ -1044,20 +1037,15 @@ static void copy_part(unsigned offset, unsigned size, void *from, * It supports partial copy but pos always starts from zero. This is called * from xstateregs_get() and there we check the CPU has XSAVES. 
*/ -int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) +void copy_xstate_to_kernel(struct membuf to, struct xregs_state *xsave) { struct xstate_header header; const unsigned off_mxcsr = offsetof(struct fxregs_state, mxcsr); - unsigned count = size_total; + unsigned size = to.left; + unsigned last = 0; int i; /* - * Currently copy_regset_to_user() starts from pos 0: - */ - if (unlikely(offset_start != 0)) - return -EFAULT; - - /* * The destination is a ptrace buffer; we put in only user xstates: */ memset(&header, 0, sizeof(header)); @@ -1065,27 +1053,26 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of header.xfeatures &= xfeatures_mask_user(); if (header.xfeatures & XFEATURE_MASK_FP) - copy_part(0, off_mxcsr, - &xsave->i387, &kbuf, &offset_start, &count); + copy_part(&to, &last, 0, off_mxcsr, &xsave->i387); if (header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM)) - copy_part(off_mxcsr, MXCSR_AND_FLAGS_SIZE, - &xsave->i387.mxcsr, &kbuf, &offset_start, &count); + copy_part(&to, &last, off_mxcsr, + MXCSR_AND_FLAGS_SIZE, &xsave->i387.mxcsr); if (header.xfeatures & XFEATURE_MASK_FP) - copy_part(offsetof(struct fxregs_state, st_space), 128, - &xsave->i387.st_space, &kbuf, &offset_start, &count); + copy_part(&to, &last, offsetof(struct fxregs_state, st_space), + 128, &xsave->i387.st_space); if (header.xfeatures & XFEATURE_MASK_SSE) - copy_part(xstate_offsets[XFEATURE_SSE], 256, - &xsave->i387.xmm_space, &kbuf, &offset_start, &count); + copy_part(&to, &last, xstate_offsets[XFEATURE_SSE], + 256, &xsave->i387.xmm_space); /* * Fill xsave->i387.sw_reserved value for ptrace frame: */ - copy_part(offsetof(struct fxregs_state, sw_reserved), 48, - xstate_fx_sw_bytes, &kbuf, &offset_start, &count); + copy_part(&to, &last, offsetof(struct fxregs_state, sw_reserved), + 48, xstate_fx_sw_bytes); /* * Copy xregs_state->header: */ - copy_part(offsetof(struct xregs_state, header), sizeof(header), - &header, &kbuf, &offset_start, &count); + copy_part(&to, &last, offsetof(struct xregs_state, header), + sizeof(header), &header); for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { /* @@ -1094,104 +1081,12 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of if ((header.xfeatures >> i) & 1) { void *src = __raw_xsave_addr(xsave, i); - copy_part(xstate_offsets[i], xstate_sizes[i], - src, &kbuf, &offset_start, &count); - } - - } - fill_gap(size_total, &kbuf, &offset_start, &count); - - return 0; -} - -static inline int -__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, unsigned int size_total) -{ - if (!size) - return 0; - - if (offset < size_total) { - unsigned int copy = min(size, size_total - offset); - - if (__copy_to_user(ubuf + offset, data, copy)) - return -EFAULT; - } - return 0; -} - -/* - * Convert from kernel XSAVES compacted format to standard format and copy - * to a user-space buffer. It supports partial copy but pos always starts from - * zero. This is called from xstateregs_get() and there we check the CPU - * has XSAVES. 
- */ -int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) -{ - unsigned int offset, size; - int ret, i; - struct xstate_header header; - - /* - * Currently copy_regset_to_user() starts from pos 0: - */ - if (unlikely(offset_start != 0)) - return -EFAULT; - - /* - * The destination is a ptrace buffer; we put in only user xstates: - */ - memset(&header, 0, sizeof(header)); - header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= xfeatures_mask_user(); - - /* - * Copy xregs_state->header: - */ - offset = offsetof(struct xregs_state, header); - size = sizeof(header); - - ret = __copy_xstate_to_user(ubuf, &header, offset, size, size_total); - if (ret) - return ret; - - for (i = 0; i < XFEATURE_MAX; i++) { - /* - * Copy only in-use xstates: - */ - if ((header.xfeatures >> i) & 1) { - void *src = __raw_xsave_addr(xsave, i); - - offset = xstate_offsets[i]; - size = xstate_sizes[i]; - - /* The next component has to fit fully into the output buffer: */ - if (offset + size > size_total) - break; - - ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total); - if (ret) - return ret; + copy_part(&to, &last, xstate_offsets[i], + xstate_sizes[i], src); } } - - if (xfeatures_mxcsr_quirk(header.xfeatures)) { - offset = offsetof(struct fxregs_state, mxcsr); - size = MXCSR_AND_FLAGS_SIZE; - __copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, size_total); - } - - /* - * Fill xsave->i387.sw_reserved value for ptrace frame: - */ - offset = offsetof(struct fxregs_state, sw_reserved); - size = sizeof(xstate_fx_sw_bytes); - - ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, size_total); - if (ret) - return ret; - - return 0; + fill_gap(&to, &last, size); } /* @@ -1356,6 +1251,78 @@ void copy_supervisor_to_kernel(struct xregs_state *xstate) } } +/** + * copy_dynamic_supervisor_to_kernel() - Save dynamic supervisor states to + * an xsave area + * @xstate: A pointer to an xsave area + * @mask: Represent the dynamic supervisor features saved into the xsave area + * + * Only the dynamic supervisor states sets in the mask are saved into the xsave + * area (See the comment in XFEATURE_MASK_DYNAMIC for the details of dynamic + * supervisor feature). Besides the dynamic supervisor states, the legacy + * region and XSAVE header are also saved into the xsave area. The supervisor + * features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and + * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not saved. + * + * The xsave area must be 64-bytes aligned. + */ +void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask) +{ + u64 dynamic_mask = xfeatures_mask_dynamic() & mask; + u32 lmask, hmask; + int err; + + if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) + return; + + if (WARN_ON_FPU(!dynamic_mask)) + return; + + lmask = dynamic_mask; + hmask = dynamic_mask >> 32; + + XSTATE_OP(XSAVES, xstate, lmask, hmask, err); + + /* Should never fault when copying to a kernel buffer */ + WARN_ON_FPU(err); +} + +/** + * copy_kernel_to_dynamic_supervisor() - Restore dynamic supervisor states from + * an xsave area + * @xstate: A pointer to an xsave area + * @mask: Represent the dynamic supervisor features restored from the xsave area + * + * Only the dynamic supervisor states sets in the mask are restored from the + * xsave area (See the comment in XFEATURE_MASK_DYNAMIC for the details of + * dynamic supervisor feature). 
Besides the dynamic supervisor states, the + * legacy region and XSAVE header are also restored from the xsave area. The + * supervisor features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and + * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not restored. + * + * The xsave area must be 64-bytes aligned. + */ +void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask) +{ + u64 dynamic_mask = xfeatures_mask_dynamic() & mask; + u32 lmask, hmask; + int err; + + if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES))) + return; + + if (WARN_ON_FPU(!dynamic_mask)) + return; + + lmask = dynamic_mask; + hmask = dynamic_mask >> 32; + + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + + /* Should never fault when copying from a kernel buffer */ + WARN_ON_FPU(err); +} + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 51504566b3a6..7edbd5ee5ed4 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -286,6 +286,7 @@ extern void ftrace_regs_caller_ret(void); extern void ftrace_caller_end(void); extern void ftrace_caller_op_ptr(void); extern void ftrace_regs_caller_op_ptr(void); +extern void ftrace_regs_caller_jmp(void); /* movq function_trace_op(%rip), %rdx */ /* 0x48 0x8b 0x15 <offset-to-ftrace_trace_op (4 bytes)> */ @@ -316,6 +317,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned long end_offset; unsigned long op_offset; unsigned long call_offset; + unsigned long jmp_offset; unsigned long offset; unsigned long npages; unsigned long size; @@ -333,11 +335,13 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) end_offset = (unsigned long)ftrace_regs_caller_end; op_offset = (unsigned long)ftrace_regs_caller_op_ptr; call_offset = (unsigned long)ftrace_regs_call; + jmp_offset = (unsigned long)ftrace_regs_caller_jmp; } else { start_offset = (unsigned long)ftrace_caller; end_offset = (unsigned long)ftrace_caller_end; op_offset = (unsigned long)ftrace_caller_op_ptr; call_offset = (unsigned long)ftrace_call; + jmp_offset = 0; } size = end_offset - start_offset; @@ -367,10 +371,14 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) if (WARN_ON(ret < 0)) goto fail; + /* No need to test direct calls on created trampolines */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { - ip = trampoline + (ftrace_regs_caller_ret - ftrace_regs_caller); - ret = copy_from_kernel_nofault(ip, (void *)retq, RET_SIZE); - if (WARN_ON(ret < 0)) + /* NOP the jnz 1f; but make sure it's a 2 byte jnz */ + ip = trampoline + (jmp_offset - start_offset); + if (WARN_ON(*(char *)ip != 0x75)) + goto fail; + ret = copy_from_kernel_nofault(ip, ideal_nops[2], 2); + if (ret < 0) goto fail; } diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 083a3da7bb73..ac3d5f22fe64 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -241,22 +241,10 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) */ movq ORIG_RAX(%rsp), %rax testq %rax, %rax - jz 1f +SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL) + jnz 1f - /* Swap the flags with orig_rax */ - movq MCOUNT_REG_SIZE(%rsp), %rdi - movq %rdi, MCOUNT_REG_SIZE-8(%rsp) - movq %rax, MCOUNT_REG_SIZE(%rsp) - - restore_mcount_regs 8 - /* Restore flags */ - popfq - -SYM_INNER_LABEL(ftrace_regs_caller_ret, SYM_L_GLOBAL); - UNWIND_HINT_RET_OFFSET - jmp ftrace_epilogue - -1: restore_mcount_regs + restore_mcount_regs /* Restore flags */ popfq @@ -269,6 
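The two helpers documented above save and restore only the dynamic supervisor xstate components, plus the legacy region and the XSAVE header. A hypothetical caller that checkpoints one such component around some operation; the mask value and buffer sizing are assumptions for illustration, and the alignment requirement comes from the kerneldoc:

/*
 * Sketch: checkpoint/restore a dynamic supervisor component. The buffer
 * must be 64-byte aligned and large enough for the legacy region, the
 * XSAVE header and the saved component; the now-exported xfeature_size()
 * above can be used for that sizing.
 */
static void checkpoint_dynamic_feature_sketch(struct xregs_state *xs, u64 mask)
{
	copy_dynamic_supervisor_to_kernel(xs, mask);	/* e.g. an LBR feature mask */

	/* ... later, put the component back ... */
	copy_kernel_to_dynamic_supervisor(xs, mask);
}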
+257,17 @@ SYM_INNER_LABEL(ftrace_regs_caller_ret, SYM_L_GLOBAL); SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) jmp ftrace_epilogue + /* Swap the flags with orig_rax */ +1: movq MCOUNT_REG_SIZE(%rsp), %rdi + movq %rdi, MCOUNT_REG_SIZE-8(%rsp) + movq %rax, MCOUNT_REG_SIZE(%rsp) + + restore_mcount_regs 8 + /* Restore flags */ + popfq + UNWIND_HINT_RET_OFFSET + jmp ftrace_epilogue + SYM_FUNC_END(ftrace_regs_caller) diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 0db21206f2f3..7ecf9babf0cb 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -160,7 +160,7 @@ static const __initconst struct idt_data apic_idts[] = { /* Must be page-aligned because the real IDT is used in the cpu entry area */ static gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss; -struct desc_ptr idt_descr __ro_after_init = { +static struct desc_ptr idt_descr __ro_after_init = { .size = IDT_TABLE_SIZE - 1, .address = (unsigned long) idt_table, }; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index dd73135d7cee..beb1bada1b0a 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -22,6 +22,8 @@ #include <asm/timer.h> #include <asm/hw_irq.h> #include <asm/desc.h> +#include <asm/io_apic.h> +#include <asm/acpi.h> #include <asm/apic.h> #include <asm/setup.h> #include <asm/i8259.h> diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 6eb8b50ea07e..4eb8f2d19a87 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -13,6 +13,8 @@ #include <linux/reboot.h> #include <linux/serial_8250.h> #include <asm/apic.h> +#include <asm/io_apic.h> +#include <asm/acpi.h> #include <asm/cpu.h> #include <asm/hypervisor.h> #include <asm/i8259.h> diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index db6578d45157..57c2ecf43134 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -170,15 +170,6 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, if (!current_ei->efi_memmap_size) return 0; - /* - * If 1:1 mapping is not enabled, second kernel can not setup EFI - * and use EFI run time services. User space will have to pass - * acpi_rsdp=<addr> on kernel command line to make second kernel boot - * without efi. 
- */ - if (efi_have_uv1_memmap()) - return 0; - params->secure_boot = boot_params.secure_boot; ei->efi_loader_signature = current_ei->efi_loader_signature; ei->efi_systab = current_ei->efi_systab; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index ada39ddbc922..fdadc37d72af 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -33,6 +33,7 @@ #include <linux/hardirq.h> #include <linux/preempt.h> #include <linux/sched/debug.h> +#include <linux/perf_event.h> #include <linux/extable.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> @@ -472,6 +473,9 @@ static int arch_copy_kprobe(struct kprobe *p) /* Also, displacement change doesn't affect the first byte */ p->opcode = buf[0]; + p->ainsn.tp_len = len; + perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len); + /* OK, write back the instruction(s) into ROX insn buffer */ text_poke(p->ainsn.insn, buf, len); @@ -503,12 +507,18 @@ int arch_prepare_kprobe(struct kprobe *p) void arch_arm_kprobe(struct kprobe *p) { - text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1); + u8 int3 = INT3_INSN_OPCODE; + + text_poke(p->addr, &int3, 1); text_poke_sync(); + perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1); } void arch_disarm_kprobe(struct kprobe *p) { + u8 int3 = INT3_INSN_OPCODE; + + perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1); text_poke(p->addr, &p->opcode, 1); text_poke_sync(); } @@ -516,6 +526,9 @@ void arch_disarm_kprobe(struct kprobe *p) void arch_remove_kprobe(struct kprobe *p) { if (p->ainsn.insn) { + /* Record the perf event before freeing the slot */ + perf_event_text_poke(p->ainsn.insn, p->ainsn.insn, + p->ainsn.tp_len, NULL, 0); free_insn_slot(p->ainsn.insn, p->ainsn.boostable); p->ainsn.insn = NULL; } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 7af4c61dde52..40f380461e6d 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -6,6 +6,7 @@ * Copyright (C) Hitachi Ltd., 2012 */ #include <linux/kprobes.h> +#include <linux/perf_event.h> #include <linux/ptrace.h> #include <linux/string.h> #include <linux/slab.h> @@ -352,8 +353,15 @@ int arch_within_optimized_kprobe(struct optimized_kprobe *op, static void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) { - if (op->optinsn.insn) { - free_optinsn_slot(op->optinsn.insn, dirty); + u8 *slot = op->optinsn.insn; + if (slot) { + int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE; + + /* Record the perf event before freeing the slot */ + if (dirty) + perf_event_text_poke(slot, slot, len, NULL, 0); + + free_optinsn_slot(slot, dirty); op->optinsn.insn = NULL; op->optinsn.size = 0; } @@ -424,8 +432,15 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, (u8 *)op->kp.addr + op->optinsn.size); len += JMP32_INSN_SIZE; + /* + * Note len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also + * used in __arch_remove_optimized_kprobe(). 
+ */ + /* We have to use text_poke() for instruction buffer because it is RO */ + perf_event_text_poke(slot, NULL, 0, buf, len); text_poke(slot, buf, len); + ret = 0; out: kfree(buf); @@ -477,10 +492,23 @@ void arch_optimize_kprobes(struct list_head *oplist) */ void arch_unoptimize_kprobe(struct optimized_kprobe *op) { - arch_arm_kprobe(&op->kp); - text_poke(op->kp.addr + INT3_INSN_SIZE, - op->optinsn.copied_insn, DISP32_SIZE); + u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, }; + u8 old[JMP32_INSN_SIZE]; + u8 *addr = op->kp.addr; + + memcpy(old, op->kp.addr, JMP32_INSN_SIZE); + memcpy(new + INT3_INSN_SIZE, + op->optinsn.copied_insn, + JMP32_INSN_SIZE - INT3_INSN_SIZE); + + text_poke(addr, new, INT3_INSN_SIZE); text_poke_sync(); + text_poke(addr + INT3_INSN_SIZE, + new + INT3_INSN_SIZE, + JMP32_INSN_SIZE - INT3_INSN_SIZE); + text_poke_sync(); + + perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE); } /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index df63786e7bfa..08320b0b2b27 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -7,8 +7,11 @@ * Authors: Anthony Liguori <aliguori@us.ibm.com> */ +#define pr_fmt(fmt) "kvm-guest: " fmt + #include <linux/context_tracking.h> #include <linux/init.h> +#include <linux/irq.h> #include <linux/kernel.h> #include <linux/kvm_para.h> #include <linux/cpu.h> @@ -232,18 +235,13 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { - u32 reason = kvm_read_and_reset_apf_flags(); - bool rcu_exit; + u32 flags = kvm_read_and_reset_apf_flags(); + irqentry_state_t state; - switch (reason) { - case KVM_PV_REASON_PAGE_NOT_PRESENT: - case KVM_PV_REASON_PAGE_READY: - break; - default: + if (!flags) return false; - } - rcu_exit = idtentry_enter_cond_rcu(regs); + state = irqentry_enter(regs); instrumentation_begin(); /* @@ -254,20 +252,41 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) if (unlikely(!(regs->flags & X86_EFLAGS_IF))) panic("Host injected async #PF in interrupt disabled region\n"); - if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) { + if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { if (unlikely(!(user_mode(regs)))) panic("Host injected async #PF in kernel mode\n"); /* Page is swapped out by the host. 
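arch_unoptimize_kprobe() above rewrites the 5-byte optimized JMP back into INT3 plus the copied displacement bytes in two pokes, INT3 first and synced, so a CPU racing through the site only ever sees the old JMP or a breakpoint, never a half-written jump. A minimal sketch of the byte-level sequence; the sync steps are comments only and the opcodes in copied_insn are invented.

#include <stdio.h>
#include <string.h>

#define INT3_INSN_OPCODE 0xcc
#define INT3_INSN_SIZE   1
#define JMP32_INSN_SIZE  5

static void dump(const char *tag, const unsigned char *b)
{
    printf("%-10s %02x %02x %02x %02x %02x\n", tag, b[0], b[1], b[2], b[3], b[4]);
}

int main(void)
{
    /* Optimized site: jmp rel32 into the detour buffer. */
    unsigned char site[JMP32_INSN_SIZE] = { 0xe9, 0x11, 0x22, 0x33, 0x44 };
    /* Bytes originally copied out of the probed function. */
    unsigned char copied_insn[JMP32_INSN_SIZE - INT3_INSN_SIZE] = { 0xaa, 0xbb, 0xcc, 0xdd };

    unsigned char new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE };
    unsigned char old[JMP32_INSN_SIZE];

    memcpy(old, site, JMP32_INSN_SIZE);
    memcpy(new + INT3_INSN_SIZE, copied_insn, JMP32_INSN_SIZE - INT3_INSN_SIZE);

    dump("before", site);

    /* Step 1: write only the INT3 byte; the kernel then does text_poke_sync(). */
    memcpy(site, new, INT3_INSN_SIZE);
    dump("int3", site);

    /* Step 2: write the remaining displacement bytes, then sync again. */
    memcpy(site + INT3_INSN_SIZE, new + INT3_INSN_SIZE,
           JMP32_INSN_SIZE - INT3_INSN_SIZE);
    dump("restored", site);

    /* Finally old -> new is reported via perf_event_text_poke(). */
    return 0;
}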
*/ kvm_async_pf_task_wait_schedule(token); } else { - kvm_async_pf_task_wake(token); + WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags); } instrumentation_end(); - idtentry_exit_cond_rcu(regs, rcu_exit); + irqentry_exit(regs, state); return true; } +DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + u32 token; + irqentry_state_t state; + + state = irqentry_enter(regs); + + inc_irq_stat(irq_hv_callback_count); + + if (__this_cpu_read(apf_reason.enabled)) { + token = __this_cpu_read(apf_reason.token); + kvm_async_pf_task_wake(token); + __this_cpu_write(apf_reason.token, 0); + wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1); + } + + irqentry_exit(regs, state); + set_irq_regs(old_regs); +} + static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; @@ -289,8 +308,8 @@ static void kvm_register_steal_time(void) return; wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); - pr_info("kvm-stealtime: cpu %d, msr %llx\n", - cpu, (unsigned long long) slow_virt_to_phys(st)); + pr_info("stealtime: cpu %d, msr %llx\n", cpu, + (unsigned long long) slow_virt_to_phys(st)); } static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; @@ -311,17 +330,19 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) static void kvm_guest_cpu_init(void) { - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { - u64 pa; + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { + u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); - pa |= KVM_ASYNC_PF_ENABLED; + pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; + wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); + wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); pr_info("KVM setup async PF for cpu %d\n", smp_processor_id()); @@ -493,7 +514,8 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector) } else { ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); - WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret); + WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld", + ret); min = max = apic_id; ipi_bitmap = 0; } @@ -503,7 +525,8 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector) if (ipi_bitmap) { ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); - WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret); + WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld", + ret); } local_irq_restore(flags); @@ -533,7 +556,7 @@ static void kvm_setup_pv_ipi(void) { apic->send_IPI_mask = kvm_send_ipi_mask; apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; - pr_info("KVM setup pv IPIs\n"); + pr_info("setup PV IPIs\n"); } static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) @@ -551,13 +574,6 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) } } -static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) -{ - native_smp_prepare_cpus(max_cpus); - if (kvm_para_has_hint(KVM_HINTS_REALTIME)) - static_branch_disable(&virt_spin_lock_key); -} - static void __init kvm_smp_prepare_boot_cpu(void) { /* @@ -646,19 +662,20 @@ static void __init kvm_guest_init(void) if 
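With interrupt-based delivery enabled above, "page not present" still arrives through the #PF path (the flags word), while "page ready" comes in on the vector written to MSR_KVM_ASYNC_PF_INT, and the enable MSR carries the per-CPU apf_reason address plus feature bits. A sketch of how that enable value is composed; the flag values are restated from the uapi header as I recall them and should be treated as illustrative rather than authoritative.

#include <stdint.h>
#include <stdio.h>

/* Flag bits assumed to match uapi asm/kvm_para.h. */
#define KVM_ASYNC_PF_ENABLED                (1 << 0)
#define KVM_ASYNC_PF_SEND_ALWAYS            (1 << 1)
#define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT  (1 << 2)
#define KVM_ASYNC_PF_DELIVERY_AS_INT        (1 << 3)

/* Build the value the guest writes to MSR_KVM_ASYNC_PF_EN. */
static uint64_t apf_enable_value(uint64_t apf_reason_pa, int want_vmexit_delivery)
{
    uint64_t val = apf_reason_pa;   /* low bits of the PA are assumed clear */

    val |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
    if (want_vmexit_delivery)
        val |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;

    return val;
}

int main(void)
{
    /* Made-up, page-aligned physical address of the per-CPU apf_reason area. */
    uint64_t pa = 0x12345000ULL;

    printf("MSR_KVM_ASYNC_PF_EN value: %#llx\n",
           (unsigned long long)apf_enable_value(pa, 0));
    /* The interrupt vector itself is written to MSR_KVM_ASYNC_PF_INT separately. */
    return 0;
}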
(kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { static_branch_enable(&kvm_async_pf_enabled); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt); + } #ifdef CONFIG_SMP - smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; if (pv_sched_yield_supported()) { smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; - pr_info("KVM setup pv sched yield\n"); + pr_info("setup PV sched yield\n"); } if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", kvm_cpu_online, kvm_cpu_down_prepare) < 0) - pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n"); + pr_err("failed to install cpu hotplug callbacks\n"); #else sev_map_percpu_data(); kvm_guest_cpu_init(); @@ -854,16 +871,36 @@ asm( */ void __init kvm_spinlock_init(void) { - /* Does host kernel support KVM_FEATURE_PV_UNHALT? */ - if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) + /* + * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an + * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is + * preferred over native qspinlock when vCPU is preempted. + */ + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) { + pr_info("PV spinlocks disabled, no host support\n"); return; + } - if (kvm_para_has_hint(KVM_HINTS_REALTIME)) - return; + /* + * Disable PV spinlocks and use native qspinlock when dedicated pCPUs + * are available. + */ + if (kvm_para_has_hint(KVM_HINTS_REALTIME)) { + pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n"); + goto out; + } - /* Don't use the pvqspinlock code if there is only 1 vCPU. */ - if (num_possible_cpus() == 1) - return; + if (num_possible_cpus() == 1) { + pr_info("PV spinlocks disabled, single CPU\n"); + goto out; + } + + if (nopvspin) { + pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n"); + goto out; + } + + pr_info("PV spinlocks enabled\n"); __pv_init_lock_hash(); pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; @@ -876,6 +913,13 @@ void __init kvm_spinlock_init(void) pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); } + /* + * When PV spinlock is enabled which is preferred over + * virt_spin_lock(), virt_spin_lock_key's value is meaningless. + * Just disable it anyway. 
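The rewritten kvm_spinlock_init() above is a ladder of early exits (no PV_UNHALT support, KVM_HINTS_REALTIME, a single CPU, nopvspin), with virt_spin_lock_key disabled on every path that reaches the out: label. A compact sketch of that ordering as a pure function; the enum names are mine, only the decision order mirrors the hunk.

#include <stdbool.h>
#include <stdio.h>

enum pv_spinlock_decision {
    PV_SPINLOCK_ENABLED,
    PV_SPINLOCK_NO_HOST_SUPPORT,   /* plain return: virt_spin_lock_key stays on */
    PV_SPINLOCK_REALTIME_HINT,     /* the remaining cases fall through to the   */
    PV_SPINLOCK_SINGLE_CPU,        /* static_branch_disable() at out: in the    */
    PV_SPINLOCK_NOPVSPIN_PARAM,    /* real code                                 */
};

static enum pv_spinlock_decision
decide(bool has_pv_unhalt, bool realtime_hint, int ncpus, bool nopvspin)
{
    if (!has_pv_unhalt)
        return PV_SPINLOCK_NO_HOST_SUPPORT;
    if (realtime_hint)
        return PV_SPINLOCK_REALTIME_HINT;
    if (ncpus == 1)
        return PV_SPINLOCK_SINGLE_CPU;
    if (nopvspin)
        return PV_SPINLOCK_NOPVSPIN_PARAM;
    return PV_SPINLOCK_ENABLED;    /* PV spinlocks win; virt_spin_lock_key is moot */
}

int main(void)
{
    printf("%d\n", decide(true, false, 8, false));  /* PV_SPINLOCK_ENABLED */
    printf("%d\n", decide(true, true, 8, false));   /* realtime hint wins  */
    return 0;
}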
+ */ +out: + static_branch_disable(&virt_spin_lock_key); } #endif /* CONFIG_PARAVIRT_SPINLOCKS */ @@ -895,8 +939,8 @@ static void kvm_enable_host_haltpoll(void *i) void arch_haltpoll_enable(unsigned int cpu) { if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) { - pr_err_once("kvm: host does not support poll control\n"); - pr_err_once("kvm: host upgrade recommended\n"); + pr_err_once("host does not support poll control\n"); + pr_err_once("host upgrade recommended\n"); return; } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index afac7ccce72f..411af4aa7b51 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -19,10 +19,11 @@ #include <linux/smp.h> #include <linux/pci.h> +#include <asm/io_apic.h> +#include <asm/acpi.h> #include <asm/irqdomain.h> #include <asm/mtrr.h> #include <asm/mpspec.h> -#include <asm/pgalloc.h> #include <asm/io_apic.h> #include <asm/proto.h> #include <asm/bios_ebda.h> diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 1547be359d7f..49dcfb85e773 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -42,6 +42,14 @@ static struct class *msr_class; static enum cpuhp_state cpuhp_msr_state; +enum allow_write_msrs { + MSR_WRITES_ON, + MSR_WRITES_OFF, + MSR_WRITES_DEFAULT, +}; + +static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT; + static ssize_t msr_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -70,6 +78,24 @@ static ssize_t msr_read(struct file *file, char __user *buf, return bytes ? bytes : err; } +static int filter_write(u32 reg) +{ + switch (allow_writes) { + case MSR_WRITES_ON: return 0; + case MSR_WRITES_OFF: return -EPERM; + default: break; + } + + if (reg == MSR_IA32_ENERGY_PERF_BIAS) + return 0; + + pr_err_ratelimited("Write to unrecognized MSR 0x%x by %s\n" + "Please report to x86@kernel.org\n", + reg, current->comm); + + return 0; +} + static ssize_t msr_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -84,6 +110,10 @@ static ssize_t msr_write(struct file *file, const char __user *buf, if (err) return err; + err = filter_write(reg); + if (err) + return err; + if (count % 8) return -EINVAL; /* Invalid chunk size */ @@ -92,9 +122,13 @@ static ssize_t msr_write(struct file *file, const char __user *buf, err = -EFAULT; break; } + + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); if (err) break; + tmp += 2; bytes += 8; } @@ -242,6 +276,41 @@ static void __exit msr_exit(void) } module_exit(msr_exit) +static int set_allow_writes(const char *val, const struct kernel_param *cp) +{ + /* val is NUL-terminated, see kernfs_fop_write() */ + char *s = strstrip((char *)val); + + if (!strcmp(s, "on")) + allow_writes = MSR_WRITES_ON; + else if (!strcmp(s, "off")) + allow_writes = MSR_WRITES_OFF; + else + allow_writes = MSR_WRITES_DEFAULT; + + return 0; +} + +static int get_allow_writes(char *buf, const struct kernel_param *kp) +{ + const char *res; + + switch (allow_writes) { + case MSR_WRITES_ON: res = "on"; break; + case MSR_WRITES_OFF: res = "off"; break; + default: res = "default"; break; + } + + return sprintf(buf, "%s\n", res); +} + +static const struct kernel_param_ops allow_writes_ops = { + .set = set_allow_writes, + .get = get_allow_writes +}; + +module_param_cb(allow_writes, &allow_writes_ops, NULL, 0600); + MODULE_AUTHOR("H. 
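filter_write() above gates /dev/cpu/*/msr writes behind an explicit on/off override plus a short allow-list (currently only MSR_IA32_ENERGY_PERF_BIAS); in the default mode it complains but still lets the write through, alongside the new taint. A user-space sketch of that policy; the MSR number is hard-coded here as an assumption.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define MSR_IA32_ENERGY_PERF_BIAS 0x1b0   /* value assumed from the SDM / kernel headers */

enum allow_write_msrs { MSR_WRITES_ON, MSR_WRITES_OFF, MSR_WRITES_DEFAULT };

static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT;

/* Return 0 to permit the write, -EPERM to reject it. */
static int filter_write(uint32_t reg)
{
    switch (allow_writes) {
    case MSR_WRITES_ON:  return 0;
    case MSR_WRITES_OFF: return -EPERM;
    default:             break;
    }

    if (reg == MSR_IA32_ENERGY_PERF_BIAS)
        return 0;

    /* Default mode: complain (ratelimited in the kernel) but still allow it. */
    fprintf(stderr, "write to unrecognized MSR %#x\n", reg);
    return 0;
}

int main(void)
{
    printf("EPB, default:   %d\n", filter_write(MSR_IA32_ENERGY_PERF_BIAS));
    printf("other, default: %d\n", filter_write(0xc0000080));
    allow_writes = MSR_WRITES_OFF;
    printf("other, off:     %d\n", filter_write(0xc0000080));
    return 0;
}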
Peter Anvin <hpa@zytor.com>"); MODULE_DESCRIPTION("x86 generic MSR driver"); MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index d7c5e44b26f7..4fc9954a9560 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -330,7 +330,6 @@ static noinstr void default_do_nmi(struct pt_regs *regs) __this_cpu_write(last_nmi_rip, regs->ip); instrumentation_begin(); - trace_hardirqs_off_finish(); handled = nmi_handle(NMI_LOCAL, regs); __this_cpu_add(nmi_stats.normal, handled); @@ -417,8 +416,6 @@ static noinstr void default_do_nmi(struct pt_regs *regs) unknown_nmi_error(reason, regs); out: - if (regs->flags & X86_EFLAGS_IF) - trace_hardirqs_on_prepare(); instrumentation_end(); } @@ -478,6 +475,8 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7); DEFINE_IDTENTRY_RAW(exc_nmi) { + bool irq_state; + if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) return; @@ -491,14 +490,14 @@ nmi_restart: this_cpu_write(nmi_dr7, local_db_save()); - nmi_enter(); + irq_state = idtentry_enter_nmi(regs); inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); - nmi_exit(); + idtentry_exit_nmi(regs, irq_state); local_db_restore(this_cpu_read(nmi_dr7)); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index fe67dbd76e51..994d8393f2f7 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -121,8 +121,8 @@ static int set_new_tls(struct task_struct *p, unsigned long tls) return do_set_thread_area_64(p, ARCH_SET_FS, tls); } -int copy_thread_tls(unsigned long clone_flags, unsigned long sp, - unsigned long arg, struct task_struct *p, unsigned long tls) +int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, + struct task_struct *p, unsigned long tls) { struct inactive_task_frame *frame; struct fork_frame *fork_frame; @@ -140,10 +140,12 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); #ifdef CONFIG_X86_64 - savesegment(gs, p->thread.gsindex); - p->thread.gsbase = p->thread.gsindex ? 0 : current->thread.gsbase; - savesegment(fs, p->thread.fsindex); - p->thread.fsbase = p->thread.fsindex ? 
0 : current->thread.fsbase; + current_save_fsgs(); + p->thread.fsindex = current->thread.fsindex; + p->thread.fsbase = current->thread.fsbase; + p->thread.gsindex = current->thread.gsindex; + p->thread.gsbase = current->thread.gsbase; + savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); #else diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index acfd6d2a0cbf..4f2f54e1281c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -56,7 +56,8 @@ #include "process.h" -void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, + const char *log_lvl) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; @@ -67,14 +68,14 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) else savesegment(gs, gs); - show_ip(regs, KERN_DEFAULT); + show_ip(regs, log_lvl); - printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", - regs->ax, regs->bx, regs->cx, regs->dx); - printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", - regs->si, regs->di, regs->bp, regs->sp); - printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n", - (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, regs->ss, regs->flags); + printk("%sEAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + log_lvl, regs->ax, regs->bx, regs->cx, regs->dx); + printk("%sESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", + log_lvl, regs->si, regs->di, regs->bp, regs->sp); + printk("%sDS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n", + log_lvl, (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, regs->ss, regs->flags); if (mode != SHOW_REGS_ALL) return; @@ -83,8 +84,8 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) cr2 = read_cr2(); cr3 = __read_cr3(); cr4 = __read_cr4(); - printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", - cr0, cr2, cr3, cr4); + printk("%sCR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", + log_lvl, cr0, cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); @@ -98,10 +99,10 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) (d6 == DR6_RESERVED) && (d7 == 0x400)) return; - printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", - d0, d1, d2, d3); - printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n", - d6, d7); + printk("%sDR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + log_lvl, d0, d1, d2, d3); + printk("%sDR6: %08lx DR7: %08lx\n", + log_lvl, d6, d7); } void release_thread(struct task_struct *dead_task) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9a97415b2139..9afefe325acb 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -62,30 +62,31 @@ #include "process.h" /* Prints also some state that isn't saved in the pt_regs */ -void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) +void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, + const char *log_lvl) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; unsigned int ds, es; - show_iret_regs(regs); + show_iret_regs(regs, log_lvl); if (regs->orig_ax != -1) pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); else pr_cont("\n"); - printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", - regs->ax, regs->bx, regs->cx); - printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", - regs->dx, regs->si, regs->di); - 
printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n", - regs->bp, regs->r8, regs->r9); - printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n", - regs->r10, regs->r11, regs->r12); - printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", - regs->r13, regs->r14, regs->r15); + printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n", + log_lvl, regs->ax, regs->bx, regs->cx); + printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n", + log_lvl, regs->dx, regs->si, regs->di); + printk("%sRBP: %016lx R08: %016lx R09: %016lx\n", + log_lvl, regs->bp, regs->r8, regs->r9); + printk("%sR10: %016lx R11: %016lx R12: %016lx\n", + log_lvl, regs->r10, regs->r11, regs->r12); + printk("%sR13: %016lx R14: %016lx R15: %016lx\n", + log_lvl, regs->r13, regs->r14, regs->r15); if (mode == SHOW_REGS_SHORT) return; @@ -93,8 +94,8 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) if (mode == SHOW_REGS_USER) { rdmsrl(MSR_FS_BASE, fs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); - printk(KERN_DEFAULT "FS: %016lx GS: %016lx\n", - fs, shadowgs); + printk("%sFS: %016lx GS: %016lx\n", + log_lvl, fs, shadowgs); return; } @@ -112,12 +113,12 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) cr3 = __read_cr3(); cr4 = __read_cr4(); - printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", - fs, fsindex, gs, gsindex, shadowgs); - printk(KERN_DEFAULT "CS: %04lx DS: %04x ES: %04x CR0: %016lx\n", regs->cs, ds, - es, cr0); - printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, - cr4); + printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + log_lvl, fs, fsindex, gs, gsindex, shadowgs); + printk("%sCS: %04lx DS: %04x ES: %04x CR0: %016lx\n", + log_lvl, regs->cs, ds, es, cr0); + printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n", + log_lvl, cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); @@ -129,14 +130,14 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) /* Only print out debug registers if they are in their non-default state. */ if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && (d6 == DR6_RESERVED) && (d7 == 0x400))) { - printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", - d0, d1, d2); - printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", - d3, d6, d7); + printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n", + log_lvl, d0, d1, d2); + printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n", + log_lvl, d3, d6, d7); } if (boot_cpu_has(X86_FEATURE_OSPKE)) - printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru()); + printk("%sPKRU: %08x\n", log_lvl, read_pkru()); } void release_thread(struct task_struct *dead_task) @@ -150,6 +151,56 @@ enum which_selector { }; /* + * Out of line to be protected from kprobes and tracing. If this would be + * traced or probed than any access to a per CPU variable happens with + * the wrong GS. + * + * It is not used on Xen paravirt. When paravirt support is needed, it + * needs to be renamed with native_ prefix. + */ +static noinstr unsigned long __rdgsbase_inactive(void) +{ + unsigned long gsbase; + + lockdep_assert_irqs_disabled(); + + if (!static_cpu_has(X86_FEATURE_XENPV)) { + native_swapgs(); + gsbase = rdgsbase(); + native_swapgs(); + } else { + instrumentation_begin(); + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + instrumentation_end(); + } + + return gsbase; +} + +/* + * Out of line to be protected from kprobes and tracing. If this would be + * traced or probed than any access to a per CPU variable happens with + * the wrong GS. + * + * It is not used on Xen paravirt. 
When paravirt support is needed, it + * needs to be renamed with native_ prefix. + */ +static noinstr void __wrgsbase_inactive(unsigned long gsbase) +{ + lockdep_assert_irqs_disabled(); + + if (!static_cpu_has(X86_FEATURE_XENPV)) { + native_swapgs(); + wrgsbase(gsbase); + native_swapgs(); + } else { + instrumentation_begin(); + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + instrumentation_end(); + } +} + +/* * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are * not available. The goal is to be reasonably fast on non-FSGSBASE systems. * It's forcibly inlined because it'll generate better code and this function @@ -198,22 +249,35 @@ static __always_inline void save_fsgs(struct task_struct *task) { savesegment(fs, task->thread.fsindex); savesegment(gs, task->thread.gsindex); - save_base_legacy(task, task->thread.fsindex, FS); - save_base_legacy(task, task->thread.gsindex, GS); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* + * If FSGSBASE is enabled, we can't make any useful guesses + * about the base, and user code expects us to save the current + * value. Fortunately, reading the base directly is efficient. + */ + task->thread.fsbase = rdfsbase(); + task->thread.gsbase = __rdgsbase_inactive(); + } else { + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); + } } -#if IS_ENABLED(CONFIG_KVM) /* * While a process is running,current->thread.fsbase and current->thread.gsbase - * may not match the corresponding CPU registers (see save_base_legacy()). KVM - * wants an efficient way to save and restore FSBASE and GSBASE. - * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE. + * may not match the corresponding CPU registers (see save_base_legacy()). */ -void save_fsgs_for_kvm(void) +void current_save_fsgs(void) { + unsigned long flags; + + /* Interrupts need to be off for FSGSBASE */ + local_irq_save(flags); save_fsgs(current); + local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(save_fsgs_for_kvm); +#if IS_ENABLED(CONFIG_KVM) +EXPORT_SYMBOL_GPL(current_save_fsgs); #endif static __always_inline void loadseg(enum which_selector which, @@ -278,14 +342,26 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, struct thread_struct *next) { - load_seg_legacy(prev->fsindex, prev->fsbase, - next->fsindex, next->fsbase, FS); - load_seg_legacy(prev->gsindex, prev->gsbase, - next->gsindex, next->gsbase, GS); + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* Update the FS and GS selectors if they could have changed. */ + if (unlikely(prev->fsindex || next->fsindex)) + loadseg(FS, next->fsindex); + if (unlikely(prev->gsindex || next->gsindex)) + loadseg(GS, next->gsindex); + + /* Update the bases. 
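The save/restore paths above switch to RDFSBASE/RDGSBASE (with SWAPGS around the inactive GS base) once X86_FEATURE_FSGSBASE is set. With CR4.FSGSBASE enabled by the kernel, userspace can issue the same instructions; below is a hedged probe using getauxval() and the compiler intrinsics, assuming HWCAP2_FSGSBASE is bit 1 as in the uapi header (build with -mfsgsbase).

#include <stdio.h>
#include <stdint.h>
#include <sys/auxv.h>
#include <immintrin.h>

#ifndef HWCAP2_FSGSBASE
#define HWCAP2_FSGSBASE (1 << 1)   /* assumed to match asm/hwcap2.h */
#endif

int main(void)
{
    unsigned long hwcap2 = getauxval(AT_HWCAP2);

    if (!(hwcap2 & HWCAP2_FSGSBASE)) {
        puts("kernel/CPU did not enable FSGSBASE for userspace");
        return 0;
    }

    /* RDFSBASE/RDGSBASE now work unprivileged; FS base is the TLS pointer. */
    uint64_t fsbase = _readfsbase_u64();
    uint64_t gsbase = _readgsbase_u64();

    printf("fsbase = %#llx, gsbase = %#llx\n",
           (unsigned long long)fsbase, (unsigned long long)gsbase);
    return 0;
}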
*/ + wrfsbase(next->fsbase); + __wrgsbase_inactive(next->gsbase); + } else { + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); + } } -static unsigned long x86_fsgsbase_read_task(struct task_struct *task, - unsigned short selector) +unsigned long x86_fsgsbase_read_task(struct task_struct *task, + unsigned short selector) { unsigned short idx = selector >> 3; unsigned long base; @@ -314,7 +390,7 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task, */ mutex_lock(&task->mm->context.lock); ldt = task->mm->context.ldt; - if (unlikely(idx >= ldt->nr_entries)) + if (unlikely(!ldt || idx >= ldt->nr_entries)) base = 0; else base = get_desc_base(ldt->entries + idx); @@ -327,13 +403,44 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task, return base; } +unsigned long x86_gsbase_read_cpu_inactive(void) +{ + unsigned long gsbase; + + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + unsigned long flags; + + local_irq_save(flags); + gsbase = __rdgsbase_inactive(); + local_irq_restore(flags); + } else { + rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + } + + return gsbase; +} + +void x86_gsbase_write_cpu_inactive(unsigned long gsbase) +{ + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + unsigned long flags; + + local_irq_save(flags); + __wrgsbase_inactive(gsbase); + local_irq_restore(flags); + } else { + wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + } +} + unsigned long x86_fsbase_read_task(struct task_struct *task) { unsigned long fsbase; if (task == current) fsbase = x86_fsbase_read_cpu(); - else if (task->thread.fsindex == 0) + else if (static_cpu_has(X86_FEATURE_FSGSBASE) || + (task->thread.fsindex == 0)) fsbase = task->thread.fsbase; else fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); @@ -347,7 +454,8 @@ unsigned long x86_gsbase_read_task(struct task_struct *task) if (task == current) gsbase = x86_gsbase_read_cpu_inactive(); - else if (task->thread.gsindex == 0) + else if (static_cpu_has(X86_FEATURE_FSGSBASE) || + (task->thread.gsindex == 0)) gsbase = task->thread.gsbase; else gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 44130588987f..5679aa3fdcb8 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -281,17 +281,9 @@ static int set_segment_reg(struct task_struct *task, return -EIO; /* - * This function has some ABI oddities. - * - * A 32-bit ptracer probably expects that writing FS or GS will change - * FSBASE or GSBASE respectively. In the absence of FSGSBASE support, - * this code indeed has that effect. When FSGSBASE is added, this - * will require a special case. - * - * For existing 64-bit ptracers, writing FS or GS *also* currently - * changes the base if the selector is nonzero the next time the task - * is run. This behavior may not be needed, and trying to preserve it - * when FSGSBASE is added would be complicated at best. + * Writes to FS and GS will change the stored selector. Whether + * this changes the segment base as well depends on whether + * FSGSBASE is enabled. */ switch (offset) { @@ -379,25 +371,12 @@ static int putreg(struct task_struct *child, case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_MAX) return -EIO; - /* - * When changing the FS base, use do_arch_prctl_64() - * to set the index to zero and to set the base - * as requested. 
- * - * NB: This behavior is nonsensical and likely needs to - * change when FSGSBASE support is added. - */ - if (child->thread.fsbase != value) - return do_arch_prctl_64(child, ARCH_SET_FS, value); + x86_fsbase_write_task(child, value); return 0; case offsetof(struct user_regs_struct,gs_base): - /* - * Exactly the same here as the %fs handling above. - */ if (value >= TASK_SIZE_MAX) return -EIO; - if (child->thread.gsbase != value) - return do_arch_prctl_64(child, ARCH_SET_GS, value); + x86_gsbase_write_task(child, value); return 0; #endif } @@ -433,26 +412,12 @@ static unsigned long getreg(struct task_struct *task, unsigned long offset) static int genregs_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - if (kbuf) { - unsigned long *k = kbuf; - while (count >= sizeof(*k)) { - *k++ = getreg(target, pos); - count -= sizeof(*k); - pos += sizeof(*k); - } - } else { - unsigned long __user *u = ubuf; - while (count >= sizeof(*u)) { - if (__put_user(getreg(target, pos), u++)) - return -EFAULT; - count -= sizeof(*u); - pos += sizeof(*u); - } - } + int reg; + for (reg = 0; to.left; reg++) + membuf_store(&to, getreg(target, reg * sizeof(unsigned long))); return 0; } @@ -716,16 +681,14 @@ static int ioperm_active(struct task_struct *target, static int ioperm_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct io_bitmap *iobm = target->thread.io_bitmap; if (!iobm) return -ENXIO; - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - iobm->bitmap, 0, IO_BITMAP_BYTES); + return membuf_write(&to, iobm->bitmap, IO_BITMAP_BYTES); } /* @@ -880,14 +843,39 @@ long arch_ptrace(struct task_struct *child, long request, static int putreg32(struct task_struct *child, unsigned regno, u32 value) { struct pt_regs *regs = task_pt_regs(child); + int ret; switch (regno) { SEG32(cs); SEG32(ds); SEG32(es); - SEG32(fs); - SEG32(gs); + + /* + * A 32-bit ptracer on a 64-bit kernel expects that writing + * FS or GS will also update the base. This is needed for + * operations like PTRACE_SETREGS to fully restore a saved + * CPU state. 
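After the putreg() change above, a ptrace write to fs_base/gs_base simply stores the value via x86_fsbase_write_task()/x86_gsbase_write_task() instead of detouring through ARCH_SET_FS/GS; from a debugger's point of view the ABI stays PTRACE_POKEUSER at the field's offset in struct user_regs_struct. A minimal x86-64 tracer sketch (error handling omitted, the poked value is arbitrary and the child is killed before it could run with it).

#include <stdio.h>
#include <stddef.h>
#include <signal.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/user.h>
#include <sys/wait.h>

int main(void)
{
    pid_t pid = fork();

    if (pid == 0) {
        /* Child: ask to be traced and stop until the parent is done. */
        ptrace(PTRACE_TRACEME, 0, NULL, NULL);
        raise(SIGSTOP);
        _exit(0);
    }

    waitpid(pid, NULL, 0);   /* child is now in a ptrace stop */

    size_t off = offsetof(struct user_regs_struct, fs_base);
    unsigned long new_base = 0x100000;   /* arbitrary canonical value for the demo */

    ptrace(PTRACE_POKEUSER, pid, (void *)off, (void *)new_base);
    long got = ptrace(PTRACE_PEEKUSER, pid, (void *)off, NULL);
    printf("fs_base reads back as %#lx\n", (unsigned long)got);

    kill(pid, SIGKILL);      /* never resume the child with a bogus FS base */
    waitpid(pid, NULL, 0);
    return 0;
}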
+ */ + + case offsetof(struct user32, regs.fs): + ret = set_segment_reg(child, + offsetof(struct user_regs_struct, fs), + value); + if (ret == 0) + child->thread.fsbase = + x86_fsgsbase_read_task(child, value); + return ret; + + case offsetof(struct user32, regs.gs): + ret = set_segment_reg(child, + offsetof(struct user_regs_struct, gs), + value); + if (ret == 0) + child->thread.gsbase = + x86_fsgsbase_read_task(child, value); + return ret; + SEG32(ss); R32(ebx, bx); @@ -1003,28 +991,15 @@ static int getreg32(struct task_struct *child, unsigned regno, u32 *val) static int genregs32_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { - if (kbuf) { - compat_ulong_t *k = kbuf; - while (count >= sizeof(*k)) { - getreg32(target, pos, k++); - count -= sizeof(*k); - pos += sizeof(*k); - } - } else { - compat_ulong_t __user *u = ubuf; - while (count >= sizeof(*u)) { - compat_ulong_t word; - getreg32(target, pos, &word); - if (__put_user(word, u++)) - return -EFAULT; - count -= sizeof(*u); - pos += sizeof(*u); - } - } + int reg; + for (reg = 0; to.left; reg++) { + u32 val; + getreg32(target, reg * 4, &val); + membuf_store(&to, val); + } return 0; } @@ -1234,25 +1209,25 @@ static struct user_regset x86_64_regsets[] __ro_after_init = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct) / sizeof(long), .size = sizeof(long), .align = sizeof(long), - .get = genregs_get, .set = genregs_set + .regset_get = genregs_get, .set = genregs_set }, [REGSET_FP] = { .core_note_type = NT_PRFPREG, .n = sizeof(struct user_i387_struct) / sizeof(long), .size = sizeof(long), .align = sizeof(long), - .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set + .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, [REGSET_XSTATE] = { .core_note_type = NT_X86_XSTATE, .size = sizeof(u64), .align = sizeof(u64), - .active = xstateregs_active, .get = xstateregs_get, + .active = xstateregs_active, .regset_get = xstateregs_get, .set = xstateregs_set }, [REGSET_IOPERM64] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_LONGS, .size = sizeof(long), .align = sizeof(long), - .active = ioperm_active, .get = ioperm_get + .active = ioperm_active, .regset_get = ioperm_get }, }; @@ -1275,24 +1250,24 @@ static struct user_regset x86_32_regsets[] __ro_after_init = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct32) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .get = genregs32_get, .set = genregs32_set + .regset_get = genregs32_get, .set = genregs32_set }, [REGSET_FP] = { .core_note_type = NT_PRFPREG, .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .active = regset_fpregs_active, .get = fpregs_get, .set = fpregs_set + .active = regset_fpregs_active, .regset_get = fpregs_get, .set = fpregs_set }, [REGSET_XFP] = { .core_note_type = NT_PRXFPREG, .n = sizeof(struct user32_fxsr_struct) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set + .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, [REGSET_XSTATE] = { .core_note_type = NT_X86_XSTATE, .size = sizeof(u64), .align = sizeof(u64), - .active = xstateregs_active, .get = xstateregs_get, + .active = xstateregs_active, .regset_get = xstateregs_get, .set = xstateregs_set }, [REGSET_TLS] = { @@ -1301,13 
+1276,13 @@ static struct user_regset x86_32_regsets[] __ro_after_init = { .size = sizeof(struct user_desc), .align = sizeof(struct user_desc), .active = regset_tls_active, - .get = regset_tls_get, .set = regset_tls_set + .regset_get = regset_tls_get, .set = regset_tls_set }, [REGSET_IOPERM32] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_BYTES / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), - .active = ioperm_active, .get = ioperm_get + .active = ioperm_active, .regset_get = ioperm_get }, }; diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 896d74cb5081..1b10717c9321 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -95,7 +95,7 @@ static void ich_force_hpet_resume(void) static void ich_force_enable_hpet(struct pci_dev *dev) { u32 val; - u32 uninitialized_var(rcba); + u32 rcba; int err = 0; if (hpet_address || force_hpet_address) @@ -185,7 +185,7 @@ static void hpet_print_force_info(void) static void old_ich_force_hpet_resume(void) { u32 val; - u32 uninitialized_var(gen_cntl); + u32 gen_cntl; if (!force_hpet_address || !cached_dev) return; @@ -207,7 +207,7 @@ static void old_ich_force_hpet_resume(void) static void old_ich_force_enable_hpet(struct pci_dev *dev) { u32 val; - u32 uninitialized_var(gen_cntl); + u32 gen_cntl; if (hpet_address || force_hpet_address) return; @@ -298,7 +298,7 @@ static void vt8237_force_hpet_resume(void) static void vt8237_force_enable_hpet(struct pci_dev *dev) { - u32 uninitialized_var(val); + u32 val; if (hpet_address || force_hpet_address) return; @@ -429,7 +429,7 @@ static void nvidia_force_hpet_resume(void) static void nvidia_force_enable_hpet(struct pci_dev *dev) { - u32 uninitialized_var(val); + u32 val; if (hpet_address || force_hpet_address) return; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a3767e74c758..3511736fbc74 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -25,6 +25,7 @@ #include <xen/xen.h> #include <asm/apic.h> +#include <asm/numa.h> #include <asm/bios_ebda.h> #include <asm/bugs.h> #include <asm/cpu.h> @@ -870,8 +871,6 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_BLK_DEV_RAM rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; - rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); - rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); #endif #ifdef CONFIG_EFI if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 399f97abee02..d5fa494c2304 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -25,6 +25,7 @@ #include <linux/user-return-notifier.h> #include <linux/uprobes.h> #include <linux/context_tracking.h> +#include <linux/entry-common.h> #include <linux/syscalls.h> #include <asm/processor.h> @@ -803,7 +804,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. 
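The regset conversion above replaces the pos/count/kbuf/ubuf quadruple with a struct membuf cursor that ->regset_get() fills until .left runs out. A simplified user-space model of that interface; the real type lives in linux/regset.h, this one only captures the shape.

#include <stdio.h>
#include <string.h>

struct membuf {
    void *p;
    size_t left;
};

/* Copy a blob into the buffer, truncating at the end like the kernel helper. */
static int membuf_write(struct membuf *s, const void *v, size_t size)
{
    if (size > s->left)
        size = s->left;
    memcpy(s->p, v, size);
    s->p = (char *)s->p + size;
    s->left -= size;
    return (int)s->left;
}

#define membuf_store(s, v) membuf_write(s, &(v), sizeof(v))

/* Toy "regset": dump an array of longs the way genregs_get() now does. */
static int toy_regs_get(const unsigned long *regs, int nregs, struct membuf to)
{
    for (int reg = 0; to.left && reg < nregs; reg++) {
        unsigned long val = regs[reg];
        membuf_store(&to, val);
    }
    return 0;
}

int main(void)
{
    unsigned long regs[4] = { 1, 2, 3, 4 };
    unsigned long out[4] = { 0 };
    struct membuf to = { .p = out, .left = sizeof(out) };

    toy_regs_get(regs, 4, to);
    printf("%lu %lu %lu %lu\n", out[0], out[1], out[2], out[3]);
    return 0;
}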
*/ -void do_signal(struct pt_regs *regs) +void arch_do_signal(struct pt_regs *regs) { struct ksignal ksig; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ffbd9a3d78d8..27aa04a95702 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -51,11 +51,11 @@ #include <linux/err.h> #include <linux/nmi.h> #include <linux/tboot.h> -#include <linux/stackprotector.h> #include <linux/gfp.h> #include <linux/cpuidle.h> #include <linux/numa.h> #include <linux/pgtable.h> +#include <linux/overflow.h> #include <asm/acpi.h> #include <asm/desc.h> @@ -80,6 +80,7 @@ #include <asm/cpu_device_id.h> #include <asm/spec-ctrl.h> #include <asm/hw_irq.h> +#include <asm/stackprotector.h> /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); @@ -259,21 +260,10 @@ static void notrace start_secondary(void *unused) /* enable local interrupts */ local_irq_enable(); - /* to prevent fake stack check failure in clock setup */ - boot_init_stack_canary(); - x86_cpuinit.setup_percpu_clockev(); wmb(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); - - /* - * Prevent tail call to cpu_startup_entry() because the stack protector - * guard has been changed a couple of function calls up, in - * boot_init_stack_canary() and must not be checked before tail calling - * another function. - */ - prevent_tail_call_optimization(); } /** @@ -1011,6 +1001,7 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle) alternatives_enable_smp(); per_cpu(current_task, cpu) = idle; + cpu_init_stack_canary(cpu, idle); /* Initialize the interrupt stack(s) */ ret = irq_init_percpu_irqstack(cpu); @@ -1777,6 +1768,7 @@ void native_play_dead(void) #endif +#ifdef CONFIG_X86_64 /* * APERF/MPERF frequency ratio computation. * @@ -1975,6 +1967,7 @@ static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) static bool intel_set_max_freq_ratio(void) { u64 base_freq, turbo_freq; + u64 turbo_ratio; if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) goto out; @@ -2000,15 +1993,23 @@ out: /* * Some hypervisors advertise X86_FEATURE_APERFMPERF * but then fill all MSR's with zeroes. + * Some CPUs have turbo boost but don't declare any turbo ratio + * in MSR_TURBO_RATIO_LIMIT. 
*/ - if (!base_freq) { - pr_debug("Couldn't determine cpu base frequency, necessary for scale-invariant accounting.\n"); + if (!base_freq || !turbo_freq) { + pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); return false; } - arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, - base_freq); + turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); + if (!turbo_ratio) { + pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); + return false; + } + + arch_turbo_freq_ratio = turbo_ratio; arch_set_max_freq_ratio(turbo_disabled()); + return true; } @@ -2048,11 +2049,19 @@ static void init_freq_invariance(bool secondary) } } +static void disable_freq_invariance_workfn(struct work_struct *work) +{ + static_branch_disable(&arch_scale_freq_key); +} + +static DECLARE_WORK(disable_freq_invariance_work, + disable_freq_invariance_workfn); + DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; void arch_scale_freq_tick(void) { - u64 freq_scale; + u64 freq_scale = SCHED_CAPACITY_SCALE; u64 aperf, mperf; u64 acnt, mcnt; @@ -2064,19 +2073,32 @@ void arch_scale_freq_tick(void) acnt = aperf - this_cpu_read(arch_prev_aperf); mcnt = mperf - this_cpu_read(arch_prev_mperf); - if (!mcnt) - return; this_cpu_write(arch_prev_aperf, aperf); this_cpu_write(arch_prev_mperf, mperf); - acnt <<= 2*SCHED_CAPACITY_SHIFT; - mcnt *= arch_max_freq_ratio; + if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) + goto error; + + if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) + goto error; freq_scale = div64_u64(acnt, mcnt); + if (!freq_scale) + goto error; if (freq_scale > SCHED_CAPACITY_SCALE) freq_scale = SCHED_CAPACITY_SCALE; this_cpu_write(arch_freq_scale, freq_scale); + return; + +error: + pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); + schedule_work(&disable_freq_invariance_work); +} +#else +static inline void init_freq_invariance(bool secondary) +{ } +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/sys_ia32.c b/arch/x86/kernel/sys_ia32.c index f8d65c99feb8..720cde885042 100644 --- a/arch/x86/kernel/sys_ia32.c +++ b/arch/x86/kernel/sys_ia32.c @@ -251,9 +251,6 @@ COMPAT_SYSCALL_DEFINE5(ia32_clone, unsigned long, clone_flags, .tls = tls_val, }; - if (!legacy_clone_args_valid(&args)) - return -EINVAL; - return _do_fork(&args); } #endif /* CONFIG_IA32_EMULATION */ diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 71d3fef1edc9..64a496a0687f 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -256,36 +256,16 @@ int regset_tls_active(struct task_struct *target, } int regset_tls_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { const struct desc_struct *tls; + struct user_desc v; + int pos; - if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || - (pos % sizeof(struct user_desc)) != 0 || - (count % sizeof(struct user_desc)) != 0) - return -EINVAL; - - pos /= sizeof(struct user_desc); - count /= sizeof(struct user_desc); - - tls = &target->thread.tls_array[pos]; - - if (kbuf) { - struct user_desc *info = kbuf; - while (count-- > 0) - fill_user_desc(info++, GDT_ENTRY_TLS_MIN + pos++, - tls++); - } else { - struct user_desc __user *u_info = ubuf; - while (count-- > 0) { - struct user_desc info; - fill_user_desc(&info, GDT_ENTRY_TLS_MIN + pos++, tls++); - if (__copy_to_user(u_info++, &info, sizeof(info))) - return 
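arch_scale_freq_tick() above now guards the APERF/MPERF ratio against shift/multiply overflow and degenerate results, scheduling work to disable frequency invariance instead of publishing a bogus scale. A user-space rendition of the arithmetic using the compiler's overflow builtins in place of check_shl_overflow()/check_mul_overflow(); the sample counter deltas are invented.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define SCHED_CAPACITY_SHIFT 10
#define SCHED_CAPACITY_SCALE (1UL << SCHED_CAPACITY_SHIFT)

/*
 * freq_scale = (acnt << 2*SHIFT) / (mcnt * ratio), clamped to
 * SCHED_CAPACITY_SCALE. Returns false on overflow or a zero result,
 * which in the kernel disables the whole mechanism.
 */
static bool scale_freq(uint64_t acnt, uint64_t mcnt, uint64_t ratio,
                       uint64_t *freq_scale)
{
    uint64_t shifted, scaled_mcnt, scale;

    if (__builtin_mul_overflow(acnt, 1ULL << (2 * SCHED_CAPACITY_SHIFT), &shifted))
        return false;
    if (__builtin_mul_overflow(mcnt, ratio, &scaled_mcnt) || !scaled_mcnt)
        return false;

    scale = shifted / scaled_mcnt;
    if (!scale)
        return false;
    if (scale > SCHED_CAPACITY_SCALE)
        scale = SCHED_CAPACITY_SCALE;

    *freq_scale = scale;
    return true;
}

int main(void)
{
    uint64_t scale;

    /* acnt/mcnt are deltas of two APERF/MPERF samples, ratio = turbo/base * 1024. */
    if (scale_freq(180000, 200000, 1400, &scale))
        printf("freq_scale = %llu / %lu\n",
               (unsigned long long)scale, SCHED_CAPACITY_SCALE);
    else
        puts("wobbly counters, would disable freq invariance");
    return 0;
}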
-EFAULT; - } + for (pos = 0, tls = target->thread.tls_array; to.left; pos++, tls++) { + fill_user_desc(&v, GDT_ENTRY_TLS_MIN + pos, tls); + membuf_write(&to, &v, sizeof(v)); } - return 0; } diff --git a/arch/x86/kernel/tls.h b/arch/x86/kernel/tls.h index 3a76e1d3535e..fc39447a0c1a 100644 --- a/arch/x86/kernel/tls.h +++ b/arch/x86/kernel/tls.h @@ -12,7 +12,7 @@ #include <linux/regset.h> extern user_regset_active_fn regset_tls_active; -extern user_regset_get_fn regset_tls_get; +extern user_regset_get2_fn regset_tls_get; extern user_regset_set_fn regset_tls_set; #endif /* _ARCH_X86_KERNEL_TLS_H */ diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index b8810ebbc8ae..0a2ec801b63f 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -31,6 +31,7 @@ #include <linux/init.h> #include <linux/smp.h> #include <linux/irq.h> +#include <asm/io_apic.h> #include <asm/cpu.h> static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b7cb3e0716f7..1f66d2d1e998 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -62,7 +62,6 @@ #ifdef CONFIG_X86_64 #include <asm/x86_init.h> -#include <asm/pgalloc.h> #include <asm/proto.h> #else #include <asm/processor-flags.h> @@ -245,7 +244,7 @@ static noinstr bool handle_bug(struct pt_regs *regs) DEFINE_IDTENTRY_RAW(exc_invalid_op) { - bool rcu_exit; + irqentry_state_t state; /* * We use UD2 as a short encoding for 'CALL __WARN', as such @@ -255,11 +254,11 @@ DEFINE_IDTENTRY_RAW(exc_invalid_op) if (!user_mode(regs) && handle_bug(regs)) return; - rcu_exit = idtentry_enter_cond_rcu(regs); + state = irqentry_enter(regs); instrumentation_begin(); handle_invalid_op(regs); instrumentation_end(); - idtentry_exit_cond_rcu(regs, rcu_exit); + irqentry_exit(regs, state); } DEFINE_IDTENTRY(exc_coproc_segment_overrun) @@ -405,7 +404,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) } #endif - nmi_enter(); + idtentry_enter_nmi(regs); instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -638,28 +637,25 @@ DEFINE_IDTENTRY_RAW(exc_int3) return; /* - * idtentry_enter_user() uses static_branch_{,un}likely() and therefore - * can trigger INT3, hence poke_int3_handler() must be done - * before. If the entry came from kernel mode, then use nmi_enter() - * because the INT3 could have been hit in any context including - * NMI. + * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() + * and therefore can trigger INT3, hence poke_int3_handler() must + * be done before. If the entry came from kernel mode, then use + * nmi_enter() because the INT3 could have been hit in any context + * including NMI. 
*/ if (user_mode(regs)) { - idtentry_enter_user(regs); + irqentry_enter_from_user_mode(regs); instrumentation_begin(); do_int3_user(regs); instrumentation_end(); - idtentry_exit_user(regs); + irqentry_exit_to_user_mode(regs); } else { - nmi_enter(); + bool irq_state = idtentry_enter_nmi(regs); instrumentation_begin(); - trace_hardirqs_off_finish(); if (!do_int3(regs)) die("int3", regs, 0); - if (regs->flags & X86_EFLAGS_IF) - trace_hardirqs_on_prepare(); instrumentation_end(); - nmi_exit(); + idtentry_exit_nmi(regs, irq_state); } } @@ -867,9 +863,8 @@ out: static __always_inline void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) { - nmi_enter(); + bool irq_state = idtentry_enter_nmi(regs); instrumentation_begin(); - trace_hardirqs_off_finish(); /* * If something gets miswired and we end up here for a user mode @@ -886,10 +881,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, handle_debug(regs, dr6, false); - if (regs->flags & X86_EFLAGS_IF) - trace_hardirqs_on_prepare(); instrumentation_end(); - nmi_exit(); + idtentry_exit_nmi(regs, irq_state); } static __always_inline void exc_debug_user(struct pt_regs *regs, @@ -901,12 +894,13 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, */ WARN_ON_ONCE(!user_mode(regs)); - idtentry_enter_user(regs); + irqentry_enter_from_user_mode(regs); instrumentation_begin(); handle_debug(regs, dr6, true); + instrumentation_end(); - idtentry_exit_user(regs); + irqentry_exit_to_user_mode(regs); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 4fec6f3a1858..46c72f2ec32f 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -7,6 +7,7 @@ */ #include <linux/kernel.h> +#include <linux/thread_info.h> #include <asm/apic.h> #include <asm/cpu_device_id.h> diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 722a85f3b2dd..d7c44b257f7f 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -269,13 +269,13 @@ bool unwind_next_frame(struct unwind_state *state) /* * kthreads (other than the boot CPU's idle thread) have some * partial regs at the end of their stack which were placed - * there by copy_thread_tls(). But the regs don't have any + * there by copy_thread(). But the regs don't have any * useful information, so we can skip them. * * This user_mode() check is slightly broader than a PF_KTHREAD * check because it also catches the awkward situation where a * newly forked kthread transitions into a user task by calling - * do_execve(), which eventually clears PF_KTHREAD. + * kernel_execve(), which eventually clears PF_KTHREAD. 
*/ if (!user_mode(regs)) goto the_end; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index b277a2db6267..fbd5bd7a945a 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -42,6 +42,7 @@ config KVM select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_NO_POLL + select KVM_XFER_TO_GUEST_WORK select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_VFIO select SRCU diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8a294f9747aa..3fd6eec202d7 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -54,28 +54,38 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) #define F feature_bit -int kvm_update_cpuid(struct kvm_vcpu *vcpu) +static int kvm_check_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - struct kvm_lapic *apic = vcpu->arch.apic; - best = kvm_find_cpuid_entry(vcpu, 1, 0); - if (!best) - return 0; + /* + * The existing code assumes virtual address is 48-bit or 57-bit in the + * canonical address checks; exit if it is ever changed. + */ + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + if (best) { + int vaddr_bits = (best->eax & 0xff00) >> 8; + + if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0) + return -EINVAL; + } - /* Update OSXSAVE bit */ - if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1) - cpuid_entry_change(best, X86_FEATURE_OSXSAVE, + return 0; +} + +void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + if (best) { + /* Update OSXSAVE bit */ + if (boot_cpu_has(X86_FEATURE_XSAVE)) + cpuid_entry_change(best, X86_FEATURE_OSXSAVE, kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)); - cpuid_entry_change(best, X86_FEATURE_APIC, + cpuid_entry_change(best, X86_FEATURE_APIC, vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); - - if (apic) { - if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) - apic->lapic_timer.timer_mode_mask = 3 << 17; - else - apic->lapic_timer.timer_mode_mask = 1 << 17; } best = kvm_find_cpuid_entry(vcpu, 7, 0); @@ -84,31 +94,14 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) kvm_read_cr4_bits(vcpu, X86_CR4_PKE)); best = kvm_find_cpuid_entry(vcpu, 0xD, 0); - if (!best) { - vcpu->arch.guest_supported_xcr0 = 0; - } else { - vcpu->arch.guest_supported_xcr0 = - (best->eax | ((u64)best->edx << 32)) & supported_xcr0; + if (best) best->ebx = xstate_required_size(vcpu->arch.xcr0, false); - } best = kvm_find_cpuid_entry(vcpu, 0xD, 1); if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - /* - * The existing code assumes virtual address is 48-bit or 57-bit in the - * canonical address checks; exit if it is ever changed. 
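kvm_check_cpuid() above rejects guest CPUID tables whose advertised virtual-address width (leaf 0x80000008, EAX[15:8]) is anything other than 0, 48 or 57, because the canonical-address checks hard-code those widths. The same field can be probed on the host with the cpuid.h helper; a small sketch.

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx)) {
        puts("leaf 0x80000008 not supported");
        return 0;
    }

    unsigned int phys_bits  = eax & 0xff;          /* EAX[7:0]  */
    unsigned int vaddr_bits = (eax >> 8) & 0xff;   /* EAX[15:8] */

    printf("physical address bits: %u\n", phys_bits);
    printf("virtual  address bits: %u\n", vaddr_bits);

    if (vaddr_bits != 0 && vaddr_bits != 48 && vaddr_bits != 57)
        puts("unexpected width; KVM would refuse this in a guest CPUID table");
    return 0;
}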
- */ - best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); - if (best) { - int vaddr_bits = (best->eax & 0xff00) >> 8; - - if (vaddr_bits != 48 && vaddr_bits != 57 && vaddr_bits != 0) - return -EINVAL; - } - best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0); if (kvm_hlt_in_guest(vcpu->kvm) && best && (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) @@ -121,14 +114,39 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT); } +} + +static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_cpuid_entry2 *best; + + kvm_x86_ops.vcpu_after_set_cpuid(vcpu); + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + if (best && apic) { + if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) + apic->lapic_timer.timer_mode_mask = 3 << 17; + else + apic->lapic_timer.timer_mode_mask = 1 << 17; + + kvm_apic_set_version(vcpu); + } + + best = kvm_find_cpuid_entry(vcpu, 0xD, 0); + if (!best) + vcpu->arch.guest_supported_xcr0 = 0; + else + vcpu->arch.guest_supported_xcr0 = + (best->eax | ((u64)best->edx << 32)) & supported_xcr0; - /* Note, maxphyaddr must be updated before tdp_level. */ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu); kvm_mmu_reset_context(vcpu); kvm_pmu_refresh(vcpu); - return 0; + vcpu->arch.cr4_guest_rsvd_bits = + __cr4_reserved_bits(guest_cpuid_has, vcpu); + kvm_x86_ops.update_exception_bitmap(vcpu); } static int is_efer_nx(void) @@ -203,10 +221,16 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_entries[i].padding[2] = 0; } vcpu->arch.cpuid_nent = cpuid->nent; + r = kvm_check_cpuid(vcpu); + if (r) { + vcpu->arch.cpuid_nent = 0; + kvfree(cpuid_entries); + goto out; + } + cpuid_fix_nx_cap(vcpu); - kvm_apic_set_version(vcpu); - kvm_x86_ops.cpuid_update(vcpu); - r = kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); + kvm_vcpu_after_set_cpuid(vcpu); kvfree(cpuid_entries); out: @@ -227,9 +251,14 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; vcpu->arch.cpuid_nent = cpuid->nent; - kvm_apic_set_version(vcpu); - kvm_x86_ops.cpuid_update(vcpu); - r = kvm_update_cpuid(vcpu); + r = kvm_check_cpuid(vcpu); + if (r) { + vcpu->arch.cpuid_nent = 0; + goto out; + } + + kvm_update_cpuid_runtime(vcpu); + kvm_vcpu_after_set_cpuid(vcpu); out: return r; } @@ -341,7 +370,8 @@ void kvm_set_cpu_caps(void) kvm_cpu_cap_mask(CPUID_7_EDX, F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | - F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) + F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) | + F(SERIALIZE) ); /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. 
*/ @@ -604,7 +634,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) eax.split.bit_width = cap.bit_width_gp; eax.split.mask_length = cap.events_mask_len; - edx.split.num_counters_fixed = cap.num_counters_fixed; + edx.split.num_counters_fixed = min(cap.num_counters_fixed, MAX_FIXED_COUNTERS); edx.split.bit_width_fixed = cap.bit_width_fixed; edx.split.reserved = 0; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 05434cd9342f..3a923ae15f2f 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -9,7 +9,7 @@ extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly; void kvm_set_cpu_caps(void); -int kvm_update_cpuid(struct kvm_vcpu *vcpu); +void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index af9cdb426dd2..814d3aee5cef 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -900,6 +900,7 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV); synic->active = true; synic->dont_zero_synic_pages = dont_zero_synic_pages; + synic->control = HV_SYNIC_CONTROL_ENABLE; return 0; } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4ce2ddd26c0b..5ccbee7165a2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -354,7 +354,6 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) void kvm_apic_set_version(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - struct kvm_cpuid_entry2 *feat; u32 v = APIC_VERSION; if (!lapic_in_kernel(vcpu)) @@ -367,8 +366,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) * version first and level-triggered interrupts never get EOIed in * IOAPIC. 
*/ - feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); - if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))) && + if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) && !ioapic_in_kernel(vcpu->kvm)) v |= APIC_LVR_DIRECTED_EOI; kvm_lapic_set_reg(apic, APIC_LVR, v); @@ -2068,7 +2066,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_TDCR: { uint32_t old_divisor = apic->divide_count; - kvm_lapic_set_reg(apic, APIC_TDCR, val); + kvm_lapic_set_reg(apic, APIC_TDCR, val & 0xb); update_divide_count(apic); if (apic->divide_count != old_divisor && apic->lapic_timer.period) { @@ -2085,7 +2083,8 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_SELF_IPI: if (apic_x2apic_mode(apic)) { - kvm_lapic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff)); + kvm_lapic_reg_write(apic, APIC_ICR, + APIC_DEST_SELF | (val & APIC_VECTOR_MASK)); } else ret = 1; break; @@ -2232,7 +2231,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) vcpu->arch.apic_base = value; if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); if (!apic) return; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 444bb9c54548..5efc6081ca13 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -4,6 +4,7 @@ #include <linux/kvm_host.h> #include "kvm_cache_regs.h" +#include "cpuid.h" #define PT64_PT_BITS 9 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) @@ -57,22 +58,14 @@ void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots); -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer); +void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, + gpa_t nested_cr3); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty, gpa_t new_eptp); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); -static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) -{ - if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) - return kvm->arch.n_max_mmu_pages - - kvm->arch.n_used_mmu_pages; - - return 0; -} - static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE)) @@ -97,9 +90,13 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) { - if (VALID_PAGE(vcpu->arch.mmu->root_hpa)) - kvm_x86_ops.load_mmu_pgd(vcpu, vcpu->arch.mmu->root_hpa | - kvm_get_active_pcid(vcpu)); + u64 root_hpa = vcpu->arch.mmu->root_hpa; + + if (!VALID_PAGE(root_hpa)) + return; + + kvm_x86_ops.load_mmu_pgd(vcpu, root_hpa | kvm_get_active_pcid(vcpu), + vcpu->arch.mmu->shadow_root_level); } int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, @@ -158,6 +155,11 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } +static inline bool kvm_mmu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu))); +} + /* * Check if a given access (described through the I/D, W/R and U/S bits of a * page fault error code pfec) causes a permission fault with the given PTE @@ -218,11 +220,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); -void 
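The APIC_SELF_IPI write above now builds the ICR value from the self-destination shorthand plus the masked vector rather than a raw 0x40000 literal, and APIC_TDCR writes are masked to the architecturally valid divide-configuration bits. A small sketch of those value manipulations; the constants are restated locally and assumed to mirror asm/apicdef.h.

#include <stdint.h>
#include <stdio.h>

#define APIC_DEST_SELF    0x40000u   /* destination shorthand: self (assumed value) */
#define APIC_VECTOR_MASK  0x000ffu
#define APIC_TDCR_MASK    0xbu       /* valid divide-configuration bits 0, 1 and 3 */

static uint32_t self_ipi_to_icr(uint32_t self_ipi_val)
{
    /* Only the vector byte of the SELF_IPI register is meaningful. */
    return APIC_DEST_SELF | (self_ipi_val & APIC_VECTOR_MASK);
}

int main(void)
{
    printf("SELF_IPI 0x123 -> ICR %#x\n", self_ipi_to_icr(0x123)); /* keeps vector 0x23 */
    printf("TDCR write 0xff stored as %#x\n", 0xffu & APIC_TDCR_MASK);
    return 0;
}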
kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); -bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, - struct kvm_memory_slot *slot, u64 gfn); -int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa); +int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6d6a0ae7800c..4e03841f053d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -18,6 +18,7 @@ #include "irq.h" #include "ioapic.h" #include "mmu.h" +#include "mmu_internal.h" #include "x86.h" #include "kvm_cache_regs.h" #include "kvm_emulate.h" @@ -91,7 +92,8 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); */ bool tdp_enabled = false; -static int max_page_level __read_mostly; +static int max_huge_page_level __read_mostly; +static int max_tdp_level __read_mostly; enum { AUDIT_PRE_PAGE_FAULT, @@ -515,6 +517,18 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) return likely(kvm_gen == spte_gen); } +static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, + struct x86_exception *exception) +{ + /* Check if guest physical address doesn't exceed guest maximum */ + if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) { + exception->error_code |= PFERR_RSVD_MASK; + return UNMAPPED_GVA; + } + + return gpa; +} + /* * Sets the shadow PTE masks used by the MMU. * @@ -676,7 +690,7 @@ union split_spte { static void count_spte_clear(u64 *sptep, u64 spte) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_mmu_page *sp = sptep_to_sp(sptep); if (is_shadow_present_pte(spte)) return; @@ -760,7 +774,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) */ static u64 __get_spte_lockless(u64 *sptep) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_mmu_page *sp = sptep_to_sp(sptep); union split_spte spte, *orig = (union split_spte *)sptep; int count; @@ -1060,94 +1074,40 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) local_irq_enable(); } -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - struct kmem_cache *base_cache, int min) -{ - void *obj; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT); - if (!obj) - return cache->nobjs >= min ? 0 : -ENOMEM; - cache->objects[cache->nobjs++] = obj; - } - return 0; -} - -static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache) -{ - return cache->nobjs; -} - -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, - struct kmem_cache *cache) -{ - while (mc->nobjs) - kmem_cache_free(cache, mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, - int min) -{ - void *page; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); - if (!page) - return cache->nobjs >= min ? 
0 : -ENOMEM; - cache->objects[cache->nobjs++] = page; - } - return 0; -} - -static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - free_page((unsigned long)mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) +static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) { int r; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - pte_list_desc_cache, 8 + PTE_PREFETCH_NUM); + /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */ + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, + 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM); if (r) - goto out; - r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); + return r; + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache, + PT64_ROOT_MAX_LEVEL); if (r) - goto out; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache, 4); -out: - return r; + return r; + if (maybe_indirect) { + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache, + PT64_ROOT_MAX_LEVEL); + if (r) + return r; + } + return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, + PT64_ROOT_MAX_LEVEL); } static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - pte_list_desc_cache); - mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache); -} - -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) -{ - void *p; - - BUG_ON(!mc->nobjs); - p = mc->objects[--mc->nobjs]; - return p; + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) { - return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); + return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); } static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) @@ -1415,10 +1375,10 @@ static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, static bool rmap_can_add(struct kvm_vcpu *vcpu) { - struct kvm_mmu_memory_cache *cache; + struct kvm_mmu_memory_cache *mc; - cache = &vcpu->arch.mmu_pte_list_desc_cache; - return mmu_memory_cache_free_objects(cache); + mc = &vcpu->arch.mmu_pte_list_desc_cache; + return kvm_mmu_memory_cache_nr_free_objects(mc); } static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) @@ -1426,7 +1386,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); return pte_list_add(vcpu, spte, rmap_head); @@ -1438,7 +1398,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) gfn_t gfn; struct kvm_rmap_head *rmap_head; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); rmap_head = gfn_to_rmap(kvm, gfn, sp); __pte_list_remove(spte, rmap_head); @@ -1530,7 +1490,7 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) { if (is_large_pte(*sptep)) { - 
WARN_ON(page_header(__pa(sptep))->role.level == PG_LEVEL_4K); + WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); --kvm->stat.lpages; return true; @@ -1542,7 +1502,7 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) { if (__drop_large_spte(vcpu->kvm, sptep)) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_mmu_page *sp = sptep_to_sp(sptep); kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); @@ -1738,21 +1698,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -/** - * kvm_arch_write_log_dirty - emulate dirty page logging - * @vcpu: Guest mode vcpu - * - * Emulate arch specific page modification logging for the - * nested hypervisor - */ -int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa) -{ - if (kvm_x86_ops.write_log_dirty) - return kvm_x86_ops.write_log_dirty(vcpu, l2_gpa); - - return 0; -} - bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn) { @@ -1986,7 +1931,7 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, unsigned long data) { u64 *sptep; - struct rmap_iterator uninitialized_var(iter); + struct rmap_iterator iter; int young = 0; for_each_rmap_spte(rmap_head, &iter, sptep) @@ -2016,7 +1961,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *sp; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); @@ -2105,10 +2050,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct { struct kvm_mmu_page *sp; - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); - sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); + sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); if (!direct) - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); + sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); /* @@ -2138,7 +2083,7 @@ static void mark_unsync(u64 *spte) struct kvm_mmu_page *sp; unsigned int index; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); index = spte - sp->spt; if (__test_and_set_bit(index, sp->unsync_child_bitmap)) return; @@ -2207,7 +2152,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, continue; } - child = page_header(ent & PT64_BASE_ADDR_MASK); + child = to_shadow_page(ent & PT64_BASE_ADDR_MASK); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) @@ -2258,15 +2203,14 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); - -#define for_each_valid_sp(_kvm, _sp, _gfn) \ - hlist_for_each_entry(_sp, \ - &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ +#define for_each_valid_sp(_kvm, _sp, _list) \ + hlist_for_each_entry(_sp, _list, hash_link) \ if (is_obsolete_sp((_kvm), (_sp))) { \ } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ - for_each_valid_sp(_kvm, _sp, _gfn) \ + for_each_valid_sp(_kvm, _sp, \ + &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else static inline bool 
is_ept_sp(struct kvm_mmu_page *sp) @@ -2464,9 +2408,7 @@ static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) static void clear_sp_write_flooding_count(u64 *spte) { - struct kvm_mmu_page *sp = page_header(__pa(spte)); - - __clear_sp_write_flooding_count(sp); + __clear_sp_write_flooding_count(sptep_to_sp(spte)); } static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, @@ -2476,7 +2418,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, int direct, unsigned int access) { + bool direct_mmu = vcpu->arch.mmu->direct_map; union kvm_mmu_page_role role; + struct hlist_head *sp_list; unsigned quadrant; struct kvm_mmu_page *sp; bool need_sync = false; @@ -2490,13 +2434,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (role.direct) role.gpte_is_8_bytes = true; role.access = access; - if (!vcpu->arch.mmu->direct_map - && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { + if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_valid_sp(vcpu->kvm, sp, gfn) { + + sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; + for_each_valid_sp(vcpu->kvm, sp, sp_list) { if (sp->gfn != gfn) { collisions++; continue; @@ -2508,6 +2453,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (sp->role.word != role.word) continue; + if (direct_mmu) + goto trace_get_page; + if (sp->unsync) { /* The page is good, but __kvm_sync_page might still end * up zapping it. If so, break in order to rebuild it. @@ -2523,6 +2471,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); __clear_sp_write_flooding_count(sp); + +trace_get_page: trace_kvm_mmu_get_page(sp, false); goto out; } @@ -2533,8 +2483,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, sp->gfn = gfn; sp->role = role; - hlist_add_head(&sp->hash_link, - &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); + hlist_add_head(&sp->hash_link, sp_list); if (!direct) { /* * we should do write protection before syncing pages @@ -2548,7 +2497,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (level > PG_LEVEL_4K && need_sync) flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } - clear_page(sp->spt); trace_kvm_mmu_get_page(sp, true); kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); @@ -2657,7 +2605,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, * so we should update the spte at this point to get * a new sp with the correct access. */ - child = page_header(*sptep & PT64_BASE_ADDR_MASK); + child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK); if (child->role.access == direct_access) return; @@ -2679,7 +2627,7 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_large_pte(pte)) --kvm->stat.lpages; } else { - child = page_header(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); } return true; @@ -2757,10 +2705,23 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, if (!sp->root_count) { /* Count self */ (*nr_zapped)++; - list_move(&sp->link, invalid_list); + + /* + * Already invalid pages (previously active roots) are not on + * the active page list. See list_del() in the "else" case of + * !sp->root_count. 
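The reworked kvm_mmu_get_page() above resolves the hash bucket once into sp_list and then walks it, skipping obsolete entries. A toy, self-contained version of that bucket scan (the structures and hash function are made up, not KVM's):

#include <stdint.h>
#include <stdio.h>

#define HASH_BITS  4
#define HASH_SIZE  (1u << HASH_BITS)

struct shadow_page {
        uint64_t gfn;
        int obsolete;
        struct shadow_page *next;       /* hash-bucket chain */
};

static unsigned int bucket_of(uint64_t gfn)
{
        return (unsigned int)(gfn & (HASH_SIZE - 1));
}

/* Scan one precomputed bucket, skipping obsolete entries, in the spirit of
 * for_each_valid_sp() once the caller has resolved sp_list. */
static struct shadow_page *lookup(struct shadow_page **table, uint64_t gfn)
{
        struct shadow_page *sp;

        for (sp = table[bucket_of(gfn)]; sp; sp = sp->next) {
                if (sp->obsolete || sp->gfn != gfn)
                        continue;
                return sp;
        }
        return NULL;
}

int main(void)
{
        struct shadow_page *table[HASH_SIZE] = { 0 };
        struct shadow_page a = { .gfn = 0x42, .obsolete = 0, .next = NULL };

        table[bucket_of(a.gfn)] = &a;
        printf("found: %p\n", (void *)lookup(table, 0x42));
        return 0;
}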
+ */ + if (sp->role.invalid) + list_add(&sp->link, invalid_list); + else + list_move(&sp->link, invalid_list); kvm_mod_used_mmu_pages(kvm, -1); } else { - list_move(&sp->link, &kvm->arch.active_mmu_pages); + /* + * Remove the active root from the active page list, the root + * will be explicitly freed when the root_count hits zero. + */ + list_del(&sp->link); /* * Obsolete pages cannot be used on any vCPUs, see the comment @@ -2812,33 +2773,60 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, } } -static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, - struct list_head *invalid_list) +static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm, + unsigned long nr_to_zap) { - struct kvm_mmu_page *sp; + unsigned long total_zapped = 0; + struct kvm_mmu_page *sp, *tmp; + LIST_HEAD(invalid_list); + bool unstable; + int nr_zapped; if (list_empty(&kvm->arch.active_mmu_pages)) - return false; + return 0; + +restart: + list_for_each_entry_safe(sp, tmp, &kvm->arch.active_mmu_pages, link) { + /* + * Don't zap active root pages, the page itself can't be freed + * and zapping it will just force vCPUs to realloc and reload. + */ + if (sp->root_count) + continue; + + unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, + &nr_zapped); + total_zapped += nr_zapped; + if (total_zapped >= nr_to_zap) + break; + + if (unstable) + goto restart; + } - sp = list_last_entry(&kvm->arch.active_mmu_pages, - struct kvm_mmu_page, link); - return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); + kvm_mmu_commit_zap_page(kvm, &invalid_list); + + kvm->stat.mmu_recycled += total_zapped; + return total_zapped; +} + +static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) +{ + if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) + return kvm->arch.n_max_mmu_pages - + kvm->arch.n_used_mmu_pages; + + return 0; } static int make_mmu_pages_available(struct kvm_vcpu *vcpu) { - LIST_HEAD(invalid_list); + unsigned long avail = kvm_mmu_available_pages(vcpu->kvm); - if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) + if (likely(avail >= KVM_MIN_FREE_MMU_PAGES)) return 0; - while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { - if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) - break; - - ++vcpu->kvm->stat.mmu_recycled; - } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail); if (!kvm_mmu_available_pages(vcpu->kvm)) return -ENOSPC; @@ -2851,17 +2839,12 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) */ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) { - LIST_HEAD(invalid_list); - spin_lock(&kvm->mmu_lock); if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { - /* Need to free some mmu pages to achieve the goal. 
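kvm_mmu_zap_oldest_mmu_pages() above walks the active list oldest-first and skips pages still in use as roots. A simplified standalone model of that reclaim policy (arrays instead of linked lists, fields invented for illustration):

#include <stdio.h>

struct page_meta {
        int root_count;                 /* in use as a root -> skip */
        int zapped;
};

/* Reclaim up to nr_to_zap entries, oldest first, skipping active roots. */
static unsigned long zap_oldest(struct page_meta *pages, unsigned long n,
                                unsigned long nr_to_zap)
{
        unsigned long i, zapped = 0;

        for (i = 0; i < n && zapped < nr_to_zap; i++) {
                if (pages[i].root_count)
                        continue;
                pages[i].zapped = 1;
                zapped++;
        }
        return zapped;
}

int main(void)
{
        struct page_meta pages[4] = { {1, 0}, {0, 0}, {0, 0}, {0, 0} };

        printf("zapped %lu pages\n", zap_oldest(pages, 4, 2));
        return 0;
}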
*/ - while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) - if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list)) - break; + kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages - + goal_nr_mmu_pages); - kvm_mmu_commit_zap_page(kvm, &invalid_list); goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; } @@ -2999,7 +2982,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); if (sp_ad_disabled(sp)) spte |= SPTE_AD_DISABLED_MASK; else if (kvm_vcpu_ad_need_write_protect(vcpu)) @@ -3102,7 +3085,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, struct kvm_mmu_page *child; u64 pte = *sptep; - child = page_header(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { @@ -3212,7 +3195,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) { struct kvm_mmu_page *sp; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); /* * Without accessed bits, there's no way to distinguish between @@ -3274,7 +3257,7 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return PG_LEVEL_4K; - max_level = min(max_level, max_page_level); + max_level = min(max_level, max_huge_page_level); for ( ; max_level > PG_LEVEL_4K; max_level--) { linfo = lpage_info_slot(gfn, slot, max_level); if (!linfo->disallow_lpage) @@ -3520,7 +3503,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (!is_shadow_present_pte(spte)) break; - sp = page_header(__pa(iterator.sptep)); + sp = sptep_to_sp(iterator.sptep); if (!is_last_spte(spte, sp->role.level)) break; @@ -3607,7 +3590,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (!VALID_PAGE(*root_hpa)) return; - sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK); + sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); --sp->root_count; if (!sp->root_count && sp->role.invalid) kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); @@ -3668,7 +3651,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) { int ret = 0; - if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { + if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); ret = 1; } @@ -3837,7 +3820,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu->root_hpa; - sp = page_header(root); + sp = to_shadow_page(root); /* * Even if another CPU was marking the SP as unsync-ed @@ -3871,7 +3854,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); + sp = to_shadow_page(root); mmu_sync_children(vcpu, sp); } } @@ -4045,8 +4028,8 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) walk_shadow_page_lockless_end(vcpu); } -static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - gfn_t gfn) +static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + gfn_t gfn) { struct kvm_arch_async_pf arch; @@ -4108,16 +4091,16 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - r = mmu_topup_memory_caches(vcpu); + if (fast_page_fault(vcpu, gpa, error_code)) + return RET_PF_RETRY; + + r = mmu_topup_memory_caches(vcpu, false); if (r) 
return r; if (lpage_disallowed) max_level = PG_LEVEL_4K; - if (fast_page_fault(vcpu, gpa, error_code)) - return RET_PF_RETRY; - mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -4131,7 +4114,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; - if (make_mmu_pages_available(vcpu) < 0) + r = make_mmu_pages_available(vcpu); + if (r) goto out_unlock; r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn, prefault, is_tdp && lpage_disallowed); @@ -4156,6 +4140,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len) { int r = 1; + u32 flags = vcpu->arch.apf.host_apf_flags; #ifndef CONFIG_X86_64 /* A 64-bit CR2 should be impossible on 32-bit KVM. */ @@ -4164,28 +4149,22 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, #endif vcpu->arch.l1tf_flush_l1d = true; - switch (vcpu->arch.apf.host_apf_flags) { - default: + if (!flags) { trace_kvm_page_fault(fault_address, error_code); if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, fault_address); r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, insn_len); - break; - case KVM_PV_REASON_PAGE_NOT_PRESENT: + } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { vcpu->arch.apf.host_apf_flags = 0; local_irq_disable(); kvm_async_pf_task_wait_schedule(fault_address); local_irq_enable(); - break; - case KVM_PV_REASON_PAGE_READY: - vcpu->arch.apf.host_apf_flags = 0; - local_irq_disable(); - kvm_async_pf_task_wake(fault_address); - local_irq_enable(); - break; + } else { + WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags); } + return r; } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); @@ -4227,8 +4206,8 @@ static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, union kvm_mmu_page_role role) { return (role.direct || pgd == root->pgd) && - VALID_PAGE(root->hpa) && page_header(root->hpa) && - role.word == page_header(root->hpa)->role.word; + VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) && + role.word == to_shadow_page(root->hpa)->role.word; } /* @@ -4277,8 +4256,7 @@ static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && mmu->root_level >= PT64_ROOT_4LEVEL) - return !mmu_check_root(vcpu, new_pgd >> PAGE_SHIFT) && - cached_root_available(vcpu, new_pgd, new_role); + return cached_root_available(vcpu, new_pgd, new_role); return false; } @@ -4313,7 +4291,7 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - __clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa)); + __clear_sp_write_flooding_count(to_shadow_page(vcpu->arch.mmu->root_hpa)); } void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, @@ -4869,13 +4847,22 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, return role; } +static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) +{ + /* Use 5-level TDP if and only if it's useful/necessary. 
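The rewritten kvm_handle_page_fault() above replaces the switch on host_apf_flags with a plain flags test: no flags means an ordinary #PF, the "page not present" flag parks the task, anything else is unexpected. A hedged sketch of that dispatch, using a stand-in constant rather than the real KVM_PV_REASON_* values:

#include <stdio.h>

#define PV_PAGE_NOT_PRESENT  (1u << 0)  /* stand-in for the real reason flag */

static const char *classify(unsigned int flags)
{
        if (!flags)
                return "regular guest page fault";
        if (flags & PV_PAGE_NOT_PRESENT)
                return "async page fault: wait for the page";
        return "unexpected host async PF flags";
}

int main(void)
{
        printf("%s\n", classify(0));
        printf("%s\n", classify(PV_PAGE_NOT_PRESENT));
        printf("%s\n", classify(1u << 5));
        return 0;
}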
*/ + if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) + return 4; + + return max_tdp_level; +} + static union kvm_mmu_role kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) { union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); role.base.ad_disabled = (shadow_accessed_mask == 0); - role.base.level = vcpu->arch.tdp_level; + role.base.level = kvm_mmu_get_tdp_level(vcpu); role.base.direct = true; role.base.gpte_is_8_bytes = true; @@ -4884,7 +4871,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = vcpu->arch.mmu; + struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_role new_role = kvm_calc_tdp_mmu_root_page_role(vcpu, false); @@ -4896,7 +4883,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->sync_page = nonpaging_sync_page; context->invlpg = NULL; context->update_pte = nonpaging_update_pte; - context->shadow_root_level = vcpu->arch.tdp_level; + context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu); context->direct_map = true; context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; @@ -4931,7 +4918,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } static union kvm_mmu_role -kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) +kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, bool base_only) { union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); @@ -4939,9 +4926,19 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) !is_write_protection(vcpu); role.base.smap_andnot_wp = role.ext.cr4_smap && !is_write_protection(vcpu); - role.base.direct = !is_paging(vcpu); role.base.gpte_is_8_bytes = !!is_pae(vcpu); + return role; +} + +static union kvm_mmu_role +kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) +{ + union kvm_mmu_role role = + kvm_calc_shadow_root_page_role_common(vcpu, base_only); + + role.base.direct = !is_paging(vcpu); + if (!is_long_mode(vcpu)) role.base.level = PT32E_ROOT_LEVEL; else if (is_la57_mode(vcpu)) @@ -4952,15 +4949,10 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) return role; } -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) +static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context, + u32 cr0, u32 cr4, u32 efer, + union kvm_mmu_role new_role) { - struct kvm_mmu *context = vcpu->arch.mmu; - union kvm_mmu_role new_role = - kvm_calc_shadow_mmu_root_page_role(vcpu, false); - - if (new_role.as_u64 == context->mmu_role.as_u64) - return; - if (!(cr0 & X86_CR0_PG)) nonpaging_init_context(vcpu, context); else if (efer & EFER_LMA) @@ -4973,7 +4965,43 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) context->mmu_role.as_u64 = new_role.as_u64; reset_shadow_zero_bits_mask(vcpu, context); } -EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); + +static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) +{ + struct kvm_mmu *context = &vcpu->arch.root_mmu; + union kvm_mmu_role new_role = + kvm_calc_shadow_mmu_root_page_role(vcpu, false); + + if (new_role.as_u64 != context->mmu_role.as_u64) + shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); +} + +static union kvm_mmu_role +kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu) +{ + union kvm_mmu_role role = + kvm_calc_shadow_root_page_role_common(vcpu, false); + + role.base.direct = false; + 
role.base.level = kvm_mmu_get_tdp_level(vcpu); + + return role; +} + +void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, + gpa_t nested_cr3) +{ + struct kvm_mmu *context = &vcpu->arch.guest_mmu; + union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu); + + context->shadow_root_level = new_role.base.level; + + __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false); + + if (new_role.as_u64 != context->mmu_role.as_u64) + shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); +} +EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); static union kvm_mmu_role kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, @@ -5007,7 +5035,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty, gpa_t new_eptp) { - struct kvm_mmu *context = vcpu->arch.mmu; + struct kvm_mmu *context = &vcpu->arch.guest_mmu; u8 level = vmx_eptp_page_walk_level(new_eptp); union kvm_mmu_role new_role = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, @@ -5041,7 +5069,7 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = vcpu->arch.mmu; + struct kvm_mmu *context = &vcpu->arch.root_mmu; kvm_init_shadow_mmu(vcpu, kvm_read_cr0_bits(vcpu, X86_CR0_PG), @@ -5151,7 +5179,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) { int r; - r = mmu_topup_memory_caches(vcpu); + r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map); if (r) goto out; r = mmu_alloc_roots(vcpu); @@ -5345,7 +5373,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, * or not since pte prefetch is skiped if it does not have * enough objects in the cache. */ - mmu_topup_memory_caches(vcpu); + mmu_topup_memory_caches(vcpu, true); spin_lock(&vcpu->kvm->mmu_lock); @@ -5553,23 +5581,25 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); -void kvm_configure_mmu(bool enable_tdp, int tdp_page_level) +void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, + int tdp_huge_page_level) { tdp_enabled = enable_tdp; + max_tdp_level = tdp_max_root_level; /* - * max_page_level reflects the capabilities of KVM's MMU irrespective + * max_huge_page_level reflects KVM's MMU capabilities irrespective * of kernel support, e.g. KVM may be capable of using 1GB pages when * the kernel is not. But, KVM never creates a page size greater than * what is used by the kernel for any given HVA, i.e. the kernel's * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust(). */ if (tdp_enabled) - max_page_level = tdp_page_level; + max_huge_page_level = tdp_huge_page_level; else if (boot_cpu_has(X86_FEATURE_GBPAGES)) - max_page_level = PG_LEVEL_1G; + max_huge_page_level = PG_LEVEL_1G; else - max_page_level = PG_LEVEL_2M; + max_huge_page_level = PG_LEVEL_2M; } EXPORT_SYMBOL_GPL(kvm_configure_mmu); @@ -5665,7 +5695,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can * skip allocating the PDP table. 
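kvm_mmu_get_tdp_level() above only uses 5-level TDP when the guest's MAXPHYADDR actually needs the extra walk depth. The same heuristic as a standalone function, with example inputs:

#include <stdio.h>

static int tdp_level(int max_tdp_level, int guest_maxphyaddr)
{
        /* 5-level paging is only worth it above 48 bits of guest PA. */
        if (max_tdp_level == 5 && guest_maxphyaddr <= 48)
                return 4;
        return max_tdp_level;
}

int main(void)
{
        printf("la57 host, small guest -> %d levels\n", tdp_level(5, 46));
        printf("la57 host, large guest -> %d levels\n", tdp_level(5, 52));
        printf("4-level host           -> %d levels\n", tdp_level(4, 46));
        return 0;
}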
*/ - if (tdp_enabled && vcpu->arch.tdp_level > PT32E_ROOT_LEVEL) + if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) return 0; page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); @@ -5684,6 +5714,14 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) uint i; int ret; + vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; + vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO; + + vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; + vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; + + vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; + vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; @@ -5732,12 +5770,11 @@ restart: break; /* - * Skip invalid pages with a non-zero root count, zapping pages - * with a non-zero root count will never succeed, i.e. the page - * will get thrown back on active_mmu_pages and we'll get stuck - * in an infinite loop. + * Invalid pages should never land back on the list of active + * pages. Skip the bogus page, otherwise we'll get stuck in an + * infinite loop if the page gets put back on the list (again). */ - if (sp->role.invalid && sp->root_count) + if (WARN_ON(sp->role.invalid)) continue; /* @@ -5904,7 +5941,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, restart: for_each_rmap_spte(rmap_head, &iter, sptep) { - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); pfn = spte_to_pfn(*sptep); /* @@ -6015,7 +6052,7 @@ void kvm_mmu_zap_all(struct kvm *kvm) spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { - if (sp->role.invalid && sp->root_count) + if (WARN_ON(sp->role.invalid)) continue; if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) goto restart; @@ -6092,9 +6129,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) goto unlock; } - if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) - freed++; - kvm_mmu_commit_zap_page(kvm, &invalid_list); + freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); unlock: spin_unlock(&kvm->mmu_lock); diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c index 9d2844f87f6d..c8d51a37e2ce 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu/mmu_audit.c @@ -45,7 +45,7 @@ static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, !is_last_spte(ent[i], level)) { struct kvm_mmu_page *child; - child = page_header(ent[i] & PT64_BASE_ADDR_MASK); + child = to_shadow_page(ent[i] & PT64_BASE_ADDR_MASK); __mmu_spte_walk(vcpu, child, fn, level - 1); } } @@ -62,7 +62,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu->root_hpa; - sp = page_header(root); + sp = to_shadow_page(root); __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu->root_level); return; } @@ -72,7 +72,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); + sp = to_shadow_page(root); __mmu_spte_walk(vcpu, sp, fn, 2); } } @@ -97,7 +97,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) kvm_pfn_t pfn; hpa_t hpa; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); if (sp->unsync) { if (level != PG_LEVEL_4K) { @@ -132,7 +132,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) struct kvm_memory_slot *slot; gfn_t gfn; - rev_sp = page_header(__pa(sptep)); + rev_sp = sptep_to_sp(sptep); gfn = 
kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); slots = kvm_memslots_for_spte_role(kvm, rev_sp->role); @@ -165,7 +165,7 @@ static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level) static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_mmu_page *sp = sptep_to_sp(sptep); if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync) audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync " diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h new file mode 100644 index 000000000000..3acf3b8eb469 --- /dev/null +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_MMU_INTERNAL_H +#define __KVM_X86_MMU_INTERNAL_H + +#include <linux/types.h> + +#include <asm/kvm_host.h> + +struct kvm_mmu_page { + struct list_head link; + struct hlist_node hash_link; + struct list_head lpage_disallowed_link; + + bool unsync; + u8 mmu_valid_gen; + bool mmio_cached; + bool lpage_disallowed; /* Can't be replaced by an equiv large page */ + + /* + * The following two entries are used to key the shadow page in the + * hash table. + */ + union kvm_mmu_page_role role; + gfn_t gfn; + + u64 *spt; + /* hold the gfn of each spte inside spt */ + gfn_t *gfns; + int root_count; /* Currently serving as active root */ + unsigned int unsync_children; + struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ + DECLARE_BITMAP(unsync_child_bitmap, 512); + +#ifdef CONFIG_X86_32 + /* + * Used out of the mmu-lock to avoid reading spte values while an + * update is in progress; see the comments in __get_spte_lockless(). + */ + int clear_spte_count; +#endif + + /* Number of writes since the last time traversal visited this page. */ + atomic_t write_flooding_count; +}; + +static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) +{ + struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); + + return (struct kvm_mmu_page *)page_private(page); +} + +static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) +{ + return to_shadow_page(__pa(sptep)); +} + +void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, + struct kvm_memory_slot *slot, u64 gfn); + +#endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index ffcd96fc02d0..9d15bc0c535b 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -387,7 +387,7 @@ TRACE_EVENT( #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . 
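mmu_internal.h above recovers a struct kvm_mmu_page from any SPTE pointer by following the backing page's private field. A toy userspace model of that back-pointer trick (the types below are stand-ins for struct page and set_page_private(), not KVM's real layout):

#include <stdint.h>
#include <stdio.h>

struct toy_page {
        void *private_data;             /* -> struct toy_shadow_page */
        uint64_t sptes[8];              /* the shadow page table entries */
};

struct toy_shadow_page {
        struct toy_page *backing;
        int level;
};

static struct toy_shadow_page *page_to_sp(struct toy_page *pg)
{
        return pg->private_data;
}

int main(void)
{
        struct toy_page pg = { .private_data = NULL };
        struct toy_shadow_page sp = { .backing = &pg, .level = 1 };

        /* Analogous to set_page_private() in kvm_mmu_alloc_page(). */
        pg.private_data = &sp;
        printf("level recovered from the backing page: %d\n",
               page_to_sp(&pg)->level);
        return 0;
}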
+#define TRACE_INCLUDE_PATH mmu #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE mmutrace diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index a7bcde34d1f2..a84a141a2ad2 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -16,7 +16,7 @@ #include <asm/kvm_page_track.h> -#include "mmu.h" +#include "mmu_internal.h" void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) { diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index bd70ece1ef8b..4dd6b1e5b8cf 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -260,7 +260,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); #if PTTYPE == PTTYPE_EPT - if (kvm_arch_write_log_dirty(vcpu, addr)) + if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr)) return -EINVAL; #endif pte |= PT_GUEST_DIRTY_MASK; @@ -314,7 +314,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, { int ret; pt_element_t pte; - pt_element_t __user *uninitialized_var(ptep_user); + pt_element_t __user *ptep_user; gfn_t table_gfn; u64 pt_access, pte_access; unsigned index, accessed_dirty, pte_pkey; @@ -596,7 +596,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, u64 *spte; int i; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); if (sp->role.level > PG_LEVEL_4K) return; @@ -789,10 +789,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - /* * If PFEC.RSVD is set, this is a shadow page fault. * The bit needs to be cleared before walking guest page tables. @@ -820,6 +816,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, return RET_PF_EMULATE; } + r = mmu_topup_memory_caches(vcpu, true); + if (r) + return r; + vcpu->arch.write_fault_to_shadow_pgtable = false; is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, @@ -866,7 +866,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); - if (make_mmu_pages_available(vcpu) < 0) + r = make_mmu_pages_available(vcpu); + if (r) goto out_unlock; r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn, map_writable, prefault, lpage_disallowed); @@ -903,7 +904,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) * No need to check return value here, rmap_can_add() can * help us to skip pte prefetch later. 
*/ - mmu_topup_memory_caches(vcpu); + mmu_topup_memory_caches(vcpu, true); if (!VALID_PAGE(root_hpa)) { WARN_ON(1); @@ -915,7 +916,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) level = iterator.level; sptep = iterator.sptep; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); if (is_last_spte(*sptep, level)) { pt_element_t gpte; gpa_t pte_gpa; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b86346903f2e..67741d2a0308 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -372,6 +372,11 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) if (!pmc) return 1; + if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) && + (kvm_x86_ops.get_cpl(vcpu) != 0) && + (kvm_read_cr0(vcpu) & X86_CR0_PE)) + return 1; + *data = pmc_read_counter(pmc) & mask; return 0; } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index ab85eed8a6cc..067fef51760c 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -15,6 +15,8 @@ #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 #define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002 +#define MAX_FIXED_COUNTERS 3 + struct kvm_event_hw_type_mapping { u8 eventsel; u8 unit_mask; diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index e80daa98682f..ac830cd50830 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -665,7 +665,7 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) } else { vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; } - mark_dirty(vmcb, VMCB_AVIC); + vmcb_mark_dirty(vmcb, VMCB_AVIC); svm_set_pi_irte_mode(vcpu, activated); } diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 6bceafb19108..fb68467e6049 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -48,13 +48,6 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, svm->vmcb->control.exit_info_1 &= ~0xffffffffULL; svm->vmcb->control.exit_info_1 |= fault->error_code; - /* - * The present bit is always zero for page structure faults on real - * hardware. 
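The new check in kvm_pmu_rdpmc() above mirrors the architectural rule that user-mode RDPMC faults in protected mode unless CR4.PCE is set. A minimal sketch of that condition (register values are plain integers here; only the CR0.PE and CR4.PCE bit positions are real):

#include <stdbool.h>
#include <stdio.h>

#define CR4_PCE  (1u << 8)
#define CR0_PE   (1u << 0)

static bool rdpmc_faults(unsigned long cr0, unsigned long cr4, int cpl)
{
        return !(cr4 & CR4_PCE) && cpl != 0 && (cr0 & CR0_PE);
}

int main(void)
{
        printf("ring3, PCE clear -> fault=%d\n", rdpmc_faults(CR0_PE, 0, 3));
        printf("ring0, PCE clear -> fault=%d\n", rdpmc_faults(CR0_PE, 0, 0));
        printf("ring3, PCE set   -> fault=%d\n", rdpmc_faults(CR0_PE, CR4_PCE, 3));
        return 0;
}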
- */ - if (svm->vmcb->control.exit_info_1 & (2ULL << 32)) - svm->vmcb->control.exit_info_1 &= ~1; - nested_svm_vmexit(svm); } @@ -87,11 +80,11 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) WARN_ON(mmu_is_nested(vcpu)); vcpu->arch.mmu = &vcpu->arch.guest_mmu; - kvm_init_shadow_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer); + kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer, + svm->nested.ctl.nested_cr3); vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3; vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; - vcpu->arch.mmu->shadow_root_level = vcpu->arch.tdp_level; reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } @@ -106,7 +99,7 @@ void recalc_intercepts(struct vcpu_svm *svm) { struct vmcb_control_area *c, *h, *g; - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); if (!is_guest_mode(&svm->vcpu)) return; @@ -222,8 +215,9 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control) return true; } -static bool nested_vmcb_checks(struct vmcb *vmcb) +static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb) { + bool nested_vmcb_lma; if ((vmcb->save.efer & EFER_SVME) == 0) return false; @@ -231,6 +225,30 @@ static bool nested_vmcb_checks(struct vmcb *vmcb) (vmcb->save.cr0 & X86_CR0_NW)) return false; + if (!kvm_dr6_valid(vmcb->save.dr6) || !kvm_dr7_valid(vmcb->save.dr7)) + return false; + + nested_vmcb_lma = + (vmcb->save.efer & EFER_LME) && + (vmcb->save.cr0 & X86_CR0_PG); + + if (!nested_vmcb_lma) { + if (vmcb->save.cr4 & X86_CR4_PAE) { + if (vmcb->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK) + return false; + } else { + if (vmcb->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK) + return false; + } + } else { + if (!(vmcb->save.cr4 & X86_CR4_PAE) || + !(vmcb->save.cr0 & X86_CR0_PE) || + (vmcb->save.cr3 & MSR_CR3_LONG_RESERVED_MASK)) + return false; + } + if (kvm_valid_cr4(&svm->vcpu, vmcb->save.cr4)) + return false; + return nested_vmcb_check_controls(&vmcb->control); } @@ -258,7 +276,7 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm) /* Only a few fields of int_ctl are written by the processor. */ mask = V_IRQ_MASK | V_TPR_MASK; if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) && - is_intercept(svm, INTERCEPT_VINTR)) { + svm_is_intercept(svm, INTERCEPT_VINTR)) { /* * In order to request an interrupt window, L0 is usurping * svm->vmcb->control.int_ctl and possibly setting V_IRQ @@ -310,6 +328,42 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, nested_vmcb->control.exit_int_info = exit_int_info; } +static inline bool nested_npt_enabled(struct vcpu_svm *svm) +{ + return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; +} + +/* + * Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true + * if we are emulating VM-Entry into a guest with NPT enabled. + */ +static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, + bool nested_npt) +{ + if (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)) + return -EINVAL; + + if (!nested_npt && is_pae_paging(vcpu) && + (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) { + if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + return -EINVAL; + } + + /* + * TODO: optimize unconditional TLB flush/MMU sync here and in + * kvm_init_shadow_npt_mmu(). 
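nested_svm_load_cr3() above rejects a nested CR3 with bits set above the guest's MAXPHYADDR via rsvd_bits(). A self-contained illustration of that mask check, with 52 chosen as an example MAXPHYADDR:

#include <stdint.h>
#include <stdio.h>

/* Mask of bits s..e inclusive, matching how rsvd_bits() is used here. */
static uint64_t rsvd_bits(int s, int e)
{
        return ((1ULL << (e - s + 1)) - 1) << s;
}

int main(void)
{
        int maxphyaddr = 52;            /* example guest MAXPHYADDR */
        uint64_t good_cr3 = 0x1000;
        uint64_t bad_cr3  = 1ULL << 60; /* sets a bit above MAXPHYADDR */

        printf("good cr3 rejected: %d\n",
               !!(good_cr3 & rsvd_bits(maxphyaddr, 63)));
        printf("bad  cr3 rejected: %d\n",
               !!(bad_cr3 & rsvd_bits(maxphyaddr, 63)));
        return 0;
}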
+ */ + if (!nested_npt) + kvm_mmu_new_pgd(vcpu, cr3, false, false); + + vcpu->arch.cr3 = cr3; + kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + + kvm_init_mmu(vcpu, false); + + return 0; +} + static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb) { /* Load the nested guest state */ @@ -323,8 +377,6 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_v svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); - (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); - svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax); kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp); @@ -342,13 +394,9 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_v static void nested_prepare_vmcb_control(struct vcpu_svm *svm) { const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK; - if (svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) - nested_svm_init_mmu_context(&svm->vcpu); - - /* Guest paging mode is active - reset mmu */ - kvm_mmu_reset_context(&svm->vcpu); - svm_flush_tlb(&svm->vcpu); + if (nested_npt_enabled(svm)) + nested_svm_init_mmu_context(&svm->vcpu); svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; @@ -375,18 +423,27 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm) */ recalc_intercepts(svm); - mark_all_dirty(svm->vmcb); + vmcb_mark_all_dirty(svm->vmcb); } -void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, +int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, struct vmcb *nested_vmcb) { + int ret; + svm->nested.vmcb = vmcb_gpa; load_nested_vmcb_control(svm, &nested_vmcb->control); nested_prepare_vmcb_save(svm, nested_vmcb); nested_prepare_vmcb_control(svm); + ret = nested_svm_load_cr3(&svm->vcpu, nested_vmcb->save.cr3, + nested_npt_enabled(svm)); + if (ret) + return ret; + svm_set_gif(svm, true); + + return 0; } int nested_svm_vmrun(struct vcpu_svm *svm) @@ -416,7 +473,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm) nested_vmcb = map.hva; - if (!nested_vmcb_checks(nested_vmcb)) { + if (!nested_vmcb_checks(svm, nested_vmcb)) { nested_vmcb->control.exit_code = SVM_EXIT_ERR; nested_vmcb->control.exit_code_hi = 0; nested_vmcb->control.exit_info_1 = 0; @@ -464,16 +521,22 @@ int nested_svm_vmrun(struct vcpu_svm *svm) copy_vmcb_control_area(&hsave->control, &vmcb->control); svm->nested.nested_run_pending = 1; - enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb); - if (!nested_svm_vmrun_msrpm(svm)) { - svm->vmcb->control.exit_code = SVM_EXIT_ERR; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; + if (enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb)) + goto out_exit_err; - nested_svm_vmexit(svm); - } + if (nested_svm_vmrun_msrpm(svm)) + goto out; + +out_exit_err: + svm->nested.nested_run_pending = 0; + + svm->vmcb->control.exit_code = SVM_EXIT_ERR; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); out: kvm_vcpu_unmap(&svm->vcpu, &map, true); @@ -585,12 +648,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm_set_efer(&svm->vcpu, hsave->save.efer); svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); svm_set_cr4(&svm->vcpu, hsave->save.cr4); - if (npt_enabled) { - svm->vmcb->save.cr3 = 
hsave->save.cr3; - svm->vcpu.arch.cr3 = hsave->save.cr3; - } else { - (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); - } kvm_rax_write(&svm->vcpu, hsave->save.rax); kvm_rsp_write(&svm->vcpu, hsave->save.rsp); kvm_rip_write(&svm->vcpu, hsave->save.rip); @@ -598,7 +655,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm->vmcb->save.cpl = 0; svm->vmcb->control.exit_int_info = 0; - mark_all_dirty(svm->vmcb); + vmcb_mark_all_dirty(svm->vmcb); trace_kvm_nested_vmexit_inject(nested_vmcb->control.exit_code, nested_vmcb->control.exit_info_1, @@ -610,8 +667,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm) kvm_vcpu_unmap(&svm->vcpu, &map, true); nested_svm_uninit_mmu_context(&svm->vcpu); - kvm_mmu_reset_context(&svm->vcpu); - kvm_mmu_load(&svm->vcpu); + + rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false); + if (rc) + return 1; + + if (npt_enabled) + svm->vmcb->save.cr3 = hsave->save.cr3; /* * Drop what we picked up for L2 via svm_complete_interrupts() so it diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 5573a97f1520..402dc4234e39 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -313,13 +313,15 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, int write) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - unsigned long npages, npinned, size; + unsigned long npages, size; + int npinned; unsigned long locked, lock_limit; struct page **pages; unsigned long first, last; + int ret; if (ulen == 0 || uaddr + ulen < uaddr) - return NULL; + return ERR_PTR(-EINVAL); /* Calculate number of pages. */ first = (uaddr & PAGE_MASK) >> PAGE_SHIFT; @@ -330,9 +332,12 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit); - return NULL; + return ERR_PTR(-ENOMEM); } + if (WARN_ON_ONCE(npages > INT_MAX)) + return ERR_PTR(-EINVAL); + /* Avoid using vmalloc for smaller buffers. */ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) @@ -341,12 +346,13 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, pages = kmalloc(size, GFP_KERNEL_ACCOUNT); if (!pages) - return NULL; + return ERR_PTR(-ENOMEM); /* Pin the user virtual address. */ - npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); + npinned = pin_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); if (npinned != npages) { pr_err("SEV: Failure locking %lu pages.\n", npages); + ret = -ENOMEM; goto err; } @@ -357,10 +363,10 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, err: if (npinned > 0) - release_pages(pages, npinned); + unpin_user_pages(pages, npinned); kvfree(pages); - return NULL; + return ERR_PTR(ret); } static void sev_unpin_memory(struct kvm *kvm, struct page **pages, @@ -368,7 +374,7 @@ static void sev_unpin_memory(struct kvm *kvm, struct page **pages, { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - release_pages(pages, npages); + unpin_user_pages(pages, npages); kvfree(pages); sev->pages_locked -= npages; } @@ -434,8 +440,8 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Lock the user memory. 
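sev_pin_memory() above now reports failures with ERR_PTR() so callers can forward the real error code instead of guessing from a bare NULL. A small userspace imitation of the ERR_PTR()/IS_ERR()/PTR_ERR() convention (simplified, not the kernel's implementation):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

static inline void *ERR_PTR(long err)      { return (void *)err; }
static inline long  PTR_ERR(const void *p) { return (long)p; }
static inline int   IS_ERR(const void *p)  { return (uintptr_t)p >= (uintptr_t)-4095; }

/* Hypothetical pinning helper: fails with a specific errno, never NULL. */
static void *pin_pages(size_t len)
{
        if (len == 0)
                return ERR_PTR(-EINVAL); /* caller sees -EINVAL, not NULL */
        return ERR_PTR(-ENOMEM);         /* pretend the allocation failed */
}

int main(void)
{
        void *p = pin_pages(0);

        if (IS_ERR(p))
                printf("pin failed: %ld\n", PTR_ERR(p));
        return 0;
}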
*/ inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1); - if (!inpages) { - ret = -ENOMEM; + if (IS_ERR(inpages)) { + ret = PTR_ERR(inpages); goto e_free; } @@ -789,13 +795,13 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) /* lock userspace source and destination page */ src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0); - if (!src_p) - return -EFAULT; + if (IS_ERR(src_p)) + return PTR_ERR(src_p); dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1); - if (!dst_p) { + if (IS_ERR(dst_p)) { sev_unpin_memory(kvm, src_p, n); - return -EFAULT; + return PTR_ERR(dst_p); } /* @@ -860,8 +866,8 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) return -EFAULT; pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1); - if (!pages) - return -ENOMEM; + if (IS_ERR(pages)) + return PTR_ERR(pages); /* * The secret must be copied into contiguous memory region, lets verify @@ -987,8 +993,8 @@ int svm_register_enc_region(struct kvm *kvm, return -ENOMEM; region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 1); - if (!region->pages) { - ret = -ENOMEM; + if (IS_ERR(region->pages)) { + ret = PTR_ERR(region->pages); goto e_free; } @@ -1180,11 +1186,10 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) * 2) or this VMCB was executed on different host CPU in previous VMRUNs. */ if (sd->sev_vmcbs[asid] == svm->vmcb && - svm->last_cpu == cpu) + svm->vcpu.arch.last_vmentry_cpu == cpu) return; - svm->last_cpu = cpu; sd->sev_vmcbs[asid] = svm->vmcb; svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; - mark_dirty(svm->vmcb, VMCB_ASID); + vmcb_mark_dirty(svm->vmcb, VMCB_ASID); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 5bbf76189afa..03dd7bac8034 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -254,7 +254,7 @@ static inline void invlpga(unsigned long addr, u32 asid) asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr)); } -static int get_npt_level(struct kvm_vcpu *vcpu) +static int get_max_npt_level(void) { #ifdef CONFIG_X86_64 return PT64_ROOT_4LEVEL; @@ -282,7 +282,7 @@ void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) } svm->vmcb->save.efer = efer | EFER_SVME; - mark_dirty(svm->vmcb, VMCB_CR); + vmcb_mark_dirty(svm->vmcb, VMCB_CR); } static int is_external_interrupt(u32 info) @@ -713,7 +713,7 @@ static void grow_ple_window(struct kvm_vcpu *vcpu) pause_filter_count_max); if (control->pause_filter_count != old) { - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); trace_kvm_ple_window_update(vcpu->vcpu_id, control->pause_filter_count, old); } @@ -731,7 +731,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) pause_filter_count_shrink, pause_filter_count); if (control->pause_filter_count != old) { - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); trace_kvm_ple_window_update(vcpu->vcpu_id, control->pause_filter_count, old); } @@ -885,7 +885,7 @@ static __init int svm_hardware_setup(void) if (npt_enabled && !npt) npt_enabled = false; - kvm_configure_mmu(npt_enabled, PG_LEVEL_1G); + kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); if (nrips) { @@ -924,6 +924,21 @@ static __init int svm_hardware_setup(void) svm_set_cpu_caps(); + /* + * It seems that on AMD processors PTE's accessed bit is + * being set by the CPU hardware before the NPF vmexit. 
+ * This is not expected behaviour and our tests fail because + * of it. + * A workaround here is to disable support for + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. + * In this case userspace can know if there is support using + * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle + * it + * If future AMD CPU models change the behaviour described above, + * this variable can be changed accordingly + */ + allow_smaller_maxphyaddr = !npt_enabled; + return 0; err: @@ -966,7 +981,7 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) svm->vmcb->control.tsc_offset = offset + g_tsc_offset; - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); return svm->vmcb->control.tsc_offset; } @@ -1002,38 +1017,38 @@ static void init_vmcb(struct vcpu_svm *svm) if (enable_vmware_backdoor) set_exception_intercept(svm, GP_VECTOR); - set_intercept(svm, INTERCEPT_INTR); - set_intercept(svm, INTERCEPT_NMI); - set_intercept(svm, INTERCEPT_SMI); - set_intercept(svm, INTERCEPT_SELECTIVE_CR0); - set_intercept(svm, INTERCEPT_RDPMC); - set_intercept(svm, INTERCEPT_CPUID); - set_intercept(svm, INTERCEPT_INVD); - set_intercept(svm, INTERCEPT_INVLPG); - set_intercept(svm, INTERCEPT_INVLPGA); - set_intercept(svm, INTERCEPT_IOIO_PROT); - set_intercept(svm, INTERCEPT_MSR_PROT); - set_intercept(svm, INTERCEPT_TASK_SWITCH); - set_intercept(svm, INTERCEPT_SHUTDOWN); - set_intercept(svm, INTERCEPT_VMRUN); - set_intercept(svm, INTERCEPT_VMMCALL); - set_intercept(svm, INTERCEPT_VMLOAD); - set_intercept(svm, INTERCEPT_VMSAVE); - set_intercept(svm, INTERCEPT_STGI); - set_intercept(svm, INTERCEPT_CLGI); - set_intercept(svm, INTERCEPT_SKINIT); - set_intercept(svm, INTERCEPT_WBINVD); - set_intercept(svm, INTERCEPT_XSETBV); - set_intercept(svm, INTERCEPT_RDPRU); - set_intercept(svm, INTERCEPT_RSM); + svm_set_intercept(svm, INTERCEPT_INTR); + svm_set_intercept(svm, INTERCEPT_NMI); + svm_set_intercept(svm, INTERCEPT_SMI); + svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0); + svm_set_intercept(svm, INTERCEPT_RDPMC); + svm_set_intercept(svm, INTERCEPT_CPUID); + svm_set_intercept(svm, INTERCEPT_INVD); + svm_set_intercept(svm, INTERCEPT_INVLPG); + svm_set_intercept(svm, INTERCEPT_INVLPGA); + svm_set_intercept(svm, INTERCEPT_IOIO_PROT); + svm_set_intercept(svm, INTERCEPT_MSR_PROT); + svm_set_intercept(svm, INTERCEPT_TASK_SWITCH); + svm_set_intercept(svm, INTERCEPT_SHUTDOWN); + svm_set_intercept(svm, INTERCEPT_VMRUN); + svm_set_intercept(svm, INTERCEPT_VMMCALL); + svm_set_intercept(svm, INTERCEPT_VMLOAD); + svm_set_intercept(svm, INTERCEPT_VMSAVE); + svm_set_intercept(svm, INTERCEPT_STGI); + svm_set_intercept(svm, INTERCEPT_CLGI); + svm_set_intercept(svm, INTERCEPT_SKINIT); + svm_set_intercept(svm, INTERCEPT_WBINVD); + svm_set_intercept(svm, INTERCEPT_XSETBV); + svm_set_intercept(svm, INTERCEPT_RDPRU); + svm_set_intercept(svm, INTERCEPT_RSM); if (!kvm_mwait_in_guest(svm->vcpu.kvm)) { - set_intercept(svm, INTERCEPT_MONITOR); - set_intercept(svm, INTERCEPT_MWAIT); + svm_set_intercept(svm, INTERCEPT_MONITOR); + svm_set_intercept(svm, INTERCEPT_MWAIT); } if (!kvm_hlt_in_guest(svm->vcpu.kvm)) - set_intercept(svm, INTERCEPT_HLT); + svm_set_intercept(svm, INTERCEPT_HLT); control->iopm_base_pa = __sme_set(iopm_base); control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); @@ -1077,7 +1092,7 @@ static void init_vmcb(struct vcpu_svm *svm) if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; - clr_intercept(svm, INTERCEPT_INVLPG); 
+ svm_clr_intercept(svm, INTERCEPT_INVLPG); clr_exception_intercept(svm, PF_VECTOR); clr_cr_intercept(svm, INTERCEPT_CR3_READ); clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); @@ -1094,9 +1109,9 @@ static void init_vmcb(struct vcpu_svm *svm) control->pause_filter_count = pause_filter_count; if (pause_filter_thresh) control->pause_filter_thresh = pause_filter_thresh; - set_intercept(svm, INTERCEPT_PAUSE); + svm_set_intercept(svm, INTERCEPT_PAUSE); } else { - clr_intercept(svm, INTERCEPT_PAUSE); + svm_clr_intercept(svm, INTERCEPT_PAUSE); } if (kvm_vcpu_apicv_active(&svm->vcpu)) @@ -1107,14 +1122,14 @@ static void init_vmcb(struct vcpu_svm *svm) * in VMCB and clear intercepts to avoid #VMEXIT. */ if (vls) { - clr_intercept(svm, INTERCEPT_VMLOAD); - clr_intercept(svm, INTERCEPT_VMSAVE); + svm_clr_intercept(svm, INTERCEPT_VMLOAD); + svm_clr_intercept(svm, INTERCEPT_VMSAVE); svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; } if (vgif) { - clr_intercept(svm, INTERCEPT_STGI); - clr_intercept(svm, INTERCEPT_CLGI); + svm_clr_intercept(svm, INTERCEPT_STGI); + svm_clr_intercept(svm, INTERCEPT_CLGI); svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } @@ -1123,7 +1138,7 @@ static void init_vmcb(struct vcpu_svm *svm) clr_exception_intercept(svm, UD_VECTOR); } - mark_all_dirty(svm->vmcb); + vmcb_mark_all_dirty(svm->vmcb); enable_gif(svm); @@ -1257,7 +1272,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (unlikely(cpu != vcpu->cpu)) { svm->asid_generation = 0; - mark_all_dirty(svm->vmcb); + vmcb_mark_all_dirty(svm->vmcb); } #ifdef CONFIG_X86_64 @@ -1356,7 +1371,7 @@ static void svm_set_vintr(struct vcpu_svm *svm) /* The following fields are ignored when AVIC is enabled */ WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu)); - set_intercept(svm, INTERCEPT_VINTR); + svm_set_intercept(svm, INTERCEPT_VINTR); /* * This is just a dummy VINTR to actually cause a vmexit to happen. @@ -1367,13 +1382,13 @@ static void svm_set_vintr(struct vcpu_svm *svm) control->int_ctl &= ~V_INTR_PRIO_MASK; control->int_ctl |= V_IRQ_MASK | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); - mark_dirty(svm->vmcb, VMCB_INTR); + vmcb_mark_dirty(svm->vmcb, VMCB_INTR); } static void svm_clear_vintr(struct vcpu_svm *svm) { const u32 mask = V_TPR_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK | V_INTR_MASKING_MASK; - clr_intercept(svm, INTERCEPT_VINTR); + svm_clr_intercept(svm, INTERCEPT_VINTR); /* Drop int_ctl fields related to VINTR injection. 
*/ svm->vmcb->control.int_ctl &= mask; @@ -1385,7 +1400,7 @@ static void svm_clear_vintr(struct vcpu_svm *svm) svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask; } - mark_dirty(svm->vmcb, VMCB_INTR); + vmcb_mark_dirty(svm->vmcb, VMCB_INTR); } static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) @@ -1503,7 +1518,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) svm->vmcb->save.idtr.limit = dt->size; svm->vmcb->save.idtr.base = dt->address ; - mark_dirty(svm->vmcb, VMCB_DT); + vmcb_mark_dirty(svm->vmcb, VMCB_DT); } static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) @@ -1520,7 +1535,7 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) svm->vmcb->save.gdtr.limit = dt->size; svm->vmcb->save.gdtr.base = dt->address ; - mark_dirty(svm->vmcb, VMCB_DT); + vmcb_mark_dirty(svm->vmcb, VMCB_DT); } static void update_cr0_intercept(struct vcpu_svm *svm) @@ -1531,7 +1546,7 @@ static void update_cr0_intercept(struct vcpu_svm *svm) *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) | (gcr0 & SVM_CR0_SELECTIVE_MASK); - mark_dirty(svm->vmcb, VMCB_CR); + vmcb_mark_dirty(svm->vmcb, VMCB_CR); if (gcr0 == *hcr0) { clr_cr_intercept(svm, INTERCEPT_CR0_READ); @@ -1572,7 +1587,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; - mark_dirty(svm->vmcb, VMCB_CR); + vmcb_mark_dirty(svm->vmcb, VMCB_CR); update_cr0_intercept(svm); } @@ -1592,7 +1607,7 @@ int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) cr4 |= X86_CR4_PAE; cr4 |= host_cr4_mce; to_svm(vcpu)->vmcb->save.cr4 = cr4; - mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); + vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); return 0; } @@ -1624,10 +1639,10 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, /* This is symmetric with svm_get_segment() */ svm->vmcb->save.cpl = (var->dpl & 3); - mark_dirty(svm->vmcb, VMCB_SEG); + vmcb_mark_dirty(svm->vmcb, VMCB_SEG); } -static void update_bp_intercept(struct kvm_vcpu *vcpu) +static void update_exception_bitmap(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1636,8 +1651,7 @@ static void update_bp_intercept(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) set_exception_intercept(svm, BP_VECTOR); - } else - vcpu->guest_debug = 0; + } } static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) @@ -1651,7 +1665,7 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->asid_generation = sd->asid_generation; svm->vmcb->control.asid = sd->next_asid++; - mark_dirty(svm->vmcb, VMCB_ASID); + vmcb_mark_dirty(svm->vmcb, VMCB_ASID); } static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) @@ -1660,7 +1674,7 @@ static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) if (unlikely(value != vmcb->save.dr6)) { vmcb->save.dr6 = value; - mark_dirty(vmcb, VMCB_DR); + vmcb_mark_dirty(vmcb, VMCB_DR); } } @@ -1687,7 +1701,7 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->save.dr7 = value; - mark_dirty(svm->vmcb, VMCB_DR); + vmcb_mark_dirty(svm->vmcb, VMCB_DR); } static int pf_interception(struct vcpu_svm *svm) @@ -2000,8 +2014,8 @@ void svm_set_gif(struct vcpu_svm *svm, bool value) * again while processing KVM_REQ_EVENT if needed. 
*/ if (vgif_enabled(svm)) - clr_intercept(svm, INTERCEPT_STGI); - if (is_intercept(svm, INTERCEPT_VINTR)) + svm_clr_intercept(svm, INTERCEPT_STGI); + if (svm_is_intercept(svm, INTERCEPT_VINTR)) svm_clear_vintr(svm); enable_gif(svm); @@ -2162,7 +2176,7 @@ static int cpuid_interception(struct vcpu_svm *svm) static int iret_interception(struct vcpu_svm *svm) { ++svm->vcpu.stat.nmi_window_exits; - clr_intercept(svm, INTERCEPT_IRET); + svm_clr_intercept(svm, INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); @@ -2358,8 +2372,10 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr) if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; break; + case MSR_IA32_PERF_CAPABILITIES: + return 0; default: - return 1; + return KVM_MSR_RET_INVALID; } return 0; @@ -2512,7 +2528,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) return 1; vcpu->arch.pat = data; svm->vmcb->save.g_pat = data; - mark_dirty(svm->vmcb, VMCB_NPT); + vmcb_mark_dirty(svm->vmcb, VMCB_NPT); break; case MSR_IA32_SPEC_CTRL: if (!msr->host_initiated && @@ -2522,7 +2538,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) return 1; - if (data & ~kvm_spec_ctrl_valid_bits(vcpu)) + if (kvm_spec_ctrl_test_value(data)) return 1; svm->spec_ctrl = data; @@ -2617,7 +2633,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) return 1; svm->vmcb->save.dbgctl = data; - mark_dirty(svm->vmcb, VMCB_LBR); + vmcb_mark_dirty(svm->vmcb, VMCB_LBR); if (data & (1ULL<<0)) svm_enable_lbrv(svm); else @@ -2947,6 +2963,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = svm->vmcb->control.exit_code; + kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; dump_vmcb(vcpu); return 0; } @@ -2970,8 +2987,9 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 1; + vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_code; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; return 0; } @@ -2992,21 +3010,18 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) static void reload_tss(struct kvm_vcpu *vcpu) { - int cpu = raw_smp_processor_id(); + struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); sd->tss_desc->type = 9; /* available 32/64-bit TSS */ load_TR_desc(); } static void pre_svm_run(struct vcpu_svm *svm) { - int cpu = raw_smp_processor_id(); - - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu); if (sev_guest(svm->vcpu.kvm)) - return pre_sev_run(svm, cpu); + return pre_sev_run(svm, svm->vcpu.cpu); /* FIXME: handle wraparound of asid_generation */ if (svm->asid_generation != sd->asid_generation) @@ -3019,7 +3034,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; vcpu->arch.hflags |= HF_NMI_MASK; - set_intercept(svm, INTERCEPT_IRET); + svm_set_intercept(svm, INTERCEPT_IRET); ++vcpu->stat.nmi_injections; } @@ -3040,7 +3055,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { 
 struct vcpu_svm *svm = to_svm(vcpu);
- if (svm_nested_virtualize_tpr(vcpu))
+ if (nested_svm_virtualize_tpr(vcpu))
 return;
 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
@@ -3096,10 +3111,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 if (masked) {
 svm->vcpu.arch.hflags |= HF_NMI_MASK;
- set_intercept(svm, INTERCEPT_IRET);
+ svm_set_intercept(svm, INTERCEPT_IRET);
 } else {
 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
- clr_intercept(svm, INTERCEPT_IRET);
+ svm_clr_intercept(svm, INTERCEPT_IRET);
 }
 }
@@ -3179,7 +3194,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 if (!gif_set(svm)) {
 if (vgif_enabled(svm))
- set_intercept(svm, INTERCEPT_STGI);
+ svm_set_intercept(svm, INTERCEPT_STGI);
 return; /* STGI will cause a vm exit */
 }
@@ -3234,7 +3249,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
 {
 struct vcpu_svm *svm = to_svm(vcpu);
- if (svm_nested_virtualize_tpr(vcpu))
+ if (nested_svm_virtualize_tpr(vcpu))
 return;
 if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
@@ -3248,7 +3263,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
 struct vcpu_svm *svm = to_svm(vcpu);
 u64 cr8;
- if (svm_nested_virtualize_tpr(vcpu) ||
+ if (nested_svm_virtualize_tpr(vcpu) ||
 kvm_vcpu_apicv_active(vcpu))
 return;
@@ -3344,6 +3359,60 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
 void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs);
+static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+ struct vcpu_svm *svm)
+{
+ /*
+ * VMENTER enables interrupts (host state), but the kernel state is
+ * interrupts disabled when this is invoked. Also tell RCU about
+ * it. This is the same logic as for exit_to_user_mode().
+ *
+ * This ensures that e.g. latency analysis on the host observes
+ * guest mode as interrupt enabled.
+ *
+ * guest_enter_irqoff() informs context tracking about the
+ * transition to guest mode and if enabled adjusts RCU state
+ * accordingly.
+ */
+ instrumentation_begin();
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ instrumentation_end();
+
+ guest_enter_irqoff();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+
+ __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs);
+
+#ifdef CONFIG_X86_64
+ native_wrmsrl(MSR_GS_BASE, svm->host.gs_base);
+#else
+ loadsegment(fs, svm->host.fs);
+#ifndef CONFIG_X86_32_LAZY_GS
+ loadsegment(gs, svm->host.gs);
+#endif
+#endif
+
+ /*
+ * VMEXIT disables interrupts (host state), but tracing and lockdep
+ * have them in state 'on' as recorded before entering guest mode.
+ * Same as enter_from_user_mode().
+ *
+ * guest_exit_irqoff() restores host context and reinstates RCU if
+ * enabled and required.
+ *
+ * This needs to be done before the below as native_read_msr()
+ * contains a tracepoint and x86_spec_ctrl_restore_host() calls
+ * into world and some more.
+ */ + lockdep_hardirqs_off(CALLER_ADDR0); + guest_exit_irqoff(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); +} + static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) { fastpath_t exit_fastpath; @@ -3399,16 +3468,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); - __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); - -#ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, svm->host.gs_base); -#else - loadsegment(fs, svm->host.fs); -#ifndef CONFIG_X86_32_LAZY_GS - loadsegment(gs, svm->host.gs); -#endif -#endif + svm_vcpu_enter_exit(vcpu, svm); /* * We do not use IBRS in the kernel. If this vCPU has used the @@ -3477,11 +3537,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) SVM_EXIT_EXCP_BASE + MC_VECTOR)) svm_handle_mce(svm); - mark_all_clean(svm->vmcb); + vmcb_mark_all_clean(svm->vmcb); return exit_fastpath; } -static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) +static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root, + int root_level) { struct vcpu_svm *svm = to_svm(vcpu); unsigned long cr3; @@ -3489,7 +3550,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) cr3 = __sme_set(root); if (npt_enabled) { svm->vmcb->control.nested_cr3 = cr3; - mark_dirty(svm->vmcb, VMCB_NPT); + vmcb_mark_dirty(svm->vmcb, VMCB_NPT); /* Loading L2's CR3 is handled by enter_svm_guest_mode. */ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) @@ -3498,7 +3559,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) } svm->vmcb->save.cr3 = cr3; - mark_dirty(svm->vmcb, VMCB_CR); + vmcb_mark_dirty(svm->vmcb, VMCB_CR); } static int is_disabled(void) @@ -3551,7 +3612,7 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return 0; } -static void svm_cpuid_update(struct kvm_vcpu *vcpu) +static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3843,6 +3904,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) struct kvm_host_map map; u64 guest; u64 vmcb; + int ret = 0; guest = GET_SMSTATE(u64, smstate, 0x7ed8); vmcb = GET_SMSTATE(u64, smstate, 0x7ee0); @@ -3851,10 +3913,11 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL) return 1; nested_vmcb = map.hva; - enter_svm_guest_mode(svm, vmcb, nested_vmcb); + ret = enter_svm_guest_mode(svm, vmcb, nested_vmcb); kvm_vcpu_unmap(&svm->vcpu, &map, true); } - return 0; + + return ret; } static void enable_smi_window(struct kvm_vcpu *vcpu) @@ -3863,7 +3926,7 @@ static void enable_smi_window(struct kvm_vcpu *vcpu) if (!gif_set(svm)) { if (vgif_enabled(svm)) - set_intercept(svm, INTERCEPT_STGI); + svm_set_intercept(svm, INTERCEPT_STGI); /* STGI will cause a vm exit */ } else { /* We must be in SMM; RSM will cause a vmexit anyway. 
*/ @@ -3992,7 +4055,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_blocking = svm_vcpu_blocking, .vcpu_unblocking = svm_vcpu_unblocking, - .update_bp_intercept = update_bp_intercept, + .update_exception_bitmap = update_exception_bitmap, .get_msr_feature = svm_get_msr_feature, .get_msr = svm_get_msr, .set_msr = svm_set_msr, @@ -4049,12 +4112,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_tss_addr = svm_set_tss_addr, .set_identity_map_addr = svm_set_identity_map_addr, - .get_tdp_level = get_npt_level, .get_mt_mask = svm_get_mt_mask, .get_exit_info = svm_get_exit_info, - .cpuid_update = svm_cpuid_update, + .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, .has_wbinvd_exit = svm_has_wbinvd_exit, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 6ac4c00a5d82..a798e1731709 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -81,7 +81,7 @@ struct kvm_svm { struct kvm_vcpu; -struct nested_state { +struct svm_nested_state { struct vmcb *hsave; u64 hsave_msr; u64 vm_cr_msr; @@ -133,7 +133,7 @@ struct vcpu_svm { ulong nmi_iret_rip; - struct nested_state nested; + struct svm_nested_state nested; bool nmi_singlestep; u64 nmi_singlestep_guest_rflags; @@ -158,9 +158,6 @@ struct vcpu_svm { */ struct list_head ir_list; spinlock_t ir_list_lock; - - /* which host CPU was used for running this vcpu */ - unsigned int last_cpu; }; struct svm_cpu_data { @@ -188,18 +185,18 @@ static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) return container_of(kvm, struct kvm_svm, kvm); } -static inline void mark_all_dirty(struct vmcb *vmcb) +static inline void vmcb_mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; } -static inline void mark_all_clean(struct vmcb *vmcb) +static inline void vmcb_mark_all_clean(struct vmcb *vmcb) { vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1) & ~VMCB_ALWAYS_DIRTY_MASK; } -static inline void mark_dirty(struct vmcb *vmcb, int bit) +static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit) { vmcb->control.clean &= ~(1 << bit); } @@ -293,7 +290,7 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit) recalc_intercepts(svm); } -static inline void set_intercept(struct vcpu_svm *svm, int bit) +static inline void svm_set_intercept(struct vcpu_svm *svm, int bit) { struct vmcb *vmcb = get_host_vmcb(svm); @@ -302,7 +299,7 @@ static inline void set_intercept(struct vcpu_svm *svm, int bit) recalc_intercepts(svm); } -static inline void clr_intercept(struct vcpu_svm *svm, int bit) +static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit) { struct vmcb *vmcb = get_host_vmcb(svm); @@ -311,7 +308,7 @@ static inline void clr_intercept(struct vcpu_svm *svm, int bit) recalc_intercepts(svm); } -static inline bool is_intercept(struct vcpu_svm *svm, int bit) +static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) { return (svm->vmcb->control.intercept & (1ULL << bit)) != 0; } @@ -346,7 +343,10 @@ static inline bool gif_set(struct vcpu_svm *svm) } /* svm.c */ -#define MSR_INVALID 0xffffffffU +#define MSR_CR3_LEGACY_RESERVED_MASK 0xfe7U +#define MSR_CR3_LEGACY_PAE_RESERVED_MASK 0x7U +#define MSR_CR3_LONG_RESERVED_MASK 0xfff0000000000fe7U +#define MSR_INVALID 0xffffffffU u32 svm_msrpm_offset(u32 msr); void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); @@ -365,7 +365,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value); #define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ #define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ -static inline bool 
svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu) +static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -387,8 +387,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_NMI)); } -void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb); +int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, + struct vmcb *nested_vmcb); void svm_leave_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct vcpu_svm *svm); void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb); @@ -420,7 +420,7 @@ extern int avic; static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) { svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; - mark_dirty(svm->vmcb, VMCB_AVIC); + vmcb_mark_dirty(svm->vmcb, VMCB_AVIC); } static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index bf944334003a..1ec1ac40e328 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -27,7 +27,7 @@ #define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE #endif - .text +.section .noinstr.text, "ax" /** * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 11e4df560018..23b58c28a1c9 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -171,15 +171,6 @@ static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) static int nested_vmx_failValid(struct kvm_vcpu *vcpu, u32 vm_instruction_error) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * failValid writes the error number to the current VMCS, which - * can't be done if there isn't a current VMCS. - */ - if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) - return nested_vmx_failInvalid(vcpu); - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_SF | X86_EFLAGS_OF)) @@ -192,6 +183,20 @@ static int nested_vmx_failValid(struct kvm_vcpu *vcpu, return kvm_skip_emulated_instruction(vcpu); } +static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * failValid writes the error number to the current VMCS, which + * can't be done if there isn't a current VMCS. + */ + if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) + return nested_vmx_failInvalid(vcpu); + + return nested_vmx_failValid(vcpu, vm_instruction_error); +} + static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) { /* TODO: not to reset guest simply here. */ @@ -2157,7 +2162,8 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) * consistency checks. */ if (enable_ept && nested_early_check) - vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0)); + vmcs_write64(EPT_POINTER, + construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); /* All VMFUNCs are currently emulated through L0 vmexits. */ if (cpu_has_vmx_vmfunc()) @@ -2433,22 +2439,28 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) /* * Whether page-faults are trapped is determined by a combination of - * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. - * If enable_ept, L0 doesn't care about page faults and we should - * set all of these to L1's desires. However, if !enable_ept, L0 does - * care about (at least some) page faults, and because it is not easy - * (if at all possible?) 
to merge L0 and L1's desires, we simply ask - * to exit on each and every L2 page fault. This is done by setting - * MASK=MATCH=0 and (see below) EB.PF=1. + * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 + * doesn't care about page faults then we should set all of these to + * L1's desires. However, if L0 does care about (some) page faults, it + * is not easy (if at all possible?) to merge L0 and L1's desires, we + * simply ask to exit on each and every L2 page fault. This is done by + * setting MASK=MATCH=0 and (see below) EB.PF=1. * Note that below we don't need special code to set EB.PF beyond the * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when * !enable_ept, EB.PF is 1, so the "or" will always be 1. */ - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, - enable_ept ? vmcs12->page_fault_error_code_mask : 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, - enable_ept ? vmcs12->page_fault_error_code_match : 0); + if (vmx_need_pf_intercept(&vmx->vcpu)) { + /* + * TODO: if both L0 and L1 need the same MASK and MATCH, + * go ahead and use it? + */ + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); + } else { + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); + } if (cpu_has_vmx_apicv()) { vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); @@ -3205,6 +3217,43 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) return true; } +static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + struct vmcs12 *vmcs12; + struct vcpu_vmx *vmx = to_vmx(vcpu); + gpa_t dst; + + if (WARN_ON_ONCE(!is_guest_mode(vcpu))) + return 0; + + if (WARN_ON_ONCE(vmx->nested.pml_full)) + return 1; + + /* + * Check if PML is enabled for the nested guest. Whether eptp bit 6 is + * set is already checked as part of A/D emulation. + */ + vmcs12 = get_vmcs12(vcpu); + if (!nested_cpu_has_pml(vmcs12)) + return 0; + + if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { + vmx->nested.pml_full = true; + return 1; + } + + gpa &= ~0xFFFull; + dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; + + if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, + offset_in_page(dst), sizeof(gpa))) + return 0; + + vmcs12->guest_pml_index--; + + return 0; +} + /* * Intel's VMX Instruction Reference specifies a common set of prerequisites * for running VMX instructions (except VMXON, whose prerequisites are @@ -3456,19 +3505,18 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * when using the merged vmcs02. */ if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) - return nested_vmx_failValid(vcpu, - VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); + return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); if (vmcs12->launch_state == launch) - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); if (nested_vmx_check_controls(vcpu, vmcs12)) - return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); if (nested_vmx_check_host_state(vcpu, vmcs12)) - return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); + return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); /* * We're finally done with prerequisite checking, and can start with @@ -3517,7 +3565,7 @@ vmentry_failed: if (status == NVMX_VMENTRY_VMEXIT) return 1; WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); - return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); } /* @@ -4460,7 +4508,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, * flag and the VM-instruction error field of the VMCS * accordingly, and skip the emulated instruction. */ - (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); /* * Restore L1's host state to KVM's software model. We're here @@ -4760,8 +4808,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) } if (vmx->nested.vmxon) - return nested_vmx_failValid(vcpu, - VMXERR_VMXON_IN_VMX_ROOT_OPERATION); + return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) != VMXON_NEEDED_FEATURES) { @@ -4852,12 +4899,10 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) return r; if (!page_address_valid(vcpu, vmptr)) - return nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_INVALID_ADDRESS); + return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); if (vmptr == vmx->nested.vmxon_ptr) - return nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_VMXON_POINTER); + return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); /* * When Enlightened VMEntry is enabled on the calling CPU we treat @@ -4927,8 +4972,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) offset = vmcs_field_to_offset(field); if (offset < 0) - return nested_vmx_failValid(vcpu, - VMXERR_UNSUPPORTED_VMCS_COMPONENT); + return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); @@ -5031,8 +5075,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) offset = vmcs_field_to_offset(field); if (offset < 0) - return nested_vmx_failValid(vcpu, - VMXERR_UNSUPPORTED_VMCS_COMPONENT); + return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); /* * If the vCPU supports "VMWRITE to any supported field in the @@ -5040,8 +5083,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) */ if (vmcs_field_readonly(field) && !nested_cpu_has_vmwrite_any_field(vcpu)) - return nested_vmx_failValid(vcpu, - VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); + return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); /* * Ensure vmcs12 is up-to-date before any VMWRITE that dirties @@ -5116,12 +5158,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return r; if (!page_address_valid(vcpu, vmptr)) - return nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_INVALID_ADDRESS); + return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); if (vmptr == vmx->nested.vmxon_ptr) - return nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_VMXON_POINTER); + return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); /* Forbid normal VMPTRLD if Enlightened version was used */ if 
(vmx->nested.hv_evmcs) @@ -5138,7 +5178,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) * given physical address won't match the required * VMCS12_REVISION identifier. */ - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } @@ -5148,7 +5188,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) (new_vmcs12->hdr.shadow_vmcs && !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { kvm_vcpu_unmap(vcpu, &map, false); - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } @@ -5233,8 +5273,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; if (type >= 32 || !(types & (1 << type))) - return nested_vmx_failValid(vcpu, - VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); /* According to the Intel VMX instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) @@ -5255,7 +5294,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) switch (type) { case VMX_EPT_EXTENT_CONTEXT: if (!nested_vmx_check_eptp(vcpu, operand.eptp)) - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); roots_to_free = 0; @@ -5315,7 +5354,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; if (type >= 32 || !(types & (1 << type))) - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); /* according to the intel vmx instruction reference, the memory @@ -5329,7 +5368,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return vmx_handle_memory_failure(vcpu, r, &e); if (operand.vpid >> 16) - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); vpid02 = nested_get_vpid02(vcpu); @@ -5337,14 +5376,14 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: if (!operand.vpid || is_noncanonical_address(operand.gla, vcpu)) - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); vpid_sync_vcpu_addr(vpid02, operand.gla); break; case VMX_VPID_EXTENT_SINGLE_CONTEXT: case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: if (!operand.vpid) - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); vpid_sync_context(vpid02); break; @@ -6333,7 +6372,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) /* * secondary cpu-based controls. Do not include those that - * depend on CPUID bits, they are added later by vmx_cpuid_update. + * depend on CPUID bits, they are added later by + * vmx_vcpu_after_set_cpuid. 
*/ if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, @@ -6514,6 +6554,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = { .get_state = vmx_get_nested_state, .set_state = vmx_set_nested_state, .get_vmcs12_pages = nested_get_vmcs12_pages, + .write_log_dirty = nested_vmx_write_pml_buffer, .enable_evmcs = nested_enable_evmcs, .get_evmcs_version = nested_get_evmcs_version, }; diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h index 5f1ac002b4b6..692b0c31c9c8 100644 --- a/arch/x86/kvm/vmx/ops.h +++ b/arch/x86/kvm/vmx/ops.h @@ -146,7 +146,9 @@ do { \ : : op1 : "cc" : error, fault); \ return; \ error: \ + instrumentation_begin(); \ insn##_error(error_args); \ + instrumentation_end(); \ return; \ fault: \ kvm_spurious_fault(); \ @@ -161,7 +163,9 @@ do { \ : : op1, op2 : "cc" : error, fault); \ return; \ error: \ + instrumentation_begin(); \ insn##_error(error_args); \ + instrumentation_end(); \ return; \ fault: \ kvm_spurious_fault(); \ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index bdcce65c7a1d..a886a47daebd 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -180,9 +180,6 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: ret = pmu->version > 1; break; - case MSR_IA32_PERF_CAPABILITIES: - ret = 1; - break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || @@ -224,12 +221,6 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: msr_info->data = pmu->global_ovf_ctrl; return 0; - case MSR_IA32_PERF_CAPABILITIES: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) - return 1; - msr_info->data = vcpu->arch.perf_capabilities; - return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -289,14 +280,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } break; - case MSR_IA32_PERF_CAPABILITIES: - if (!msr_info->host_initiated) - return 1; - if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) ? 
- (data & ~vmx_get_perf_capabilities()) : data) - return 1; - vcpu->arch.perf_capabilities = data; - return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index e0a182cb3cdd..799db084a336 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -27,7 +27,7 @@ #define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE #endif - .text +.section .noinstr.text, "ax" /** * vmx_vmenter - VM-Enter the current loaded VMCS @@ -234,6 +234,9 @@ SYM_FUNC_START(__vmx_vcpu_run) jmp 1b SYM_FUNC_END(__vmx_vcpu_run) + +.section .text, "ax" + /** * vmread_error_trampoline - Trampoline from inline asm to vmread_error() * @field: VMCS field encoding that failed diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 13745f2a5ecd..46ba2e03a892 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -27,6 +27,7 @@ #include <linux/slab.h> #include <linux/tboot.h> #include <linux/trace_events.h> +#include <linux/entry-kvm.h> #include <asm/apic.h> #include <asm/asm.h> @@ -780,7 +781,7 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu) eb |= 1u << BP_VECTOR; if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; - if (enable_ept) + if (!vmx_need_pf_intercept(vcpu)) eb &= ~(1u << PF_VECTOR); /* When we are running a nested L2 guest and L1 specified for it a @@ -1169,7 +1170,7 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) gs_base = cpu_kernelmode_gs_base(cpu); if (likely(is_64bit_mm(current->mm))) { - save_fsgs_for_kvm(); + current_save_fsgs(); fs_sel = current->thread.fsindex; gs_sel = current->thread.gsindex; fs_base = current->thread.fsbase; @@ -1815,7 +1816,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) msr->data = vmx_get_perf_capabilities(); return 0; default: - return 1; + return KVM_MSR_RET_INVALID; } } @@ -2062,7 +2063,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) return 1; - if (data & ~kvm_spec_ctrl_valid_bits(vcpu)) + if (kvm_spec_ctrl_test_value(data)) return 1; vmx->spec_ctrl = data; @@ -2933,14 +2934,16 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { - u64 root_hpa = vcpu->arch.mmu->root_hpa; + struct kvm_mmu *mmu = vcpu->arch.mmu; + u64 root_hpa = mmu->root_hpa; /* No flush required if the current context is invalid. */ if (!VALID_PAGE(root_hpa)) return; if (enable_ept) - ept_sync_context(construct_eptp(vcpu, root_hpa)); + ept_sync_context(construct_eptp(vcpu, root_hpa, + mmu->shadow_root_level)); else if (!is_guest_mode(vcpu)) vpid_sync_context(to_vmx(vcpu)->vpid); else @@ -3063,26 +3066,19 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx->emulation_required = emulation_required(vcpu); } -static int vmx_get_tdp_level(struct kvm_vcpu *vcpu) +static int vmx_get_max_tdp_level(void) { - if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) + if (cpu_has_vmx_ept_5levels()) return 5; return 4; } -static int get_ept_level(struct kvm_vcpu *vcpu) -{ - if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu))) - return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu)); - - return vmx_get_tdp_level(vcpu); -} - -u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) +u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, + int root_level) { u64 eptp = VMX_EPTP_MT_WB; - eptp |= (get_ept_level(vcpu) == 5) ? 
VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; + eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; if (enable_ept_ad_bits && (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) @@ -3092,7 +3088,8 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) return eptp; } -void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd) +static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd, + int pgd_level) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; @@ -3100,7 +3097,7 @@ void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd) u64 eptp; if (enable_ept) { - eptp = construct_eptp(vcpu, pgd); + eptp = construct_eptp(vcpu, pgd, pgd_level); vmcs_write64(EPT_POINTER, eptp); if (kvm_x86_ops.tlb_remote_flush) { @@ -4355,6 +4352,16 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx->pt_desc.guest.output_mask = 0x7F; vmcs_write64(GUEST_IA32_RTIT_CTL, 0); } + + /* + * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched + * between guest and host. In that case we only care about present + * faults. + */ + if (enable_ept) { + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, PFERR_PRESENT_MASK); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, PFERR_PRESENT_MASK); + } } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -4781,18 +4788,25 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; - vcpu->run->internal.ndata = 3; + vcpu->run->internal.ndata = 4; vcpu->run->internal.data[0] = vect_info; vcpu->run->internal.data[1] = intr_info; vcpu->run->internal.data[2] = error_code; + vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu; return 0; } if (is_page_fault(intr_info)) { cr2 = vmx_get_exit_qual(vcpu); - /* EPT won't cause page fault directly */ - WARN_ON_ONCE(!vcpu->arch.apf.host_apf_flags && enable_ept); - return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); + if (enable_ept && !vcpu->arch.apf.host_apf_flags) { + /* + * EPT will cause page fault only if we need to + * detect illegal GPAs. + */ + kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); + return 1; + } else + return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); } ex_no = intr_info & INTR_INFO_VECTOR_MASK; @@ -5308,6 +5322,18 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; vcpu->arch.exit_qualification = exit_qualification; + + /* + * Check that the GPA doesn't exceed physical memory limits, as that is + * a guest page fault. We have to emulate the instruction here, because + * if the illegal address is that of a paging structure, then + * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we + * would also use advanced VM-exit information for EPT violations to + * reconstruct the page fault error code. + */ + if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa))) + return kvm_emulate_instruction(vcpu, 0); + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } @@ -5373,14 +5399,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) } /* - * Note, return 1 and not 0, vcpu_run() is responsible for - * morphing the pending signal into the proper return code. + * Note, return 1 and not 0, vcpu_run() will invoke + * xfer_to_guest_mode() which will create a proper return + * code. 
*/ - if (signal_pending(current)) + if (__xfer_to_guest_mode_work_pending()) return 1; - - if (need_resched()) - schedule(); } return 1; @@ -6006,6 +6030,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason; + vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; return 0; } @@ -6014,6 +6039,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = vmcs_read32(VM_INSTRUCTION_ERROR); + vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; return 0; } @@ -6040,6 +6066,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->internal.data[3] = vmcs_read64(GUEST_PHYSICAL_ADDRESS); } + vcpu->run->internal.data[vcpu->run->internal.ndata++] = + vcpu->arch.last_vmentry_cpu; return 0; } @@ -6095,8 +6123,9 @@ unexpected_vmexit: vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 1; + vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_reason; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; return 0; } @@ -6110,7 +6139,7 @@ unexpected_vmexit: * information but as all relevant affected CPUs have 32KiB L1D cache size * there is no point in doing so. */ -static void vmx_l1d_flush(struct kvm_vcpu *vcpu) +static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) { int size = PAGE_SIZE << L1D_CACHE_ORDER; @@ -6143,7 +6172,7 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu) vcpu->stat.l1d_flush++; if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { - wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); return; } @@ -6629,7 +6658,7 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) } } -void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) +void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) { if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { vmx->loaded_vmcs->host_state.rsp = host_rsp; @@ -6651,6 +6680,63 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); +static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, + struct vcpu_vmx *vmx) +{ + /* + * VMENTER enables interrupts (host state), but the kernel state is + * interrupts disabled when this is invoked. Also tell RCU about + * it. This is the same logic as for exit_to_user_mode(). + * + * This ensures that e.g. latency analysis on the host observes + * guest mode as interrupt enabled. + * + * guest_enter_irqoff() informs context tracking about the + * transition to guest mode and if enabled adjusts RCU state + * accordingly. 
+ */ + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + + guest_enter_irqoff(); + lockdep_hardirqs_on(CALLER_ADDR0); + + /* L1D Flush includes CPU buffer clear to mitigate MDS */ + if (static_branch_unlikely(&vmx_l1d_should_flush)) + vmx_l1d_flush(vcpu); + else if (static_branch_unlikely(&mds_user_clear)) + mds_clear_cpu_buffers(); + + if (vcpu->arch.cr2 != native_read_cr2()) + native_write_cr2(vcpu->arch.cr2); + + vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, + vmx->loaded_vmcs->launched); + + vcpu->arch.cr2 = native_read_cr2(); + + /* + * VMEXIT disables interrupts (host state), but tracing and lockdep + * have them in state 'on' as recorded before entering guest mode. + * Same as enter_from_user_mode(). + * + * guest_exit_irqoff() restores host context and reinstates RCU if + * enabled and required. + * + * This needs to be done before the below as native_read_msr() + * contains a tracepoint and x86_spec_ctrl_restore_host() calls + * into world and some more. + */ + lockdep_hardirqs_off(CALLER_ADDR0); + guest_exit_irqoff(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); +} + static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) { fastpath_t exit_fastpath; @@ -6725,19 +6811,8 @@ reenter_guest: */ x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); - /* L1D Flush includes CPU buffer clear to mitigate MDS */ - if (static_branch_unlikely(&vmx_l1d_should_flush)) - vmx_l1d_flush(vcpu); - else if (static_branch_unlikely(&mds_user_clear)) - mds_clear_cpu_buffers(); - - if (vcpu->arch.cr2 != read_cr2()) - write_cr2(vcpu->arch.cr2); - - vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, - vmx->loaded_vmcs->launched); - - vcpu->arch.cr2 = read_cr2(); + /* The actual VMENTER/EXIT is in the .noinstr.text section. */ + vmx_vcpu_enter_exit(vcpu, vmx); /* * We do not use IBRS in the kernel. If this vCPU has used the @@ -7230,7 +7305,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); } -static void vmx_cpuid_update(struct kvm_vcpu *vcpu) +static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7479,42 +7554,6 @@ static void vmx_flush_log_dirty(struct kvm *kvm) kvm_flush_pml_buffers(kvm); } -static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - struct vmcs12 *vmcs12; - struct vcpu_vmx *vmx = to_vmx(vcpu); - gpa_t dst; - - if (is_guest_mode(vcpu)) { - WARN_ON_ONCE(vmx->nested.pml_full); - - /* - * Check if PML is enabled for the nested guest. - * Whether eptp bit 6 is set is already checked - * as part of A/D emulation. 
- */ - vmcs12 = get_vmcs12(vcpu); - if (!nested_cpu_has_pml(vmcs12)) - return 0; - - if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { - vmx->nested.pml_full = true; - return 1; - } - - gpa &= ~0xFFFull; - dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; - - if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, - offset_in_page(dst), sizeof(gpa))) - return 0; - - vmcs12->guest_pml_index--; - } - - return 0; -} - static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t offset, unsigned long mask) @@ -7859,7 +7898,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, - .update_bp_intercept = update_exception_bitmap, + .update_exception_bitmap = update_exception_bitmap, .get_msr_feature = vmx_get_msr_feature, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, @@ -7919,12 +7958,11 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_tss_addr = vmx_set_tss_addr, .set_identity_map_addr = vmx_set_identity_map_addr, - .get_tdp_level = vmx_get_tdp_level, .get_mt_mask = vmx_get_mt_mask, .get_exit_info = vmx_get_exit_info, - .cpuid_update = vmx_cpuid_update, + .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid, .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, @@ -7943,7 +7981,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .slot_disable_log_dirty = vmx_slot_disable_log_dirty, .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, - .write_log_dirty = vmx_write_pml_buffer, .pre_block = vmx_pre_block, .post_block = vmx_post_block, @@ -8071,7 +8108,7 @@ static __init int hardware_setup(void) ept_lpage_level = PG_LEVEL_2M; else ept_lpage_level = PG_LEVEL_4K; - kvm_configure_mmu(enable_ept, ept_lpage_level); + kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level); /* * Only enable PML when hardware supports PML feature, and both EPT @@ -8266,6 +8303,13 @@ static int __init vmx_init(void) #endif vmx_check_vmcs12_offsets(); + /* + * Intel processors don't have problems with + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable + * it for VMX by default + */ + allow_smaller_maxphyaddr = true; + return 0; } module_init(vmx_init); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 639798e4a6ca..26175a4759fa 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -11,6 +11,7 @@ #include "kvm_cache_regs.h" #include "ops.h" #include "vmcs.h" +#include "cpuid.h" extern const u32 vmx_msr_index[]; @@ -337,11 +338,11 @@ void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); -void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); +u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, + int root_level); void update_exception_bitmap(struct kvm_vcpu *vcpu); void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); @@ -536,8 +537,6 @@ static inline struct vmcs *alloc_vmcs(bool shadow) GFP_KERNEL_ACCOUNT); } -u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); - static inline void 
decache_tsc_multiplier(struct vcpu_vmx *vmx) { vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; @@ -550,6 +549,11 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; } +static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu) +{ + return !enable_ept || cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits; +} + void dump_vmcs(void); #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 88c593f83b28..599d73206299 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -56,6 +56,7 @@ #include <linux/sched/stat.h> #include <linux/sched/isolation.h> #include <linux/mem_encrypt.h> +#include <linux/entry-kvm.h> #include <trace/events/kvm.h> @@ -187,6 +188,9 @@ static struct kvm_shared_msrs __percpu *shared_msrs; u64 __read_mostly host_efer; EXPORT_SYMBOL_GPL(host_efer); +bool __read_mostly allow_smaller_maxphyaddr; +EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); + static u64 __read_mostly host_xss; u64 __read_mostly supported_xss; EXPORT_SYMBOL_GPL(supported_xss); @@ -243,6 +247,29 @@ static struct kmem_cache *x86_fpu_cache; static struct kmem_cache *x86_emulator_cache; +/* + * When called, it means the previous get/set msr reached an invalid msr. + * Return 0 if we want to ignore/silent this failed msr access, or 1 if we want + * to fail the caller. + */ +static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, + u64 data, bool write) +{ + const char *op = write ? "wrmsr" : "rdmsr"; + + if (ignore_msrs) { + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "ignored %s: 0x%x data 0x%llx\n", + op, msr, data); + /* Mask the error */ + return 0; + } else { + vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n", + op, msr, data); + return 1; + } +} + static struct kmem_cache *kvm_alloc_emulator_cache(void) { unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src); @@ -379,7 +406,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -asmlinkage __visible void kvm_spurious_fault(void) +asmlinkage __visible noinstr void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. 
*/ BUG_ON(!kvm_rebooting); @@ -775,6 +802,7 @@ EXPORT_SYMBOL_GPL(pdptrs_changed); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); + unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG; unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; cr0 |= X86_CR0_ET; @@ -792,22 +820,22 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) return 1; - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 - if ((vcpu->arch.efer & EFER_LME)) { - int cs_db, cs_l; + if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) && + (cr0 & X86_CR0_PG)) { + int cs_db, cs_l; - if (!is_pae(vcpu)) - return 1; - kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - if (cs_l) - return 1; - } else -#endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, - kvm_read_cr3(vcpu))) + if (!is_pae(vcpu)) + return 1; + kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + if (cs_l) return 1; } +#endif + if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) && + is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu))) + return 1; if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) return 1; @@ -916,7 +944,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) vcpu->arch.xcr0 = xcr0; if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); return 0; } @@ -931,37 +959,17 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); -#define __cr4_reserved_bits(__cpu_has, __c) \ -({ \ - u64 __reserved_bits = CR4_RESERVED_BITS; \ - \ - if (!__cpu_has(__c, X86_FEATURE_XSAVE)) \ - __reserved_bits |= X86_CR4_OSXSAVE; \ - if (!__cpu_has(__c, X86_FEATURE_SMEP)) \ - __reserved_bits |= X86_CR4_SMEP; \ - if (!__cpu_has(__c, X86_FEATURE_SMAP)) \ - __reserved_bits |= X86_CR4_SMAP; \ - if (!__cpu_has(__c, X86_FEATURE_FSGSBASE)) \ - __reserved_bits |= X86_CR4_FSGSBASE; \ - if (!__cpu_has(__c, X86_FEATURE_PKU)) \ - __reserved_bits |= X86_CR4_PKE; \ - if (!__cpu_has(__c, X86_FEATURE_LA57)) \ - __reserved_bits |= X86_CR4_LA57; \ - if (!__cpu_has(__c, X86_FEATURE_UMIP)) \ - __reserved_bits |= X86_CR4_UMIP; \ - __reserved_bits; \ -}) - -static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) return -EINVAL; - if (cr4 & __cr4_reserved_bits(guest_cpuid_has, vcpu)) + if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) return -EINVAL; return 0; } +EXPORT_SYMBOL_GPL(kvm_valid_cr4); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { @@ -1000,7 +1008,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_mmu_reset_context(vcpu); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); return 0; } @@ -1110,7 +1118,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) case 4: /* fall through */ case 6: - if (val & 0xffffffff00000000ULL) + if (!kvm_dr6_valid(val)) return -1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); break; @@ -1389,8 +1397,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr) rdmsrl_safe(msr->index, &msr->data); break; default: - if (kvm_x86_ops.get_msr_feature(msr)) - return 1; + return kvm_x86_ops.get_msr_feature(msr); } return 0; } @@ -1402,6 +1409,13 @@ static int do_get_msr_feature(struct 
kvm_vcpu *vcpu, unsigned index, u64 *data) msr.index = index; r = kvm_get_msr_feature(&msr); + + if (r == KVM_MSR_RET_INVALID) { + /* Unconditionally clear the output for simplicity */ + *data = 0; + r = kvm_msr_ignored_check(vcpu, index, 0, false); + } + if (r) return r; @@ -1516,6 +1530,17 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, return kvm_x86_ops.set_msr(vcpu, &msr); } +static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, + u32 index, u64 data, bool host_initiated) +{ + int ret = __kvm_set_msr(vcpu, index, data, host_initiated); + + if (ret == KVM_MSR_RET_INVALID) + ret = kvm_msr_ignored_check(vcpu, index, data, true); + + return ret; +} + /* * Read the MSR specified by @index into @data. Select MSR specific fault * checks are bypassed if @host_initiated is %true. @@ -1537,15 +1562,29 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, return ret; } +static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, + u32 index, u64 *data, bool host_initiated) +{ + int ret = __kvm_get_msr(vcpu, index, data, host_initiated); + + if (ret == KVM_MSR_RET_INVALID) { + /* Unconditionally clear *data for simplicity */ + *data = 0; + ret = kvm_msr_ignored_check(vcpu, index, 0, false); + } + + return ret; +} + int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) { - return __kvm_get_msr(vcpu, index, data, false); + return kvm_get_msr_ignored_check(vcpu, index, data, false); } EXPORT_SYMBOL_GPL(kvm_get_msr); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) { - return __kvm_set_msr(vcpu, index, data, false); + return kvm_set_msr_ignored_check(vcpu, index, data, false); } EXPORT_SYMBOL_GPL(kvm_set_msr); @@ -1587,7 +1626,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) { return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || - need_resched() || signal_pending(current); + xfer_to_guest_mode_work_pending(); } EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request); @@ -1665,12 +1704,12 @@ EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); */ static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - return __kvm_get_msr(vcpu, index, data, true); + return kvm_get_msr_ignored_check(vcpu, index, data, true); } static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - return __kvm_set_msr(vcpu, index, *data, true); + return kvm_set_msr_ignored_check(vcpu, index, *data, true); } #ifdef CONFIG_X86_64 @@ -2822,6 +2861,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.arch_capabilities = data; break; + case MSR_IA32_PERF_CAPABILITIES: { + struct kvm_msr_entry msr_ent = {.index = msr, .data = 0}; + + if (!msr_info->host_initiated) + return 1; + if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent)) + return 1; + if (data & ~msr_ent.data) + return 1; + + vcpu->arch.perf_capabilities = data; + + return 0; + } case MSR_EFER: return set_efer(vcpu, msr_info); case MSR_K7_HWCR: @@ -2881,7 +2934,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) return 1; vcpu->arch.ia32_misc_enable_msr = data; - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); } else { vcpu->arch.ia32_misc_enable_msr = data; } @@ -3066,17 +3119,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return xen_hvm_config(vcpu, data); if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); - if (!ignore_msrs) { - 
vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); - return 1; - } else { - if (report_ignored_msrs) - vcpu_unimpl(vcpu, - "ignored wrmsr: 0x%x data 0x%llx\n", - msr, data); - break; - } + return KVM_MSR_RET_INVALID; } return 0; } @@ -3172,6 +3215,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vcpu->arch.arch_capabilities; break; + case MSR_IA32_PERF_CAPABILITIES: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + return 1; + msr_info->data = vcpu->arch.perf_capabilities; + break; case MSR_IA32_POWER_CTL: msr_info->data = vcpu->arch.msr_ia32_power_ctl; break; @@ -3331,17 +3380,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); - if (!ignore_msrs) { - vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n", - msr_info->index); - return 1; - } else { - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", - msr_info->index); - msr_info->data = 0; - } - break; + return KVM_MSR_RET_INVALID; } return 0; } @@ -3476,6 +3515,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: case KVM_CAP_SET_GUEST_DEBUG: + case KVM_CAP_LAST_CPU: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -3538,6 +3578,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; break; + case KVM_CAP_SMALLER_MAXPHYADDR: + r = (int) allow_smaller_maxphyaddr; + break; default: break; } @@ -8154,7 +8197,7 @@ static void enter_smm(struct kvm_vcpu *vcpu) kvm_x86_ops.set_efer(vcpu, 0); #endif - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); kvm_mmu_reset_context(vcpu); } @@ -8506,7 +8549,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } trace_kvm_entry(vcpu->vcpu_id); - guest_enter_irqoff(); fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) @@ -8548,6 +8590,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); + vcpu->arch.last_vmentry_cpu = vcpu->cpu; vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); vcpu->mode = OUTSIDE_GUEST_MODE; @@ -8568,7 +8611,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) local_irq_disable(); kvm_after_interrupt(vcpu); - guest_exit_irqoff(); if (lapic_in_kernel(vcpu)) { s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; if (delta != S64_MIN) { @@ -8681,15 +8723,11 @@ static int vcpu_run(struct kvm_vcpu *vcpu) break; } - if (signal_pending(current)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - break; - } - if (need_resched()) { + if (__xfer_to_guest_mode_work_pending()) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - cond_resched(); + r = xfer_to_guest_mode_handle_work(vcpu); + if (r) + return r; vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); } } @@ -9177,7 +9215,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) (X86_CR4_OSXSAVE | X86_CR4_PKE)); kvm_x86_ops.set_cr4(vcpu, sregs->cr4); if (cpuid_update_needed) - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); if (is_pae_paging(vcpu)) { @@ -9281,7 +9319,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, */ kvm_set_rflags(vcpu, rflags); - kvm_x86_ops.update_bp_intercept(vcpu); + 
kvm_x86_ops.update_exception_bitmap(vcpu); r = 0; @@ -9479,7 +9517,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) fx_init(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu); vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; @@ -9932,7 +9969,7 @@ void kvm_arch_sync_events(struct kvm *kvm) int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int i, r; - unsigned long hva, uninitialized_var(old_npages); + unsigned long hva, old_npages; struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *slot; @@ -10630,11 +10667,17 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + int ret; irqfd->producer = prod; + kvm_arch_start_assignment(irqfd->kvm); + ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, + prod->irq, irqfd->gsi, 1); - return kvm_x86_ops.update_pi_irte(irqfd->kvm, - prod->irq, irqfd->gsi, 1); + if (ret) + kvm_arch_end_assignment(irqfd->kvm); + + return ret; } void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, @@ -10657,6 +10700,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, if (ret) printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); + + kvm_arch_end_assignment(irqfd->kvm); } int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, @@ -10676,28 +10721,53 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); -u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu) + +int kvm_spec_ctrl_test_value(u64 value) { - uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD; + /* + * test that setting IA32_SPEC_CTRL to given value + * is allowed by the host processor + */ + + u64 saved_value; + unsigned long flags; + int ret = 0; + + local_irq_save(flags); + + if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value)) + ret = 1; + else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value)) + ret = 1; + else + wrmsrl(MSR_IA32_SPEC_CTRL, saved_value); + + local_irq_restore(flags); - /* The STIBP bit doesn't fault even if it's not advertised */ - if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS)) - bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP); - if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) && - !boot_cpu_has(X86_FEATURE_AMD_IBRS)) - bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP); + return ret; +} +EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value); - if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) - bits &= ~SPEC_CTRL_SSBD; - if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && - !boot_cpu_has(X86_FEATURE_AMD_SSBD)) - bits &= ~SPEC_CTRL_SSBD; +void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code) +{ + struct x86_exception fault; - return bits; + if (!(error_code & PFERR_PRESENT_MASK) || + vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, error_code, &fault) != UNMAPPED_GVA) { + /* + * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page + * tables probably do not match the TLB. Just proceed + * with the error code that the processor gave. 
+ */ + fault.vector = PF_VECTOR; + fault.error_code_valid = true; + fault.error_code = error_code; + fault.nested_page_fault = false; + fault.address = gva; + } + vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault); } -EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits); +EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 6eb62e97e59f..995ab696dcf0 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -272,6 +272,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); bool kvm_vector_hashing_enabled(void); +void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); @@ -360,10 +361,41 @@ static inline bool kvm_dr7_valid(u64 data) /* Bits [63:32] are reserved */ return !(data >> 32); } +static inline bool kvm_dr6_valid(u64 data) +{ + /* Bits [63:32] are reserved */ + return !(data >> 32); +} void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); -u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu); +int kvm_spec_ctrl_test_value(u64 value); +int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); +#define KVM_MSR_RET_INVALID 2 + +#define __cr4_reserved_bits(__cpu_has, __c) \ +({ \ + u64 __reserved_bits = CR4_RESERVED_BITS; \ + \ + if (!__cpu_has(__c, X86_FEATURE_XSAVE)) \ + __reserved_bits |= X86_CR4_OSXSAVE; \ + if (!__cpu_has(__c, X86_FEATURE_SMEP)) \ + __reserved_bits |= X86_CR4_SMEP; \ + if (!__cpu_has(__c, X86_FEATURE_SMAP)) \ + __reserved_bits |= X86_CR4_SMAP; \ + if (!__cpu_has(__c, X86_FEATURE_FSGSBASE)) \ + __reserved_bits |= X86_CR4_FSGSBASE; \ + if (!__cpu_has(__c, X86_FEATURE_PKU)) \ + __reserved_bits |= X86_CR4_PKE; \ + if (!__cpu_has(__c, X86_FEATURE_LA57)) \ + __reserved_bits |= X86_CR4_LA57; \ + if (!__cpu_has(__c, X86_FEATURE_UMIP)) \ + __reserved_bits |= X86_CR4_UMIP; \ + if (!__cpu_has(__c, X86_FEATURE_VMX)) \ + __reserved_bits |= X86_CR4_VMXE; \ + __reserved_bits; \ +}) + #endif diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 6110bce7237b..d46fff11f06f 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -24,7 +24,7 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_cmdline.o = -pg endif -CFLAGS_cmdline.o := $(call cc-option, -fno-stack-protector) +CFLAGS_cmdline.o := -fno-stack-protector endif inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index a873da6b46d6..8679a9d6c47f 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -689,12 +689,10 @@ int fpregs_soft_set(struct task_struct *target, int fpregs_soft_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) + struct membuf to) { struct swregs_state *s387 = &target->thread.fpu.state.soft; const void *space = s387->st_space; - int ret; int offset = (S387->ftop & 7) * 10, other = 80 - offset; RE_ENTRANT_CHECK_OFF; @@ -709,18 +707,11 @@ int fpregs_soft_get(struct task_struct *target, S387->fos |= 0xffff0000; #endif /* PECULIAR_486 */ - 
ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, s387, 0, - offsetof(struct swregs_state, st_space)); - - /* Copy all registers in stack order. */ - if (!ret) - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - space + offset, 0, other); - if (!ret) - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - space, 0, offset); + membuf_write(&to, s387, offsetof(struct swregs_state, st_space)); + membuf_write(&to, space + offset, other); + membuf_write(&to, space, offset); RE_ENTRANT_CHECK_ON; - return ret; + return 0; } diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index f7fd0e868c9c..5864219221ca 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -22,10 +22,9 @@ obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \ obj-y += pat/ # Make sure __phys_addr has no stackprotector -nostackp := $(call cc-option, -fno-stack-protector) -CFLAGS_physaddr.o := $(nostackp) -CFLAGS_setup_nx.o := $(nostackp) -CFLAGS_mem_encrypt_identity.o := $(nostackp) +CFLAGS_physaddr.o := -fno-stack-protector +CFLAGS_setup_nx.o := -fno-stack-protector +CFLAGS_mem_encrypt_identity.o := -fno-stack-protector CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 1ead568c0101..35f1498e9832 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -21,7 +21,6 @@ #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/traps.h> /* dotraplinkage, ... */ -#include <asm/pgalloc.h> /* pgd_*(), ... */ #include <asm/fixmap.h> /* VSYSCALL_ADDR */ #include <asm/vsyscall.h> /* emulate_vsyscall */ #include <asm/vm86.h> /* struct vm86 */ @@ -1140,7 +1139,7 @@ void do_user_addr_fault(struct pt_regs *regs, struct vm_area_struct *vma; struct task_struct *tsk; struct mm_struct *mm; - vm_fault_t fault, major = 0; + vm_fault_t fault; unsigned int flags = FAULT_FLAG_DEFAULT; tsk = current; @@ -1292,8 +1291,7 @@ good_area: * userland). The return to userland is identified whenever * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. */ - fault = handle_mm_fault(vma, address, flags); - major |= fault & VM_FAULT_MAJOR; + fault = handle_mm_fault(vma, address, flags, regs); /* Quick path to respond to signals */ if (fault_signal_pending(fault, regs)) { @@ -1320,18 +1318,6 @@ good_area: return; } - /* - * Major/minor page fault accounting. If any of the events - * returned VM_FAULT_MAJOR, we account it as a major fault. - */ - if (major) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); - } - check_v8086_mode(regs, address, tsk); } NOKPROBE_SYMBOL(do_user_addr_fault); @@ -1377,7 +1363,7 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code, DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) { unsigned long address = read_cr2(); - bool rcu_exit; + irqentry_state_t state; prefetchw(¤t->mm->mmap_lock); @@ -1412,11 +1398,11 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) * code reenabled RCU to avoid subsequent wreckage which helps * debugability. 
*/ - rcu_exit = idtentry_enter_cond_rcu(regs); + state = irqentry_enter(regs); instrumentation_begin(); handle_page_fault(regs, error_code, address); instrumentation_end(); - idtentry_exit_cond_rcu(regs, rcu_exit); + irqentry_exit(regs, state); } diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index cf5781142716..a0d023cb4292 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -17,7 +17,6 @@ #include <asm/mman.h> #include <asm/tlb.h> #include <asm/tlbflush.h> -#include <asm/pgalloc.h> #include <asm/elf.h> #if 0 /* This is just for testing */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 001dd7dc829f..c7a47603537f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -25,6 +25,7 @@ #include <asm/cpufeature.h> #include <asm/pti.h> #include <asm/text-patching.h> +#include <asm/memtype.h> /* * We need to define the tracepoints somewhere, and tlb.c @@ -912,8 +913,6 @@ void free_kernel_image_pages(const char *what, void *begin, void *end) set_memory_np_noalias(begin_ul, len_pages); } -void __weak mem_encrypt_free_decrypted_mem(void) { } - void __ref free_initmem(void) { e820__reallocate_tables(); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8b4afad84f4a..7c055259de3a 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -52,6 +52,7 @@ #include <asm/cpu_entry_area.h> #include <asm/init.h> #include <asm/pgtable_areas.h> +#include <asm/numa.h> #include "mm_internal.h" @@ -678,7 +679,6 @@ void __init initmem_init(void) #endif memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0); - sparse_memory_present_with_active_regions(0); #ifdef CONFIG_FLATMEM max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? highend_pfn : max_low_pfn; @@ -718,7 +718,6 @@ void __init paging_init(void) * NOTE: at this point the bootmem allocator is fully available. */ olpc_dt_build_devicetree(); - sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); zone_sizes_init(); } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index dbae185511cd..a4ac13cc3fdc 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -209,7 +209,7 @@ static void sync_global_pgds_l4(unsigned long start, unsigned long end) * When memory was added make sure all the processes MM have * suitable PGD entries in the local PGD level page. */ -void sync_global_pgds(unsigned long start, unsigned long end) +static void sync_global_pgds(unsigned long start, unsigned long end) { if (pgtable_l5_enabled()) sync_global_pgds_l5(start, end); @@ -817,7 +817,6 @@ void __init initmem_init(void) void __init paging_init(void) { - sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); /* @@ -1238,6 +1237,51 @@ static void __init register_page_bootmem_info(void) #endif } +/* + * Pre-allocates page-table pages for the vmalloc area in the kernel page-table. + * Only the level which needs to be synchronized between all page-tables is + * allocated because the synchronization can be expensive. + */ +static void __init preallocate_vmalloc_pages(void) +{ + unsigned long addr; + const char *lvl; + + for (addr = VMALLOC_START; addr <= VMALLOC_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) { + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + + lvl = "p4d"; + p4d = p4d_alloc(&init_mm, pgd, addr); + if (!p4d) + goto failed; + + /* + * With 5-level paging the P4D level is not folded. So the PGDs + * are now populated and there is no need to walk down to the + * PUD level. 
+ */ + if (pgtable_l5_enabled()) + continue; + + lvl = "pud"; + pud = pud_alloc(&init_mm, p4d, addr); + if (!pud) + goto failed; + } + + return; + +failed: + + /* + * The pages have to be there now or they will be missing in + * process page-tables later. + */ + panic("Failed to pre-allocate %s pages for vmalloc area\n", lvl); +} + void __init mem_init(void) { pci_iommu_alloc(); @@ -1261,6 +1305,8 @@ void __init mem_init(void) if (get_gate_vma(&init_mm)) kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER); + preallocate_vmalloc_pages(); + mem_init_print_info(NULL); } @@ -1406,6 +1452,15 @@ static unsigned long probe_memory_block_size(void) goto done; } + /* + * Use max block size to minimize overhead on bare metal, where + * alignment for memory hotplug isn't a concern. + */ + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + bz = MAX_BLOCK_SIZE; + goto done; + } + /* Find the largest allowed block size that aligns to memory end */ for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) { if (IS_ALIGNED(boot_mem_end, bz)) @@ -1463,10 +1518,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, if (pmd_none(*pmd)) { void *p; - if (altmap) - p = altmap_alloc_block_buf(PMD_SIZE, altmap); - else - p = vmemmap_alloc_block_buf(PMD_SIZE, node); + p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); if (p) { pte_t entry; @@ -1493,7 +1545,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, vmemmap_verify((pte_t *)pmd, node, addr, next); continue; } - if (vmemmap_populate_basepages(addr, next, node)) + if (vmemmap_populate_basepages(addr, next, node, NULL)) return -ENOMEM; } return 0; @@ -1505,7 +1557,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, int err; if (end - start < PAGES_PER_SECTION * sizeof(struct page)) - err = vmemmap_populate_basepages(start, end, node); + err = vmemmap_populate_basepages(start, end, node, NULL); else if (boot_cpu_has(X86_FEATURE_PSE)) err = vmemmap_populate_hugepages(start, end, node, altmap); else if (altmap) { @@ -1513,7 +1565,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, __func__); err = -ENOMEM; } else - err = vmemmap_populate_basepages(start, end, node); + err = vmemmap_populate_basepages(start, end, node, NULL); if (!err) sync_global_pgds(start, end - 1); return err; diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index fb620fd9dae9..6e6b39710e5f 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -26,7 +26,6 @@ #include <linux/memblock.h> #include <linux/pgtable.h> -#include <asm/pgalloc.h> #include <asm/setup.h> #include <asm/kaslr.h> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 4a781cf99e92..9f1177edc2e7 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -376,7 +376,6 @@ bool force_dma_unencrypted(struct device *dev) return false; } -/* Architecture __weak replacement functions */ void __init mem_encrypt_free_decrypted_mem(void) { unsigned long vaddr, vaddr_end, npages; @@ -401,6 +400,7 @@ void __init mem_encrypt_free_decrypted_mem(void) free_init_pages("unused decrypted", vaddr, vaddr_end); } +/* Architecture __weak replacement functions */ void __init mem_encrypt_init(void) { if (!sme_me_mask) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 8ee952038c80..aa76ec2d359b 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -543,7 +543,6 @@ static void __init numa_clear_kernel_node_hotplug(void) static int __init 
numa_register_memblks(struct numa_meminfo *mi) { - unsigned long uninitialized_var(pfn_align); int i, nid; /* Account for nodes with cpus and no memory */ @@ -571,15 +570,16 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) * If sections array is gonna be used for pfn -> nid mapping, check * whether its granularity is fine enough. */ -#ifdef NODE_NOT_IN_PAGE_FLAGS - pfn_align = node_map_pfn_alignment(); - if (pfn_align && pfn_align < PAGES_PER_SECTION) { - printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n", - PFN_PHYS(pfn_align) >> 20, - PFN_PHYS(PAGES_PER_SECTION) >> 20); - return -EINVAL; + if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) { + unsigned long pfn_align = node_map_pfn_alignment(); + + if (pfn_align && pfn_align < PAGES_PER_SECTION) { + pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n", + PFN_PHYS(pfn_align) >> 20, + PFN_PHYS(PAGES_PER_SECTION) >> 20); + return -EINVAL; + } } -#endif if (!numa_meminfo_cover_memory(mi)) return -EINVAL; @@ -929,5 +929,4 @@ int memory_add_physaddr_to_nid(u64 start) nid = numa_meminfo.blk[0].nid; return nid; } -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 77e04304a2a7..d1b2a889f035 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -135,7 +135,7 @@ static inline void cpa_inc_2m_checked(void) static inline void cpa_inc_4k_install(void) { - cpa_4k_install++; + data_race(cpa_4k_install++); } static inline void cpa_inc_lp_sameprot(int level) diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 1953685c2ddf..c234634e26ba 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -11,7 +11,6 @@ #include <linux/spinlock.h> #include <asm/cpu_entry_area.h> -#include <asm/pgalloc.h> #include <asm/fixmap.h> #include <asm/e820/api.h> #include <asm/tlb.h> diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index a8a924b3c335..1aab92930569 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -34,7 +34,6 @@ #include <asm/vsyscall.h> #include <asm/cmdline.h> #include <asm/pti.h> -#include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/desc.h> #include <asm/sections.h> diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 0c67a5a94de3..b8c9a5b87f37 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -557,12 +557,12 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_z * Device [8086:2fc0] * Erratum HSE43 * CONFIG_TDP_NOMINAL CSR Implemented at Incorrect Offset - * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-spec-update.html + * https://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-spec-update.html * * Devices [8086:6f60,6fa0,6fc0] * Erratum BDF2 * PCI BARs in the Home Agent Will Return Non-Zero Values During Enumeration - * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html + * https://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html */ static void pci_invalid_bar(struct pci_dev *dev) { diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index e3f1ca316068..9f9aad42ccff 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -62,7 +62,7 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev) #ifdef CONFIG_ACPI static int xen_register_pirq(u32 gsi, int triggering, bool set_pirq) { - int rc, pirq = -1, irq = -1; + int rc, pirq = -1, irq; struct physdev_map_pirq map_irq; int shareable = 0; char *name; 
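The kvm/x86.c hunks earlier in this diff replace the per-call-site ignore_msrs handling in kvm_set_msr_common()/kvm_get_msr_common() with a single KVM_MSR_RET_INVALID sentinel that the new kvm_{get,set}_msr_ignored_check() wrappers translate back into either a silently "ignored" access or a hard error. What follows is a minimal, hedged userspace sketch of that error-laundering pattern only; raw_get_msr(), get_msr_ignored_check() and the 0x10 "known" index are illustrative stand-ins, not kernel symbols.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MSR_RET_INVALID 2   /* sentinel: "MSR not handled", mirrors KVM_MSR_RET_INVALID */

static bool ignore_msrs = true;        /* stand-in for the module parameter */
static bool report_ignored_msrs = true;

/* Low-level accessor: knows one MSR, returns the sentinel for everything else. */
static int raw_get_msr(uint32_t index, uint64_t *data)
{
	if (index == 0x10) {               /* hypothetical "known" MSR */
		*data = 42;
		return 0;
	}
	return MSR_RET_INVALID;
}

/*
 * Single place where the policy lives: either report and pretend the read
 * succeeded (returning 0 with *data cleared), or turn the sentinel into an
 * error the caller can act on.
 */
static int get_msr_ignored_check(uint32_t index, uint64_t *data)
{
	int ret = raw_get_msr(index, data);

	if (ret == MSR_RET_INVALID) {
		if (!ignore_msrs)
			return 1;                      /* caller would inject #GP */
		if (report_ignored_msrs)
			fprintf(stderr, "ignored rdmsr: 0x%x\n", index);
		*data = 0;                             /* unconditionally clear, as in the patch */
		ret = 0;
	}
	return ret;
}

int main(void)
{
	uint64_t v;

	printf("known MSR   -> ret=%d, v=%llu\n",
	       get_msr_ignored_check(0x10, &v), (unsigned long long)v);
	printf("unknown MSR -> ret=%d, v=%llu\n",
	       get_msr_ignored_check(0x123, &v), (unsigned long long)v);
	return 0;
}

Keeping the ignore_msrs policy in one wrapper means every caller, host-initiated or guest-initiated, gets identical behaviour for unknown MSRs instead of open-coding the reporting logic at each site, which is the point of the conversion above.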
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index e966115d105c..f6ea8f1a9d57 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -496,7 +496,7 @@ void __init efi_init(void) efi_print_memmap(); } -#if defined(CONFIG_X86_32) || defined(CONFIG_X86_UV) +#if defined(CONFIG_X86_32) void __init efi_set_executable(efi_memory_desc_t *md, bool executable) { @@ -648,7 +648,7 @@ static inline void *efi_map_next_entry_reverse(void *entry) */ static void *efi_map_next_entry(void *entry) { - if (!efi_have_uv1_memmap() && efi_enabled(EFI_64BIT)) { + if (efi_enabled(EFI_64BIT)) { /* * Starting in UEFI v2.5 the EFI_PROPERTIES_TABLE * config table feature requires us to map all entries @@ -777,11 +777,9 @@ static void __init kexec_enter_virtual_mode(void) /* * We don't do virtual mode, since we don't do runtime services, on - * non-native EFI. With the UV1 memmap, we don't do runtime services in - * kexec kernel because in the initial boot something else might - * have been mapped at these virtual addresses. + * non-native EFI. */ - if (efi_is_mixed() || efi_have_uv1_memmap()) { + if (efi_is_mixed()) { efi_memmap_unmap(); clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; @@ -832,12 +830,6 @@ static void __init kexec_enter_virtual_mode(void) * has the runtime attribute bit set in its memory descriptor into the * efi_pgd page table. * - * The old method which used to update that memory descriptor with the - * virtual address obtained from ioremap() is still supported when the - * kernel is booted on SG1 UV1 hardware. Same old method enabled the - * runtime services to be called without having to thunk back into - * physical mode for every invocation. - * * The new method does a pagetable switch in a preemption-safe manner * so that we're in a different address space when calling a runtime * function. For function arguments passing we do copy the PUDs of the diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 8e364c4c6768..413583f904a6 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -74,9 +74,6 @@ int __init efi_alloc_page_tables(void) pud_t *pud; gfp_t gfp_mask; - if (efi_have_uv1_memmap()) - return 0; - gfp_mask = GFP_KERNEL | __GFP_ZERO; efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER); if (!efi_pgd) @@ -115,9 +112,6 @@ void efi_sync_low_kernel_mappings(void) pud_t *pud_k, *pud_efi; pgd_t *efi_pgd = efi_mm.pgd; - if (efi_have_uv1_memmap()) - return; - /* * We can share all PGD entries apart from the one entry that * covers the EFI runtime mapping space. @@ -206,9 +200,6 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) unsigned npages; pgd_t *pgd = efi_mm.pgd; - if (efi_have_uv1_memmap()) - return 0; - /* * It can happen that the physical address of new_memmap lands in memory * which is not mapped in the EFI page table. 
Therefore we need to go @@ -315,9 +306,6 @@ void __init efi_map_region(efi_memory_desc_t *md) unsigned long size = md->num_pages << PAGE_SHIFT; u64 pa = md->phys_addr; - if (efi_have_uv1_memmap()) - return old_map_region(md); - /* * Make sure the 1:1 mappings are present as a catch-all for b0rked * firmware which doesn't update all internal pointers after switching @@ -420,12 +408,6 @@ void __init efi_runtime_update_mappings(void) { efi_memory_desc_t *md; - if (efi_have_uv1_memmap()) { - if (__supported_pte_mask & _PAGE_NX) - runtime_code_page_mkexec(); - return; - } - /* * Use the EFI Memory Attribute Table for mapping permissions if it * exists, since it is intended to supersede EFI_PROPERTIES_TABLE. @@ -474,10 +456,7 @@ void __init efi_runtime_update_mappings(void) void __init efi_dump_pagetable(void) { #ifdef CONFIG_EFI_PGT_DUMP - if (efi_have_uv1_memmap()) - ptdump_walk_pgd_level(NULL, &init_mm); - else - ptdump_walk_pgd_level(NULL, &efi_mm); + ptdump_walk_pgd_level(NULL, &efi_mm); #endif } @@ -849,21 +828,13 @@ efi_set_virtual_address_map(unsigned long memory_map_size, const efi_system_table_t *systab = (efi_system_table_t *)systab_phys; efi_status_t status; unsigned long flags; - pgd_t *save_pgd = NULL; if (efi_is_mixed()) return efi_thunk_set_virtual_address_map(memory_map_size, descriptor_size, descriptor_version, virtual_map); - - if (efi_have_uv1_memmap()) { - save_pgd = efi_uv1_memmap_phys_prolog(); - if (!save_pgd) - return EFI_ABORTED; - } else { - efi_switch_mm(&efi_mm); - } + efi_switch_mm(&efi_mm); kernel_fpu_begin(); @@ -879,10 +850,7 @@ efi_set_virtual_address_map(unsigned long memory_map_size, /* grab the virtually remapped EFI runtime services table pointer */ efi.runtime = READ_ONCE(systab->runtime); - if (save_pgd) - efi_uv1_memmap_phys_epilog(save_pgd); - else - efi_switch_mm(efi_scratch.prev_mm); + efi_switch_mm(efi_scratch.prev_mm); return status; } diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index a5a469cdf5bf..5a40fe411ebd 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -381,14 +381,6 @@ static void __init efi_unmap_pages(efi_memory_desc_t *md) u64 va = md->virt_addr; /* - * To Do: Remove this check after adding functionality to unmap EFI boot - * services code/data regions from direct mapping area because the UV1 - * memory map maps EFI regions in swapper_pg_dir. - */ - if (efi_have_uv1_memmap()) - return; - - /* * EFI mixed mode has all RAM mapped to access arguments while making * EFI runtime calls, hence don't unmap EFI boot services code/data * regions. @@ -558,16 +550,6 @@ out: return ret; } -static const struct dmi_system_id sgi_uv1_dmi[] __initconst = { - { NULL, "SGI UV1", - { DMI_MATCH(DMI_PRODUCT_NAME, "Stoutland Platform"), - DMI_MATCH(DMI_PRODUCT_VERSION, "1.0"), - DMI_MATCH(DMI_BIOS_VENDOR, "SGI.COM"), - } - }, - { } /* NULL entry stops DMI scanning */ -}; - void __init efi_apply_memmap_quirks(void) { /* @@ -579,17 +561,6 @@ void __init efi_apply_memmap_quirks(void) pr_info("Setup done, disabling due to 32/64-bit mismatch\n"); efi_memmap_unmap(); } - - /* UV2+ BIOS has a fix for this issue. UV1 still needs the quirk. 
*/ - if (dmi_check_system(sgi_uv1_dmi)) { - if (IS_ENABLED(CONFIG_X86_UV)) { - set_bit(EFI_UV1_MEMMAP, &efi.flags); - } else { - pr_warn("EFI runtime disabled, needs CONFIG_X86_UV=y on UV1\n"); - clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); - efi_memmap_unmap(); - } - } } /* @@ -723,8 +694,6 @@ void efi_recover_from_page_fault(unsigned long phys_addr) /* * Make sure that an efi runtime service caused the page fault. - * "efi_mm" cannot be used to check if the page fault had occurred - * in the firmware context because the UV1 memmap doesn't use efi_pgd. */ if (efi_rts_work.efi_rts_id == EFI_NONE) return; diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 4494589a288a..a2f447dffea6 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -11,6 +11,7 @@ #include <linux/slab.h> #include <asm/efi.h> #include <linux/io.h> +#include <asm/pgalloc.h> #include <asm/uv/bios.h> #include <asm/uv/uv_hub.h> @@ -30,17 +31,7 @@ static s64 __uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, */ return BIOS_STATUS_UNIMPLEMENTED; - /* - * If EFI_UV1_MEMMAP is set, we need to fall back to using our old EFI - * callback method, which uses efi_call() directly, with the kernel page tables: - */ - if (unlikely(efi_enabled(EFI_UV1_MEMMAP))) { - kernel_fpu_begin(); - ret = efi_call((void *)__va(tab->function), (u64)which, a1, a2, a3, a4, a5); - kernel_fpu_end(); - } else { - ret = efi_call_virt_pointer(tab, function, (u64)which, a1, a2, a3, a4, a5); - } + ret = efi_call_virt_pointer(tab, function, (u64)which, a1, a2, a3, a4, a5); return ret; } @@ -209,164 +200,3 @@ int uv_bios_init(void) pr_info("UV: UVsystab: Revision:%x\n", uv_systab->revision); return 0; } - -static void __init early_code_mapping_set_exec(int executable) -{ - efi_memory_desc_t *md; - - if (!(__supported_pte_mask & _PAGE_NX)) - return; - - /* Make EFI service code area executable */ - for_each_efi_memory_desc(md) { - if (md->type == EFI_RUNTIME_SERVICES_CODE || - md->type == EFI_BOOT_SERVICES_CODE) - efi_set_executable(md, executable); - } -} - -void __init efi_uv1_memmap_phys_epilog(pgd_t *save_pgd) -{ - /* - * After the lock is released, the original page table is restored. - */ - int pgd_idx, i; - int nr_pgds; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - - nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE); - - for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++) { - pgd = pgd_offset_k(pgd_idx * PGDIR_SIZE); - set_pgd(pgd_offset_k(pgd_idx * PGDIR_SIZE), save_pgd[pgd_idx]); - - if (!pgd_present(*pgd)) - continue; - - for (i = 0; i < PTRS_PER_P4D; i++) { - p4d = p4d_offset(pgd, - pgd_idx * PGDIR_SIZE + i * P4D_SIZE); - - if (!p4d_present(*p4d)) - continue; - - pud = (pud_t *)p4d_page_vaddr(*p4d); - pud_free(&init_mm, pud); - } - - p4d = (p4d_t *)pgd_page_vaddr(*pgd); - p4d_free(&init_mm, p4d); - } - - kfree(save_pgd); - - __flush_tlb_all(); - early_code_mapping_set_exec(0); -} - -pgd_t * __init efi_uv1_memmap_phys_prolog(void) -{ - unsigned long vaddr, addr_pgd, addr_p4d, addr_pud; - pgd_t *save_pgd, *pgd_k, *pgd_efi; - p4d_t *p4d, *p4d_k, *p4d_efi; - pud_t *pud; - - int pgd; - int n_pgds, i, j; - - early_code_mapping_set_exec(1); - - n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE); - save_pgd = kmalloc_array(n_pgds, sizeof(*save_pgd), GFP_KERNEL); - if (!save_pgd) - return NULL; - - /* - * Build 1:1 identity mapping for UV1 memmap usage. Note that - * PAGE_OFFSET is PGDIR_SIZE aligned when KASLR is disabled, while - * it is PUD_SIZE ALIGNED with KASLR enabled. 
So for a given physical - * address X, the pud_index(X) != pud_index(__va(X)), we can only copy - * PUD entry of __va(X) to fill in pud entry of X to build 1:1 mapping. - * This means here we can only reuse the PMD tables of the direct mapping. - */ - for (pgd = 0; pgd < n_pgds; pgd++) { - addr_pgd = (unsigned long)(pgd * PGDIR_SIZE); - vaddr = (unsigned long)__va(pgd * PGDIR_SIZE); - pgd_efi = pgd_offset_k(addr_pgd); - save_pgd[pgd] = *pgd_efi; - - p4d = p4d_alloc(&init_mm, pgd_efi, addr_pgd); - if (!p4d) { - pr_err("Failed to allocate p4d table!\n"); - goto out; - } - - for (i = 0; i < PTRS_PER_P4D; i++) { - addr_p4d = addr_pgd + i * P4D_SIZE; - p4d_efi = p4d + p4d_index(addr_p4d); - - pud = pud_alloc(&init_mm, p4d_efi, addr_p4d); - if (!pud) { - pr_err("Failed to allocate pud table!\n"); - goto out; - } - - for (j = 0; j < PTRS_PER_PUD; j++) { - addr_pud = addr_p4d + j * PUD_SIZE; - - if (addr_pud > (max_pfn << PAGE_SHIFT)) - break; - - vaddr = (unsigned long)__va(addr_pud); - - pgd_k = pgd_offset_k(vaddr); - p4d_k = p4d_offset(pgd_k, vaddr); - pud[j] = *pud_offset(p4d_k, vaddr); - } - } - pgd_offset_k(pgd * PGDIR_SIZE)->pgd &= ~_PAGE_NX; - } - - __flush_tlb_all(); - return save_pgd; -out: - efi_uv1_memmap_phys_epilog(save_pgd); - return NULL; -} - -void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, - u32 type, u64 attribute) -{ - unsigned long last_map_pfn; - - if (type == EFI_MEMORY_MAPPED_IO) - return ioremap(phys_addr, size); - - last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size, - PAGE_KERNEL); - if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) { - unsigned long top = last_map_pfn << PAGE_SHIFT; - efi_ioremap(top, size - (top - phys_addr), type, attribute); - } - - if (!(attribute & EFI_MEMORY_WB)) - efi_memory_uc((u64)(unsigned long)__va(phys_addr), size); - - return (void __iomem *)__va(phys_addr); -} - -static int __init arch_parse_efi_cmdline(char *str) -{ - if (!str) { - pr_warn("need at least one option\n"); - return -EINVAL; - } - - if (!efi_is_mixed() && parse_option_str(str, "old_map")) - set_bit(EFI_UV1_MEMMAP, &efi.flags); - - return 0; -} -early_param("efi", arch_parse_efi_cmdline); diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 0ac96ca304c7..62ea907668f8 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -23,18 +23,6 @@ static struct bau_operations ops __ro_after_init; -/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ -static const int timeout_base_ns[] = { - 20, - 160, - 1280, - 10240, - 81920, - 655360, - 5242880, - 167772160 -}; - static int timeout_us; static bool nobau = true; static int nobau_perm; @@ -510,70 +498,6 @@ static inline void end_uvhub_quiesce(struct bau_control *hmaster) atom_asr(-1, (struct atomic_short *)&hmaster->uvhub_quiesce); } -static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift) -{ - unsigned long descriptor_status; - - descriptor_status = uv_read_local_mmr(mmr_offset); - descriptor_status >>= right_shift; - descriptor_status &= UV_ACT_STATUS_MASK; - return descriptor_status; -} - -/* - * Wait for completion of a broadcast software ack message - * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP - */ -static int uv1_wait_completion(struct bau_desc *bau_desc, - struct bau_control *bcp, long try) -{ - unsigned long descriptor_status; - cycles_t ttm; - u64 mmr_offset = bcp->status_mmr; - int right_shift = bcp->status_index; - struct ptc_stats *stat = bcp->statp; - - 
descriptor_status = uv1_read_status(mmr_offset, right_shift); - /* spin on the status MMR, waiting for it to go idle */ - while ((descriptor_status != DS_IDLE)) { - /* - * Our software ack messages may be blocked because - * there are no swack resources available. As long - * as none of them has timed out hardware will NACK - * our message and its state will stay IDLE. - */ - if (descriptor_status == DS_SOURCE_TIMEOUT) { - stat->s_stimeout++; - return FLUSH_GIVEUP; - } else if (descriptor_status == DS_DESTINATION_TIMEOUT) { - stat->s_dtimeout++; - ttm = get_cycles(); - - /* - * Our retries may be blocked by all destination - * swack resources being consumed, and a timeout - * pending. In that case hardware returns the - * ERROR that looks like a destination timeout. - */ - if (cycles_2_us(ttm - bcp->send_message) < timeout_us) { - bcp->conseccompletes = 0; - return FLUSH_RETRY_PLUGGED; - } - - bcp->conseccompletes = 0; - return FLUSH_RETRY_TIMEOUT; - } else { - /* - * descriptor_status is still BUSY - */ - cpu_relax(); - } - descriptor_status = uv1_read_status(mmr_offset, right_shift); - } - bcp->conseccompletes++; - return FLUSH_COMPLETE; -} - /* * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register. * But not currently used. @@ -853,24 +777,6 @@ static void record_send_stats(cycles_t time1, cycles_t time2, } /* - * Because of a uv1 hardware bug only a limited number of concurrent - * requests can be made. - */ -static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat) -{ - spinlock_t *lock = &hmaster->uvhub_lock; - atomic_t *v; - - v = &hmaster->active_descriptor_count; - if (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)) { - stat->s_throttles++; - do { - cpu_relax(); - } while (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)); - } -} - -/* * Handle the completion status of a message send. 
*/ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc, @@ -899,50 +805,30 @@ static int uv_flush_send_and_wait(struct cpumask *flush_mask, { int seq_number = 0; int completion_stat = 0; - int uv1 = 0; long try = 0; unsigned long index; cycles_t time1; cycles_t time2; struct ptc_stats *stat = bcp->statp; struct bau_control *hmaster = bcp->uvhub_master; - struct uv1_bau_msg_header *uv1_hdr = NULL; struct uv2_3_bau_msg_header *uv2_3_hdr = NULL; - if (bcp->uvhub_version == UV_BAU_V1) { - uv1 = 1; - uv1_throttle(hmaster, stat); - } - while (hmaster->uvhub_quiesce) cpu_relax(); time1 = get_cycles(); - if (uv1) - uv1_hdr = &bau_desc->header.uv1_hdr; - else - /* uv2 and uv3 */ - uv2_3_hdr = &bau_desc->header.uv2_3_hdr; + uv2_3_hdr = &bau_desc->header.uv2_3_hdr; do { if (try == 0) { - if (uv1) - uv1_hdr->msg_type = MSG_REGULAR; - else - uv2_3_hdr->msg_type = MSG_REGULAR; + uv2_3_hdr->msg_type = MSG_REGULAR; seq_number = bcp->message_number++; } else { - if (uv1) - uv1_hdr->msg_type = MSG_RETRY; - else - uv2_3_hdr->msg_type = MSG_RETRY; + uv2_3_hdr->msg_type = MSG_RETRY; stat->s_retry_messages++; } - if (uv1) - uv1_hdr->sequence = seq_number; - else - uv2_3_hdr->sequence = seq_number; + uv2_3_hdr->sequence = seq_number; index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu; bcp->send_message = get_cycles(); @@ -1162,11 +1048,10 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, address = TLB_FLUSH_ALL; switch (bcp->uvhub_version) { - case UV_BAU_V1: case UV_BAU_V2: case UV_BAU_V3: - bau_desc->payload.uv1_2_3.address = address; - bau_desc->payload.uv1_2_3.sending_cpu = cpu; + bau_desc->payload.uv2_3.address = address; + bau_desc->payload.uv2_3.sending_cpu = cpu; break; case UV_BAU_V4: bau_desc->payload.uv4.address = address; @@ -1300,7 +1185,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_uv_bau_message) if (bcp->uvhub_version == UV_BAU_V2) process_uv2_message(&msgdesc, bcp); else - /* no error workaround for uv1 or uv3 */ + /* no error workaround for uv3 */ bau_process_message(&msgdesc, bcp, 1); msg++; @@ -1350,12 +1235,7 @@ static void __init enable_timeouts(void) mmr_image &= ~((unsigned long)0xf << SOFTACK_PSHIFT); mmr_image |= (SOFTACK_TIMEOUT_PERIOD << SOFTACK_PSHIFT); write_mmr_misc_control(pnode, mmr_image); - /* - * UV1: - * Subsequent reversals of the timebase bit (3) cause an - * immediate timeout of one or all INTD resources as - * indicated in bits 2:0 (7 causes all of them to timeout). 
- */ + mmr_image |= (1L << SOFTACK_MSHIFT); if (is_uv2_hub()) { /* do not touch the legacy mode bit */ @@ -1711,14 +1591,12 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) { int i; int cpu; - int uv1 = 0; unsigned long gpa; unsigned long m; unsigned long n; size_t dsize; struct bau_desc *bau_desc; struct bau_desc *bd2; - struct uv1_bau_msg_header *uv1_hdr; struct uv2_3_bau_msg_header *uv2_3_hdr; struct bau_control *bcp; @@ -1733,8 +1611,6 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) gpa = uv_gpa(bau_desc); n = uv_gpa_to_gnode(gpa); m = ops.bau_gpa_to_offset(gpa); - if (is_uv1_hub()) - uv1 = 1; /* the 14-bit pnode */ write_mmr_descriptor_base(pnode, @@ -1746,37 +1622,15 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) */ for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) { memset(bd2, 0, sizeof(struct bau_desc)); - if (uv1) { - uv1_hdr = &bd2->header.uv1_hdr; - uv1_hdr->swack_flag = 1; - /* - * The base_dest_nasid set in the message header - * is the nasid of the first uvhub in the partition. - * The bit map will indicate destination pnode numbers - * relative to that base. They may not be consecutive - * if nasid striding is being used. - */ - uv1_hdr->base_dest_nasid = - UV_PNODE_TO_NASID(base_pnode); - uv1_hdr->dest_subnodeid = UV_LB_SUBNODEID; - uv1_hdr->command = UV_NET_ENDPOINT_INTD; - uv1_hdr->int_both = 1; - /* - * all others need to be set to zero: - * fairness chaining multilevel count replied_to - */ - } else { - /* - * BIOS uses legacy mode, but uv2 and uv3 hardware always - * uses native mode for selective broadcasts. - */ - uv2_3_hdr = &bd2->header.uv2_3_hdr; - uv2_3_hdr->swack_flag = 1; - uv2_3_hdr->base_dest_nasid = - UV_PNODE_TO_NASID(base_pnode); - uv2_3_hdr->dest_subnodeid = UV_LB_SUBNODEID; - uv2_3_hdr->command = UV_NET_ENDPOINT_INTD; - } + /* + * BIOS uses legacy mode, but uv2 and uv3 hardware always + * uses native mode for selective broadcasts. + */ + uv2_3_hdr = &bd2->header.uv2_3_hdr; + uv2_3_hdr->swack_flag = 1; + uv2_3_hdr->base_dest_nasid = UV_PNODE_TO_NASID(base_pnode); + uv2_3_hdr->dest_subnodeid = UV_LB_SUBNODEID; + uv2_3_hdr->command = UV_NET_ENDPOINT_INTD; } for_each_present_cpu(cpu) { if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu))) @@ -1861,7 +1715,7 @@ static void __init init_uvhub(int uvhub, int vector, int base_pnode) * The below initialization can't be in firmware because the * messaging IRQ will be determined by the OS. 
*/ - apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits; + apicid = uvhub_to_first_apicid(uvhub); write_mmr_data_config(pnode, ((apicid << 32) | vector)); } @@ -1874,33 +1728,20 @@ static int calculate_destination_timeout(void) { unsigned long mmr_image; int mult1; - int mult2; - int index; int base; int ret; - unsigned long ts_ns; - - if (is_uv1_hub()) { - mult1 = SOFTACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK; - mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); - index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; - mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); - mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; - ts_ns = timeout_base_ns[index]; - ts_ns *= (mult1 * mult2); - ret = ts_ns / 1000; - } else { - /* same destination timeout for uv2 and uv3 */ - /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */ - mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL); - mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT; - if (mmr_image & (1L << UV2_ACK_UNITS_SHFT)) - base = 80; - else - base = 10; - mult1 = mmr_image & UV2_ACK_MASK; - ret = mult1 * base; - } + + /* same destination timeout for uv2 and uv3 */ + /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */ + mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL); + mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT; + if (mmr_image & (1L << UV2_ACK_UNITS_SHFT)) + base = 80; + else + base = 10; + mult1 = mmr_image & UV2_ACK_MASK; + ret = mult1 * base; + return ret; } @@ -2039,9 +1880,7 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, bcp->cpus_in_socket = sdp->num_cpus; bcp->socket_master = *smasterp; bcp->uvhub = bdp->uvhub; - if (is_uv1_hub()) - bcp->uvhub_version = UV_BAU_V1; - else if (is_uv2_hub()) + if (is_uv2_hub()) bcp->uvhub_version = UV_BAU_V2; else if (is_uv3_hub()) bcp->uvhub_version = UV_BAU_V3; @@ -2123,7 +1962,7 @@ static int __init init_per_cpu(int nuvhubs, int base_part_pnode) struct uvhub_desc *uvhub_descs; unsigned char *uvhub_mask = NULL; - if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub()) + if (is_uv3_hub() || is_uv2_hub()) timeout_us = calculate_destination_timeout(); uvhub_descs = kcalloc(nuvhubs, sizeof(struct uvhub_desc), GFP_KERNEL); @@ -2151,17 +1990,6 @@ fail: return 1; } -static const struct bau_operations uv1_bau_ops __initconst = { - .bau_gpa_to_offset = uv_gpa_to_offset, - .read_l_sw_ack = read_mmr_sw_ack, - .read_g_sw_ack = read_gmmr_sw_ack, - .write_l_sw_ack = write_mmr_sw_ack, - .write_g_sw_ack = write_gmmr_sw_ack, - .write_payload_first = write_mmr_payload_first, - .write_payload_last = write_mmr_payload_last, - .wait_completion = uv1_wait_completion, -}; - static const struct bau_operations uv2_3_bau_ops __initconst = { .bau_gpa_to_offset = uv_gpa_to_offset, .read_l_sw_ack = read_mmr_sw_ack, @@ -2206,8 +2034,6 @@ static int __init uv_bau_init(void) ops = uv2_3_bau_ops; else if (is_uv2_hub()) ops = uv2_3_bau_ops; - else if (is_uv1_hub()) - ops = uv1_bau_ops; nuvhubs = uv_num_possible_blades(); if (nuvhubs < 2) { @@ -2228,7 +2054,7 @@ static int __init uv_bau_init(void) } /* software timeouts are not supported on UV4 */ - if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub()) + if (is_uv3_hub() || is_uv2_hub()) enable_timeouts(); if (init_per_cpu(nuvhubs, uv_base_pnode)) { @@ -2251,8 +2077,7 @@ static int __init uv_bau_init(void) val = 1L << 63; write_gmmr_activation(pnode, val); mmr = 1; /* should be 1 to broadcast to both sockets */ - if (!is_uv1_hub()) - write_mmr_data_broadcast(pnode, mmr); + write_mmr_data_broadcast(pnode, 
mmr); } } diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 7af31b245636..f82a1337a608 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -74,7 +74,6 @@ static void uv_rtc_send_IPI(int cpu) apicid = cpu_physical_id(cpu); pnode = uv_apicid_to_pnode(apicid); - apicid |= uv_apicid_hibits; val = (1UL << UVH_IPI_INT_SEND_SHFT) | (apicid << UVH_IPI_INT_APIC_ID_SHFT) | (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT); @@ -85,10 +84,7 @@ static void uv_rtc_send_IPI(int cpu) /* Check for an RTC interrupt pending */ static int uv_intr_pending(int pnode) { - if (is_uv1_hub()) - return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & - UV1H_EVENT_OCCURRED0_RTC1_MASK; - else if (is_uvx_hub()) + if (is_uvx_hub()) return uv_read_global_mmr64(pnode, UVXH_EVENT_OCCURRED2) & UVXH_EVENT_OCCURRED2_RTC_1_MASK; return 0; @@ -98,19 +94,15 @@ static int uv_intr_pending(int pnode) static int uv_setup_intr(int cpu, u64 expires) { u64 val; - unsigned long apicid = cpu_physical_id(cpu) | uv_apicid_hibits; + unsigned long apicid = cpu_physical_id(cpu); int pnode = uv_cpu_to_pnode(cpu); uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, UVH_RTC1_INT_CONFIG_M_MASK); uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L); - if (is_uv1_hub()) - uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, - UV1H_EVENT_OCCURRED0_RTC1_MASK); - else - uv_write_global_mmr64(pnode, UVXH_EVENT_OCCURRED2_ALIAS, - UVXH_EVENT_OCCURRED2_RTC_1_MASK); + uv_write_global_mmr64(pnode, UVXH_EVENT_OCCURRED2_ALIAS, + UVXH_EVENT_OCCURRED2_RTC_1_MASK); val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | ((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); diff --git a/arch/x86/power/Makefile b/arch/x86/power/Makefile index 37923d715741..6907b523e856 100644 --- a/arch/x86/power/Makefile +++ b/arch/x86/power/Makefile @@ -3,8 +3,7 @@ OBJECT_FILES_NON_STANDARD_hibernate_asm_$(BITS).o := y # __restore_processor_state() restores %gs after S3 resume and so should not # itself be stack-protected -nostackp := $(call cc-option, -fno-stack-protector) -CFLAGS_cpu.o := $(nostackp) +CFLAGS_cpu.o := -fno-stack-protector obj-$(CONFIG_PM_SLEEP) += cpu.o obj-$(CONFIG_HIBERNATION) += hibernate_$(BITS).o hibernate_asm_$(BITS).o hibernate.o diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index d147f1b2c925..cd3914fc9f3d 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -98,7 +98,7 @@ static int get_e820_md5(struct e820_table *table, void *buf) if (crypto_shash_digest(desc, (u8 *)table, size, buf)) ret = -EINVAL; - kzfree(desc); + kfree_sensitive(desc); free_tfm: crypto_free_shash(tfm); diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 088bd764e0b7..183ac60e5990 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -34,7 +34,7 @@ KCOV_INSTRUMENT := n PURGATORY_CFLAGS_REMOVE := -mcmodel=kernel PURGATORY_CFLAGS := -mcmodel=large -ffreestanding -fno-zero-initialized-in-bss PURGATORY_CFLAGS += $(DISABLE_STACKLEAK_PLUGIN) -DDISABLE_BRANCH_PROFILING -PURGATORY_CFLAGS += $(call cc-option,-fno-stack-protector) +PURGATORY_CFLAGS += -fno-stack-protector # Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. 
That # in turn leaves some undefined symbols like __fentry__ in purgatory and not diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile index 0caddd6acb22..5943387e3f35 100644 --- a/arch/x86/um/vdso/Makefile +++ b/arch/x86/um/vdso/Makefile @@ -42,7 +42,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE # optimize sibling calls. # CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ - $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ + $(filter -g%,$(KBUILD_CFLAGS)) -fno-stack-protector \ -fno-omit-frame-pointer -foptimize-sibling-calls $(vobjs): KBUILD_CFLAGS += $(CFL) diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 084de77a109e..5f1db522d06b 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -9,9 +9,8 @@ CFLAGS_REMOVE_irq.o = -pg endif # Make sure early boot has no stackprotector -nostackp := $(call cc-option, -fno-stack-protector) -CFLAGS_enlighten_pv.o := $(nostackp) -CFLAGS_mmu_pv.o := $(nostackp) +CFLAGS_enlighten_pv.o := -fno-stack-protector +CFLAGS_mmu_pv.o := -fno-stack-protector obj-y += enlighten.o obj-y += mmu.o diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 5e53bfbe5823..1aff4ae65655 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -1,8 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> +#include <linux/thread_info.h> #include <asm/x86_init.h> #include <asm/apic.h> +#include <asm/io_apic.h> #include <asm/xen/hypercall.h> #include <xen/xen.h> diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 3e89b0067ff0..9e87ab010c82 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -11,6 +11,7 @@ #include <asm/cpu.h> #include <asm/smp.h> +#include <asm/io_apic.h> #include <asm/reboot.h> #include <asm/setup.h> #include <asm/idtentry.h> diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index c46b9f2e732f..2aab43a13a8c 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -873,7 +873,7 @@ static void xen_load_sp0(unsigned long sp0) static void xen_invalidate_io_bitmap(void) { struct physdev_set_iobitmap iobitmap = { - .bitmap = 0, + .bitmap = NULL, .nr_ports = 0, }; diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c index f8d39440b292..f5e7db4f82ab 100644 --- a/arch/x86/xen/smp_hvm.c +++ b/arch/x86/xen/smp_hvm.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/thread_info.h> #include <asm/smp.h> #include <xen/events.h> diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 171aff1b11f2..47c8f4b444c9 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -29,6 +29,7 @@ #include <asm/idtentry.h> #include <asm/desc.h> #include <asm/cpu.h> +#include <asm/io_apic.h> #include <xen/interface/xen.h> #include <xen/interface/vcpu.h> @@ -92,9 +93,7 @@ static void cpu_bringup(void) asmlinkage __visible void cpu_bringup_and_idle(void) { cpu_bringup(); - boot_init_stack_canary(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); - prevent_tail_call_optimization(); } void xen_smp_intr_free_pv(unsigned int cpu) diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 6deb49094c60..799f4eba0a62 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -114,9 +114,8 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen); */ void __init xen_init_spinlocks(void) { - /* Don't need to use pvqspinlock code if there is only 1 vCPU. 
*/ - if (num_possible_cpus() == 1) + if (num_possible_cpus() == 1 || nopvspin) xen_pvspin = false; if (!xen_pvspin) { @@ -137,6 +136,7 @@ void __init xen_init_spinlocks(void) static __init int xen_parse_nopvspin(char *arg) { + pr_notice("\"xen_nopvspin\" is deprecated, please use \"nopvspin\" instead\n"); xen_pvspin = false; return 0; } diff --git a/arch/x86/xen/suspend_pv.c b/arch/x86/xen/suspend_pv.c index 8303b58c79a9..cae9660f4c67 100644 --- a/arch/x86/xen/suspend_pv.c +++ b/arch/x86/xen/suspend_pv.c @@ -1,11 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/types.h> -#include <asm/fixmap.h> - #include <asm/xen/hypercall.h> #include <asm/xen/page.h> +#include <asm/fixmap.h> + #include "xen-ops.h" void xen_pv_pre_suspend(void) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index c8897aad13cd..91f5b330dcc6 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -39,6 +39,7 @@ static unsigned long xen_tsc_khz(void) struct pvclock_vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_info[0].time; + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); return pvclock_tsc_khz(info); } |
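The xen/spinlock.c hunk above keeps the old "xen_nopvspin" parameter working but prints a deprecation notice and lets the generic "nopvspin" switch disable paravirt spinlocks as well. Below is a small, hedged userspace sketch of that option-aliasing pattern; handle_param(), init_spinlocks() and the fixed vCPU count are illustrative stand-ins for the kernel's early_param machinery, not the actual implementation.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool nopvspin;              /* generic switch, shared by all guest types */
static bool xen_pvspin = true;

/* New, preferred parameter. */
static void parse_nopvspin(void)
{
	nopvspin = true;
}

/* Old parameter kept as an alias: warn, then behave like before. */
static void parse_xen_nopvspin(void)
{
	fprintf(stderr,
		"\"xen_nopvspin\" is deprecated, please use \"nopvspin\" instead\n");
	xen_pvspin = false;
}

static void handle_param(const char *arg)
{
	if (!strcmp(arg, "nopvspin"))
		parse_nopvspin();
	else if (!strcmp(arg, "xen_nopvspin"))
		parse_xen_nopvspin();
}

/* Mirrors the patched xen_init_spinlocks() decision. */
static void init_spinlocks(unsigned int ncpus)
{
	if (ncpus == 1 || nopvspin)
		xen_pvspin = false;
	printf("paravirt spinlocks %s\n", xen_pvspin ? "enabled" : "disabled");
}

int main(int argc, char **argv)
{
	for (int i = 1; i < argc; i++)
		handle_param(argv[i]);
	init_spinlocks(4);         /* pretend the guest has 4 vCPUs */
	return 0;
}

Run as "./a.out xen_nopvspin" and the sketch warns but still disables paravirt spinlocks; run it with "nopvspin" and it reaches the same result through the new switch, matching the patched behaviour where both spellings converge on the same xen_pvspin decision.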