diff options
Diffstat (limited to 'arch/x86')
266 files changed, 5497 insertions, 3539 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8fbd685dd984..dce10b18f4bc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -17,6 +17,7 @@ config X86_32 select HAVE_DEBUG_STACKOVERFLOW select MODULES_USE_ELF_REL select OLD_SIGACTION + select GENERIC_VDSO_32 config X86_64 def_bool y @@ -121,6 +122,7 @@ config X86 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL + select GENERIC_GETTIMEOFDAY select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 select HAVE_ACPI_APEI if ACPI select HAVE_ACPI_APEI_NMI if ACPI @@ -202,6 +204,7 @@ config X86 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER + select HAVE_GENERIC_VDSO select HOTPLUG_SMT if SMP select IRQ_FORCED_THREADING select NEED_SG_DMA_LENGTH @@ -217,6 +220,7 @@ config X86 select USER_STACKTRACE_SUPPORT select VIRT_TO_BUS select X86_FEATURE_NAMES if PROC_FS + select PROC_PID_ARCH_STATUS if PROC_FS config INSTRUCTION_DECODER def_bool y @@ -781,6 +785,9 @@ config PARAVIRT_SPINLOCKS If you are unsure how to answer this question, answer Y. +config X86_HV_CALLBACK_VECTOR + def_bool n + source "arch/x86/xen/Kconfig" config KVM_GUEST @@ -832,6 +839,17 @@ config JAILHOUSE_GUEST cell. You can leave this option disabled if you only want to start Jailhouse and run Linux afterwards in the root cell. +config ACRN_GUEST + bool "ACRN Guest support" + depends on X86_64 + select X86_HV_CALLBACK_VECTOR + help + This option allows to run Linux as guest in the ACRN hypervisor. ACRN is + a flexible, lightweight reference open-source hypervisor, built with + real-time and safety-criticality in mind. It is built for embedded + IOT with small footprint and real-time features. More details can be + found in https://projectacrn.org/. + endif #HYPERVISOR_GUEST source "arch/x86/Kconfig.cpu" @@ -2285,7 +2303,7 @@ config COMPAT_VDSO choice prompt "vsyscall table for legacy applications" depends on X86_64 - default LEGACY_VSYSCALL_EMULATE + default LEGACY_VSYSCALL_XONLY help Legacy user code that does not know how to find the vDSO expects to be able to issue three syscalls by calling fixed addresses in @@ -2293,23 +2311,38 @@ choice it can be used to assist security vulnerability exploitation. This setting can be changed at boot time via the kernel command - line parameter vsyscall=[emulate|none]. + line parameter vsyscall=[emulate|xonly|none]. On a system with recent enough glibc (2.14 or newer) and no static binaries, you can say None without a performance penalty to improve security. - If unsure, select "Emulate". + If unsure, select "Emulate execution only". config LEGACY_VSYSCALL_EMULATE - bool "Emulate" + bool "Full emulation" + help + The kernel traps and emulates calls into the fixed vsyscall + address mapping. This makes the mapping non-executable, but + it still contains readable known contents, which could be + used in certain rare security vulnerability exploits. This + configuration is recommended when using legacy userspace + that still uses vsyscalls along with legacy binary + instrumentation tools that require code to be readable. + + An example of this type of legacy userspace is running + Pin on an old binary that still uses vsyscalls. + + config LEGACY_VSYSCALL_XONLY + bool "Emulate execution only" help - The kernel traps and emulates calls into the fixed - vsyscall address mapping. This makes the mapping - non-executable, but it still contains known contents, - which could be used in certain rare security vulnerability - exploits. This configuration is recommended when userspace - still uses the vsyscall area. + The kernel traps and emulates calls into the fixed vsyscall + address mapping and does not allow reads. This + configuration is recommended when userspace might use the + legacy vsyscall area but support for legacy binary + instrumentation of legacy code is not needed. It mitigates + certain uses of the vsyscall area as an ASLR-bypassing + buffer. config LEGACY_VSYSCALL_NONE bool "None" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 6adce15268bd..8e29c991ba3e 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -480,3 +480,16 @@ config CPU_SUP_UMC_32 CPU might render the kernel unbootable. If unsure, say N. + +config CPU_SUP_ZHAOXIN + default y + bool "Support Zhaoxin processors" if PROCESSOR_SELECT + help + This enables detection, tunings and quirks for Zhaoxin processors + + You need this enabled if you want your kernel to run on a + Zhaoxin CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on a Zhaoxin + CPU might render the kernel unbootable. + + If unsure, say N. diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 59f598543203..71c92db47c41 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -179,26 +179,6 @@ config X86_DECODER_SELFTEST decoder code. If unsure, say "N". -# -# IO delay types: -# - -config IO_DELAY_TYPE_0X80 - int - default "0" - -config IO_DELAY_TYPE_0XED - int - default "1" - -config IO_DELAY_TYPE_UDELAY - int - default "2" - -config IO_DELAY_TYPE_NONE - int - default "3" - choice prompt "IO delay type" default IO_DELAY_0X80 @@ -229,30 +209,6 @@ config IO_DELAY_NONE endchoice -if IO_DELAY_0X80 -config DEFAULT_IO_DELAY_TYPE - int - default IO_DELAY_TYPE_0X80 -endif - -if IO_DELAY_0XED -config DEFAULT_IO_DELAY_TYPE - int - default IO_DELAY_TYPE_0XED -endif - -if IO_DELAY_UDELAY -config DEFAULT_IO_DELAY_TYPE - int - default IO_DELAY_TYPE_UDELAY -endif - -if IO_DELAY_NONE -config DEFAULT_IO_DELAY_TYPE - int - default IO_DELAY_TYPE_NONE -endif - config DEBUG_BOOT_PARAMS bool "Debug boot parameters" depends on DEBUG_KERNEL diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c index 64a31a6d751a..a2b6b428922a 100644 --- a/arch/x86/boot/a20.c +++ b/arch/x86/boot/a20.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007-2008 rPath, Inc. - All Rights Reserved * Copyright 2009 Intel Corporation; author H. Peter Anvin * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c index ee274834ea8b..b72fc10fc1be 100644 --- a/arch/x86/boot/apm.c +++ b/arch/x86/boot/apm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds @@ -7,9 +8,6 @@ * Original APM BIOS checking by Stephen Rothwell, May 1994 * (sfr@canb.auug.org.au) * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/bitops.h b/arch/x86/boot/bitops.h index 2e1382486e91..02e1dea11d94 100644 --- a/arch/x86/boot/bitops.h +++ b/arch/x86/boot/bitops.h @@ -1,11 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 32a09eb5c101..19eca14b49a0 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -1,12 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * Copyright 2009 Intel Corporation; author H. Peter Anvin * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c index 625d21b0cd3f..4ff01176c1cc 100644 --- a/arch/x86/boot/cmdline.c +++ b/arch/x86/boot/cmdline.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index ad84239e595e..15255f388a85 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -44,17 +44,109 @@ static acpi_physical_address get_acpi_rsdp(void) return addr; } -/* Search EFI system tables for RSDP. */ -static acpi_physical_address efi_get_rsdp_addr(void) +/* + * Search EFI system tables for RSDP. If both ACPI_20_TABLE_GUID and + * ACPI_TABLE_GUID are found, take the former, which has more features. + */ +static acpi_physical_address +__efi_get_rsdp_addr(unsigned long config_tables, unsigned int nr_tables, + bool efi_64) { acpi_physical_address rsdp_addr = 0; #ifdef CONFIG_EFI - unsigned long systab, systab_tables, config_tables; + int i; + + /* Get EFI tables from systab. */ + for (i = 0; i < nr_tables; i++) { + acpi_physical_address table; + efi_guid_t guid; + + if (efi_64) { + efi_config_table_64_t *tbl = (efi_config_table_64_t *)config_tables + i; + + guid = tbl->guid; + table = tbl->table; + + if (!IS_ENABLED(CONFIG_X86_64) && table >> 32) { + debug_putstr("Error getting RSDP address: EFI config table located above 4GB.\n"); + return 0; + } + } else { + efi_config_table_32_t *tbl = (efi_config_table_32_t *)config_tables + i; + + guid = tbl->guid; + table = tbl->table; + } + + if (!(efi_guidcmp(guid, ACPI_TABLE_GUID))) + rsdp_addr = table; + else if (!(efi_guidcmp(guid, ACPI_20_TABLE_GUID))) + return table; + } +#endif + return rsdp_addr; +} + +/* EFI/kexec support is 64-bit only. */ +#ifdef CONFIG_X86_64 +static struct efi_setup_data *get_kexec_setup_data_addr(void) +{ + struct setup_data *data; + u64 pa_data; + + pa_data = boot_params->hdr.setup_data; + while (pa_data) { + data = (struct setup_data *)pa_data; + if (data->type == SETUP_EFI) + return (struct efi_setup_data *)(pa_data + sizeof(struct setup_data)); + + pa_data = data->next; + } + return NULL; +} + +static acpi_physical_address kexec_get_rsdp_addr(void) +{ + efi_system_table_64_t *systab; + struct efi_setup_data *esd; + struct efi_info *ei; + char *sig; + + esd = (struct efi_setup_data *)get_kexec_setup_data_addr(); + if (!esd) + return 0; + + if (!esd->tables) { + debug_putstr("Wrong kexec SETUP_EFI data.\n"); + return 0; + } + + ei = &boot_params->efi_info; + sig = (char *)&ei->efi_loader_signature; + if (strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) { + debug_putstr("Wrong kexec EFI loader signature.\n"); + return 0; + } + + /* Get systab from boot params. */ + systab = (efi_system_table_64_t *) (ei->efi_systab | ((__u64)ei->efi_systab_hi << 32)); + if (!systab) + error("EFI system table not found in kexec boot_params."); + + return __efi_get_rsdp_addr((unsigned long)esd->tables, systab->nr_tables, true); +} +#else +static acpi_physical_address kexec_get_rsdp_addr(void) { return 0; } +#endif /* CONFIG_X86_64 */ + +static acpi_physical_address efi_get_rsdp_addr(void) +{ +#ifdef CONFIG_EFI + unsigned long systab, config_tables; unsigned int nr_tables; struct efi_info *ei; bool efi_64; - int size, i; char *sig; ei = &boot_params->efi_info; @@ -88,49 +180,20 @@ static acpi_physical_address efi_get_rsdp_addr(void) config_tables = stbl->tables; nr_tables = stbl->nr_tables; - size = sizeof(efi_config_table_64_t); } else { efi_system_table_32_t *stbl = (efi_system_table_32_t *)systab; config_tables = stbl->tables; nr_tables = stbl->nr_tables; - size = sizeof(efi_config_table_32_t); } if (!config_tables) error("EFI config tables not found."); - /* Get EFI tables from systab. */ - for (i = 0; i < nr_tables; i++) { - acpi_physical_address table; - efi_guid_t guid; - - config_tables += size; - - if (efi_64) { - efi_config_table_64_t *tbl = (efi_config_table_64_t *)config_tables; - - guid = tbl->guid; - table = tbl->table; - - if (!IS_ENABLED(CONFIG_X86_64) && table >> 32) { - debug_putstr("Error getting RSDP address: EFI config table located above 4GB.\n"); - return 0; - } - } else { - efi_config_table_32_t *tbl = (efi_config_table_32_t *)config_tables; - - guid = tbl->guid; - table = tbl->table; - } - - if (!(efi_guidcmp(guid, ACPI_TABLE_GUID))) - rsdp_addr = table; - else if (!(efi_guidcmp(guid, ACPI_20_TABLE_GUID))) - return table; - } + return __efi_get_rsdp_addr(config_tables, nr_tables, efi_64); +#else + return 0; #endif - return rsdp_addr; } static u8 compute_checksum(u8 *buffer, u32 length) @@ -220,6 +283,14 @@ acpi_physical_address get_rsdp_addr(void) if (!pa) pa = boot_params->acpi_rsdp_addr; + /* + * Try to get EFI data from setup_data. This can happen when we're a + * kexec'ed kernel and kexec(1) has passed all the required EFI info to + * us. + */ + if (!pa) + pa = kexec_get_rsdp_addr(); + if (!pa) pa = efi_get_rsdp_addr(); diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 544ac4fafd11..220d1279d0e2 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* ----------------------------------------------------------------------- * * Copyright 2011 Intel Corporation; author Matt Fleming * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ #include <linux/efi.h> diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index fafb75c6c592..6233ae35d0d9 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -659,6 +659,7 @@ no_longmode: gdt64: .word gdt_end - gdt .quad 0 + .balign 8 gdt: .word gdt_end - gdt .long gdt diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S index a480356e0ed8..6afb7130a387 100644 --- a/arch/x86/boot/compressed/mem_encrypt.S +++ b/arch/x86/boot/compressed/mem_encrypt.S @@ -1,13 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * AMD Memory Encryption Support * * Copyright (C) 2017 Advanced Micro Devices, Inc. * * Author: Tom Lendacky <thomas.lendacky@amd.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/linkage.h> diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 5a237e8dbf8d..24e65a0f756d 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -351,9 +351,6 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, /* Clear flags intended for solely in-kernel use. */ boot_params->hdr.loadflags &= ~KASLR_FLAG; - /* Save RSDP address for later use. */ - /* boot_params->acpi_rsdp_addr = get_rsdp_addr(); */ - sanitize_boot_params(boot_params); if (boot_params->screen_info.orig_video_mode == 7) { @@ -368,6 +365,14 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, cols = boot_params->screen_info.orig_video_cols; console_init(); + + /* + * Save RSDP address for later use. Have this after console_init() + * so that early debugging output from the RSDP parsing code can be + * collected. + */ + boot_params->acpi_rsdp_addr = get_rsdp_addr(); + debug_putstr("early console in extract_kernel\n"); free_mem_ptr = heap; /* Heap */ diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S index 15d9f74b0008..4c5f4f4ad035 100644 --- a/arch/x86/boot/copy.S +++ b/arch/x86/boot/copy.S @@ -1,11 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* ----------------------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ #include <linux/linkage.h> diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c index 26240dde081e..0bbf4f3707d2 100644 --- a/arch/x86/boot/cpu.c +++ b/arch/x86/boot/cpu.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007-2008 rPath, Inc. - All Rights Reserved * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index 51079fc9298f..e1478d32de1a 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index 6c176b6a42ad..1fb4bc70cee9 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * Copyright 2009 Intel Corporation; author H. Peter Anvin * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 90d791ca1a95..2c11c0f45d49 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -419,7 +419,17 @@ xloadflags: # define XLF4 0 #endif - .word XLF0 | XLF1 | XLF23 | XLF4 +#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_5LEVEL +#define XLF56 (XLF_5LEVEL|XLF_5LEVEL_ENABLED) +#else +#define XLF56 XLF_5LEVEL +#endif +#else +#define XLF56 0 +#endif + + .word XLF0 | XLF1 | XLF23 | XLF4 | XLF56 cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, #added with boot protocol diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 73532543d689..996df3d586f0 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * Copyright 2009 Intel Corporation; author H. Peter Anvin * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index f06c147b5140..b0422b79debc 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * Copyright 2009 Intel Corporation; author H. Peter Anvin * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c index 8062f8915250..40031a614712 100644 --- a/arch/x86/boot/pm.c +++ b/arch/x86/boot/pm.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S index 3e0edc6d2a20..c22f9a7d1aeb 100644 --- a/arch/x86/boot/pmjump.S +++ b/arch/x86/boot/pmjump.S @@ -1,11 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* ----------------------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c index 565083c16e5c..1237beeb9540 100644 --- a/arch/x86/boot/printf.c +++ b/arch/x86/boot/printf.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 90154df8f125..401e30ca0a75 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index def2451f46ae..1fedabdb95ad 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * Copyright 2009 Intel Corporation; author H. Peter Anvin * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/version.c b/arch/x86/boot/version.c index 2b15aa488ffb..a1aaaf6c06a6 100644 --- a/arch/x86/boot/version.c +++ b/arch/x86/boot/version.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c index 49e0c18833e0..6eb8c06bc287 100644 --- a/arch/x86/boot/video-bios.c +++ b/arch/x86/boot/video-bios.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * Copyright 2009 Intel Corporation; author H. Peter Anvin * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/video-mode.c b/arch/x86/boot/video-mode.c index 95c7a818c0ed..9ada55dc1ab7 100644 --- a/arch/x86/boot/video-mode.c +++ b/arch/x86/boot/video-mode.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007-2008 rPath, Inc. - All Rights Reserved * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 3ecc11a9c440..7e185977a984 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * Copyright 2009 Intel Corporation; author H. Peter Anvin * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index a14c5178d4ba..4816cb9cf996 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * Copyright 2009 Intel Corporation; author H. Peter Anvin * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index ac89b6624a40..f2e96905b3fe 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * Copyright 2009 Intel Corporation; author H. Peter Anvin * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h index b54e0328c449..cbf7fed22441 100644 --- a/arch/x86/boot/video.h +++ b/arch/x86/boot/video.h @@ -1,11 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved * - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. - * * ----------------------------------------------------------------------- */ /* diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 2b2481acc661..59ce9ed58430 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -130,7 +130,6 @@ CONFIG_CFG80211=y CONFIG_MAC80211=y CONFIG_MAC80211_LEDS=y CONFIG_RFKILL=y -CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_DEBUG_DEVRES=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index e8829abf063a..d0a5ffeae8df 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -129,7 +129,6 @@ CONFIG_CFG80211=y CONFIG_MAC80211=y CONFIG_MAC80211_LEDS=y CONFIG_RFKILL=y -CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_DEBUG_DEVRES=y diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S index 5f7e43d4f64a..4434607e366d 100644 --- a/arch/x86/crypto/aegis128-aesni-asm.S +++ b/arch/x86/crypto/aegis128-aesni-asm.S @@ -1,12 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * AES-NI + SSE2 implementation of AEGIS-128 * * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. */ #include <linux/linkage.h> diff --git a/arch/x86/crypto/aegis128l-aesni-asm.S b/arch/x86/crypto/aegis128l-aesni-asm.S index 491dd61c845c..1461ef00c0e8 100644 --- a/arch/x86/crypto/aegis128l-aesni-asm.S +++ b/arch/x86/crypto/aegis128l-aesni-asm.S @@ -1,12 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * AES-NI + SSE2 implementation of AEGIS-128L * * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. */ #include <linux/linkage.h> diff --git a/arch/x86/crypto/aegis256-aesni-asm.S b/arch/x86/crypto/aegis256-aesni-asm.S index 8870c7c5d9a4..37d9b13dfd85 100644 --- a/arch/x86/crypto/aegis256-aesni-asm.S +++ b/arch/x86/crypto/aegis256-aesni-asm.S @@ -1,12 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * AES-NI + SSE2 implementation of AEGIS-128L * * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. */ #include <linux/linkage.h> diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index e9b866e87d48..73c0ccb009a0 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -371,20 +371,6 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) } } -static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - - aesni_enc(ctx, dst, src); -} - -static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - - aesni_dec(ctx, dst, src); -} - static int aesni_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int len) { @@ -920,7 +906,7 @@ static int helper_rfc4106_decrypt(struct aead_request *req) } #endif -static struct crypto_alg aesni_algs[] = { { +static struct crypto_alg aesni_cipher_alg = { .cra_name = "aes", .cra_driver_name = "aes-aesni", .cra_priority = 300, @@ -937,24 +923,7 @@ static struct crypto_alg aesni_algs[] = { { .cia_decrypt = aes_decrypt } } -}, { - .cra_name = "__aes", - .cra_driver_name = "__aes-aesni", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_INTERNAL, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = CRYPTO_AES_CTX_SIZE, - .cra_module = THIS_MODULE, - .cra_u = { - .cipher = { - .cia_min_keysize = AES_MIN_KEY_SIZE, - .cia_max_keysize = AES_MAX_KEY_SIZE, - .cia_setkey = aes_set_key, - .cia_encrypt = __aes_encrypt, - .cia_decrypt = __aes_decrypt - } - } -} }; +}; static struct skcipher_alg aesni_skciphers[] = { { @@ -1150,7 +1119,7 @@ static int __init aesni_init(void) #endif #endif - err = crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); + err = crypto_register_alg(&aesni_cipher_alg); if (err) return err; @@ -1158,7 +1127,7 @@ static int __init aesni_init(void) ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); if (err) - goto unregister_algs; + goto unregister_cipher; err = simd_register_aeads_compat(aesni_aeads, ARRAY_SIZE(aesni_aeads), aesni_simd_aeads); @@ -1170,8 +1139,8 @@ static int __init aesni_init(void) unregister_skciphers: simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); -unregister_algs: - crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); +unregister_cipher: + crypto_unregister_alg(&aesni_cipher_alg); return err; } @@ -1181,7 +1150,7 @@ static void __exit aesni_exit(void) aesni_simd_aeads); simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), aesni_simd_skciphers); - crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); + crypto_unregister_alg(&aesni_cipher_alg); } late_initcall(aesni_init); diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 1ce0019c059c..388f95a4ec24 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -124,7 +124,7 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, } static int chacha_simd_stream_xor(struct skcipher_walk *walk, - struct chacha_ctx *ctx, u8 *iv) + const struct chacha_ctx *ctx, const u8 *iv) { u32 *state, state_buf[16 + 2] __aligned(8); int next_yield = 4096; /* bytes until next FPU yield */ diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index f94375a8dcd1..5d53effe8abe 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Accelerated GHASH implementation with Intel PCLMULQDQ-NI * instructions. This file contains accelerated part of ghash @@ -10,10 +11,6 @@ * Vinodh Gopal * Erdinc Ozturk * Deniz Karakoyunlu - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. */ #include <linux/linkage.h> diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index e3f3e6fd9d65..ac76fe88ac4f 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -1,13 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Accelerated GHASH implementation with Intel PCLMULQDQ-NI * instructions. This file contains glue code. * * Copyright (c) 2009 Intel Corp. * Author: Huang Ying <ying.huang@intel.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. */ #include <linux/err.h> diff --git a/arch/x86/crypto/morus1280-avx2-asm.S b/arch/x86/crypto/morus1280-avx2-asm.S index de182c460f82..5413fee33481 100644 --- a/arch/x86/crypto/morus1280-avx2-asm.S +++ b/arch/x86/crypto/morus1280-avx2-asm.S @@ -1,12 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * AVX2 implementation of MORUS-1280 * * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. */ #include <linux/linkage.h> diff --git a/arch/x86/crypto/morus1280-sse2-asm.S b/arch/x86/crypto/morus1280-sse2-asm.S index da5d2905db60..0eece772866b 100644 --- a/arch/x86/crypto/morus1280-sse2-asm.S +++ b/arch/x86/crypto/morus1280-sse2-asm.S @@ -1,12 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * SSE2 implementation of MORUS-1280 * * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. */ #include <linux/linkage.h> diff --git a/arch/x86/crypto/morus640-sse2-asm.S b/arch/x86/crypto/morus640-sse2-asm.S index 414db480250e..a60891101bbd 100644 --- a/arch/x86/crypto/morus640-sse2-asm.S +++ b/arch/x86/crypto/morus640-sse2-asm.S @@ -1,12 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * SSE2 implementation of MORUS-640 * * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. */ #include <linux/linkage.h> diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index efb0d1b1f15f..9f1f9e3b8230 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -172,21 +172,6 @@ For 32-bit we have the following conventions - kernel is built with .endif .endm -/* - * This is a sneaky trick to help the unwinder find pt_regs on the stack. The - * frame pointer is replaced with an encoded pointer to pt_regs. The encoding - * is just setting the LSB, which makes it an invalid stack address and is also - * a signal to the unwinder that it's a pt_regs pointer in disguise. - * - * NOTE: This macro must be used *after* PUSH_AND_CLEAR_REGS because it corrupts - * the original rbp. - */ -.macro ENCODE_FRAME_POINTER ptregs_offset=0 -#ifdef CONFIG_FRAME_POINTER - leaq 1+\ptregs_offset(%rsp), %rbp -#endif -.endm - #ifdef CONFIG_PAGE_TABLE_ISOLATION /* diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 2418804e66b4..536b574b6161 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -72,23 +72,18 @@ static long syscall_trace_enter(struct pt_regs *regs) struct thread_info *ti = current_thread_info(); unsigned long ret = 0; - bool emulated = false; u32 work; if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) BUG_ON(regs != task_pt_regs(current)); - work = READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; + work = READ_ONCE(ti->flags); - if (unlikely(work & _TIF_SYSCALL_EMU)) - emulated = true; - - if ((emulated || (work & _TIF_SYSCALL_TRACE)) && - tracehook_report_syscall_entry(regs)) - return -1L; - - if (emulated) - return -1L; + if (work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { + ret = tracehook_report_syscall_entry(regs); + if (ret || (work & _TIF_SYSCALL_EMU)) + return -1L; + } #ifdef CONFIG_SECCOMP /* diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 7b23431be5cb..1285e5abf669 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -67,7 +67,6 @@ # define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF #else # define preempt_stop(clobbers) -# define resume_kernel restore_all_kernel #endif .macro TRACE_IRQS_IRET @@ -203,9 +202,102 @@ .Lend_\@: .endm +#define CS_FROM_ENTRY_STACK (1 << 31) +#define CS_FROM_USER_CR3 (1 << 30) +#define CS_FROM_KERNEL (1 << 29) + +.macro FIXUP_FRAME + /* + * The high bits of the CS dword (__csh) are used for CS_FROM_*. + * Clear them in case hardware didn't do this for us. + */ + andl $0x0000ffff, 3*4(%esp) + +#ifdef CONFIG_VM86 + testl $X86_EFLAGS_VM, 4*4(%esp) + jnz .Lfrom_usermode_no_fixup_\@ +#endif + testl $SEGMENT_RPL_MASK, 3*4(%esp) + jnz .Lfrom_usermode_no_fixup_\@ + + orl $CS_FROM_KERNEL, 3*4(%esp) + + /* + * When we're here from kernel mode; the (exception) stack looks like: + * + * 5*4(%esp) - <previous context> + * 4*4(%esp) - flags + * 3*4(%esp) - cs + * 2*4(%esp) - ip + * 1*4(%esp) - orig_eax + * 0*4(%esp) - gs / function + * + * Lets build a 5 entry IRET frame after that, such that struct pt_regs + * is complete and in particular regs->sp is correct. This gives us + * the original 5 enties as gap: + * + * 12*4(%esp) - <previous context> + * 11*4(%esp) - gap / flags + * 10*4(%esp) - gap / cs + * 9*4(%esp) - gap / ip + * 8*4(%esp) - gap / orig_eax + * 7*4(%esp) - gap / gs / function + * 6*4(%esp) - ss + * 5*4(%esp) - sp + * 4*4(%esp) - flags + * 3*4(%esp) - cs + * 2*4(%esp) - ip + * 1*4(%esp) - orig_eax + * 0*4(%esp) - gs / function + */ + + pushl %ss # ss + pushl %esp # sp (points at ss) + addl $6*4, (%esp) # point sp back at the previous context + pushl 6*4(%esp) # flags + pushl 6*4(%esp) # cs + pushl 6*4(%esp) # ip + pushl 6*4(%esp) # orig_eax + pushl 6*4(%esp) # gs / function +.Lfrom_usermode_no_fixup_\@: +.endm + +.macro IRET_FRAME + testl $CS_FROM_KERNEL, 1*4(%esp) + jz .Lfinished_frame_\@ + + /* + * Reconstruct the 3 entry IRET frame right after the (modified) + * regs->sp without lowering %esp in between, such that an NMI in the + * middle doesn't scribble our stack. + */ + pushl %eax + pushl %ecx + movl 5*4(%esp), %eax # (modified) regs->sp + + movl 4*4(%esp), %ecx # flags + movl %ecx, -4(%eax) + + movl 3*4(%esp), %ecx # cs + andl $0x0000ffff, %ecx + movl %ecx, -8(%eax) + + movl 2*4(%esp), %ecx # ip + movl %ecx, -12(%eax) + + movl 1*4(%esp), %ecx # eax + movl %ecx, -16(%eax) + + popl %ecx + lea -16(%eax), %esp + popl %eax +.Lfinished_frame_\@: +.endm + .macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 cld PUSH_GS + FIXUP_FRAME pushl %fs pushl %es pushl %ds @@ -247,22 +339,6 @@ .Lend_\@: .endm -/* - * This is a sneaky trick to help the unwinder find pt_regs on the stack. The - * frame pointer is replaced with an encoded pointer to pt_regs. The encoding - * is just clearing the MSB, which makes it an invalid stack address and is also - * a signal to the unwinder that it's a pt_regs pointer in disguise. - * - * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the - * original rbp. - */ -.macro ENCODE_FRAME_POINTER -#ifdef CONFIG_FRAME_POINTER - mov %esp, %ebp - andl $0x7fffffff, %ebp -#endif -.endm - .macro RESTORE_INT_REGS popl %ebx popl %ecx @@ -375,9 +451,6 @@ * switch to it before we do any copying. */ -#define CS_FROM_ENTRY_STACK (1 << 31) -#define CS_FROM_USER_CR3 (1 << 30) - .macro SWITCH_TO_KERNEL_STACK ALTERNATIVE "", "jmp .Lend_\@", X86_FEATURE_XENPV @@ -391,13 +464,6 @@ * that register for the time this macro runs */ - /* - * The high bits of the CS dword (__csh) are used for - * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case - * hardware didn't do this for us. - */ - andl $(0x0000ffff), PT_CS(%esp) - /* Are we on the entry stack? Bail out if not! */ movl PER_CPU_VAR(cpu_entry_area), %ecx addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx @@ -755,7 +821,7 @@ ret_from_intr: andl $SEGMENT_RPL_MASK, %eax #endif cmpl $USER_RPL, %eax - jb resume_kernel # not returning to v8086 or userspace + jb restore_all_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) DISABLE_INTERRUPTS(CLBR_ANY) @@ -765,18 +831,6 @@ ENTRY(resume_userspace) jmp restore_all END(ret_from_exception) -#ifdef CONFIG_PREEMPT -ENTRY(resume_kernel) - DISABLE_INTERRUPTS(CLBR_ANY) - cmpl $0, PER_CPU_VAR(__preempt_count) - jnz restore_all_kernel - testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all_kernel - call preempt_schedule_irq - jmp restore_all_kernel -END(resume_kernel) -#endif - GLOBAL(__begin_SYSENTER_singlestep_region) /* * All code from here through __end_SYSENTER_singlestep_region is subject @@ -1019,6 +1073,7 @@ restore_all: /* Restore user state */ RESTORE_REGS pop=4 # skip orig_eax/error_code .Lirq_return: + IRET_FRAME /* * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization * when returning from IPI handler and when returning from @@ -1027,6 +1082,15 @@ restore_all: INTERRUPT_RETURN restore_all_kernel: +#ifdef CONFIG_PREEMPT + DISABLE_INTERRUPTS(CLBR_ANY) + cmpl $0, PER_CPU_VAR(__preempt_count) + jnz .Lno_preempt + testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz .Lno_preempt + call preempt_schedule_irq +.Lno_preempt: +#endif TRACE_IRQS_IRET PARANOID_EXIT_TO_KERNEL_MODE BUG_IF_WRONG_CR3 @@ -1104,6 +1168,30 @@ ENTRY(irq_entries_start) .endr END(irq_entries_start) +#ifdef CONFIG_X86_LOCAL_APIC + .align 8 +ENTRY(spurious_entries_start) + vector=FIRST_SYSTEM_VECTOR + .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) + pushl $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_spurious + .align 8 + .endr +END(spurious_entries_start) + +common_spurious: + ASM_CLAC + addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ + SAVE_ALL switch_stacks=1 + ENCODE_FRAME_POINTER + TRACE_IRQS_OFF + movl %esp, %eax + call smp_spurious_interrupt + jmp ret_from_intr +ENDPROC(common_interrupt) +#endif + /* * the CPU automatically disables interrupts when executing an IRQ vector, * so IRQ-flags tracing has to follow that: @@ -1360,6 +1448,7 @@ END(page_fault) common_exception: /* the function address is in %gs's slot on the stack */ + FIXUP_FRAME pushl %fs pushl %es pushl %ds diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 33f9fc38d014..0ea4831a72a4 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -375,6 +375,18 @@ ENTRY(irq_entries_start) .endr END(irq_entries_start) + .align 8 +ENTRY(spurious_entries_start) + vector=FIRST_SYSTEM_VECTOR + .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR) + UNWIND_HINT_IRET_REGS + pushq $(~vector+0x80) /* Note: always in signed byte range */ + jmp common_spurious + .align 8 + vector=vector+1 + .endr +END(spurious_entries_start) + .macro DEBUG_ENTRY_ASSERT_IRQS_OFF #ifdef CONFIG_DEBUG_ENTRY pushq %rax @@ -571,10 +583,20 @@ _ASM_NOKPROBE(interrupt_entry) /* Interrupt entry/exit. */ - /* - * The interrupt stubs push (~vector+0x80) onto the stack and - * then jump to common_interrupt. - */ +/* + * The interrupt stubs push (~vector+0x80) onto the stack and + * then jump to common_spurious/interrupt. + */ +common_spurious: + addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ + call interrupt_entry + UNWIND_HINT_REGS indirect=1 + call smp_spurious_interrupt /* rdi points to pt_regs */ + jmp ret_from_intr +END(common_spurious) +_ASM_NOKPROBE(common_spurious) + +/* common_interrupt is a hotpath. Align it */ .p2align CONFIG_X86_L1_CACHE_SHIFT common_interrupt: addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */ @@ -1142,6 +1164,11 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \ hv_stimer0_callback_vector hv_stimer0_vector_handler #endif /* CONFIG_HYPERV */ +#if IS_ENABLED(CONFIG_ACRN_GUEST) +apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ + acrn_hv_callback_vector acrn_hv_vector_handler +#endif + idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET idtentry int3 do_int3 has_error_code=0 create_gap=1 idtentry stack_segment do_stack_segment has_error_code=1 @@ -1670,11 +1697,17 @@ nmi_restore: iretq END(nmi) +#ifndef CONFIG_IA32_EMULATION +/* + * This handles SYSCALL from 32-bit code. There is no way to program + * MSRs to fully disable 32-bit SYSCALL. + */ ENTRY(ignore_sysret) UNWIND_HINT_EMPTY mov $-ENOSYS, %eax sysret END(ignore_sysret) +#endif ENTRY(rewind_stack_do_exit) UNWIND_HINT_FUNC diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S index fee6bc79b987..cb3464525b37 100644 --- a/arch/x86/entry/thunk_32.S +++ b/arch/x86/entry/thunk_32.S @@ -1,8 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash) * Copyright 2008 by Steven Rostedt, Red Hat, Inc * (inspired by Andi Kleen's thunk_64.S) - * Subject to the GNU public license, v.2. No warranty of any kind. */ #include <linux/linkage.h> #include <asm/asm.h> diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index be36bf4e0957..cfdca8b42c70 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -1,9 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Save registers before calling assembly functions. This avoids * disturbance of register allocation in some inline assembly constructs. * Copyright 2001,2002 by Andi Kleen, SuSE Labs. * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc. - * Subject to the GNU public license, v.2. No warranty of any kind. */ #include <linux/linkage.h> #include "calling.h" diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 42fe42e82baf..39106111be86 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -3,6 +3,12 @@ # Building vDSO images for x86. # +# Absolute relocation type $(ARCH_REL_TYPE_ABS) needs to be defined before +# the inclusion of generic Makefile. +ARCH_REL_TYPE_ABS := R_X86_64_JUMP_SLOT|R_X86_64_GLOB_DAT|R_X86_64_RELATIVE| +ARCH_REL_TYPE_ABS += R_386_GLOB_DAT|R_386_JMP_SLOT|R_386_RELATIVE +include $(srctree)/lib/vdso/Makefile + KBUILD_CFLAGS += $(DISABLE_LTO) KASAN_SANITIZE := n UBSAN_SANITIZE := n @@ -51,6 +57,7 @@ VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -soname linux-vdso.so.1 --no-undefined \ $(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE $(call if_changed,vdso) + $(call if_changed,vdso_check) HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/$(SUBARCH)/include/uapi hostprogs-y += vdso2c @@ -121,6 +128,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE $(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE $(call if_changed,vdso) + $(call if_changed,vdso_check) CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -soname linux-gate.so.1 @@ -160,6 +168,7 @@ $(obj)/vdso32.so.dbg: FORCE \ $(obj)/vdso32/system_call.o \ $(obj)/vdso32/sigreturn.o $(call if_changed,vdso) + $(call if_changed,vdso_check) # # The DSO images are built using a special linker script. diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 0f82a70c7682..d9ff616bb0f6 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -1,240 +1,85 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright 2006 Andi Kleen, SUSE Labs. - * * Fast user context implementation of clock_gettime, gettimeofday, and time. * + * Copyright 2006 Andi Kleen, SUSE Labs. + * Copyright 2019 ARM Limited + * * 32 Bit compat layer by Stefani Seibold <stefani@seibold.net> * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany - * - * The code should have no internal unresolved relocations. - * Check with readelf after changing. */ - -#include <uapi/linux/time.h> -#include <asm/vgtod.h> -#include <asm/vvar.h> -#include <asm/unistd.h> -#include <asm/msr.h> -#include <asm/pvclock.h> -#include <asm/mshyperv.h> -#include <linux/math64.h> #include <linux/time.h> #include <linux/kernel.h> +#include <linux/types.h> -#define gtod (&VVAR(vsyscall_gtod_data)) +#include "../../../../lib/vdso/gettimeofday.c" -extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts); -extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); +extern int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz); extern time_t __vdso_time(time_t *t); -#ifdef CONFIG_PARAVIRT_CLOCK -extern u8 pvclock_page[PAGE_SIZE] - __attribute__((visibility("hidden"))); -#endif - -#ifdef CONFIG_HYPERV_TSCPAGE -extern u8 hvclock_page[PAGE_SIZE] - __attribute__((visibility("hidden"))); -#endif - -#ifndef BUILD_VDSO32 - -notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz) { - long ret; - asm ("syscall" : "=a" (ret), "=m" (*ts) : - "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : - "rcx", "r11"); - return ret; + return __cvdso_gettimeofday(tv, tz); } -#else +int gettimeofday(struct __kernel_old_timeval *, struct timezone *) + __attribute__((weak, alias("__vdso_gettimeofday"))); -notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +time_t __vdso_time(time_t *t) { - long ret; - - asm ( - "mov %%ebx, %%edx \n" - "mov %[clock], %%ebx \n" - "call __kernel_vsyscall \n" - "mov %%edx, %%ebx \n" - : "=a" (ret), "=m" (*ts) - : "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts) - : "edx"); - return ret; + return __cvdso_time(t); } -#endif +time_t time(time_t *t) __attribute__((weak, alias("__vdso_time"))); -#ifdef CONFIG_PARAVIRT_CLOCK -static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void) -{ - return (const struct pvclock_vsyscall_time_info *)&pvclock_page; -} -static notrace u64 vread_pvclock(void) -{ - const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti; - u32 version; - u64 ret; - - /* - * Note: The kernel and hypervisor must guarantee that cpu ID - * number maps 1:1 to per-CPU pvclock time info. - * - * Because the hypervisor is entirely unaware of guest userspace - * preemption, it cannot guarantee that per-CPU pvclock time - * info is updated if the underlying CPU changes or that that - * version is increased whenever underlying CPU changes. - * - * On KVM, we are guaranteed that pvti updates for any vCPU are - * atomic as seen by *all* vCPUs. This is an even stronger - * guarantee than we get with a normal seqlock. - * - * On Xen, we don't appear to have that guarantee, but Xen still - * supplies a valid seqlock using the version field. - * - * We only do pvclock vdso timing at all if - * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to - * mean that all vCPUs have matching pvti and that the TSC is - * synced, so we can just look at vCPU 0's pvti. - */ - - do { - version = pvclock_read_begin(pvti); - - if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) - return U64_MAX; - - ret = __pvclock_read_cycles(pvti, rdtsc_ordered()); - } while (pvclock_read_retry(pvti, version)); - - return ret; -} -#endif -#ifdef CONFIG_HYPERV_TSCPAGE -static notrace u64 vread_hvclock(void) -{ - const struct ms_hyperv_tsc_page *tsc_pg = - (const struct ms_hyperv_tsc_page *)&hvclock_page; +#if defined(CONFIG_X86_64) && !defined(BUILD_VDSO32_64) +/* both 64-bit and x32 use these */ +extern int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts); +extern int __vdso_clock_getres(clockid_t clock, struct __kernel_timespec *res); - return hv_read_tsc_page(tsc_pg); -} -#endif - -notrace static inline u64 vgetcyc(int mode) +int __vdso_clock_gettime(clockid_t clock, struct __kernel_timespec *ts) { - if (mode == VCLOCK_TSC) - return (u64)rdtsc_ordered(); -#ifdef CONFIG_PARAVIRT_CLOCK - else if (mode == VCLOCK_PVCLOCK) - return vread_pvclock(); -#endif -#ifdef CONFIG_HYPERV_TSCPAGE - else if (mode == VCLOCK_HVCLOCK) - return vread_hvclock(); -#endif - return U64_MAX; + return __cvdso_clock_gettime(clock, ts); } -notrace static int do_hres(clockid_t clk, struct timespec *ts) -{ - struct vgtod_ts *base = >od->basetime[clk]; - u64 cycles, last, sec, ns; - unsigned int seq; - - do { - seq = gtod_read_begin(gtod); - cycles = vgetcyc(gtod->vclock_mode); - ns = base->nsec; - last = gtod->cycle_last; - if (unlikely((s64)cycles < 0)) - return vdso_fallback_gettime(clk, ts); - if (cycles > last) - ns += (cycles - last) * gtod->mult; - ns >>= gtod->shift; - sec = base->sec; - } while (unlikely(gtod_read_retry(gtod, seq))); - - /* - * Do this outside the loop: a race inside the loop could result - * in __iter_div_u64_rem() being extremely slow. - */ - ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); - ts->tv_nsec = ns; - - return 0; -} +int clock_gettime(clockid_t, struct __kernel_timespec *) + __attribute__((weak, alias("__vdso_clock_gettime"))); -notrace static void do_coarse(clockid_t clk, struct timespec *ts) +int __vdso_clock_getres(clockid_t clock, + struct __kernel_timespec *res) { - struct vgtod_ts *base = >od->basetime[clk]; - unsigned int seq; - - do { - seq = gtod_read_begin(gtod); - ts->tv_sec = base->sec; - ts->tv_nsec = base->nsec; - } while (unlikely(gtod_read_retry(gtod, seq))); + return __cvdso_clock_getres(clock, res); } +int clock_getres(clockid_t, struct __kernel_timespec *) + __attribute__((weak, alias("__vdso_clock_getres"))); -notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +#else +/* i386 only */ +extern int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts); +extern int __vdso_clock_getres(clockid_t clock, struct old_timespec32 *res); + +int __vdso_clock_gettime(clockid_t clock, struct old_timespec32 *ts) { - unsigned int msk; - - /* Sort out negative (CPU/FD) and invalid clocks */ - if (unlikely((unsigned int) clock >= MAX_CLOCKS)) - return vdso_fallback_gettime(clock, ts); - - /* - * Convert the clockid to a bitmask and use it to check which - * clocks are handled in the VDSO directly. - */ - msk = 1U << clock; - if (likely(msk & VGTOD_HRES)) { - return do_hres(clock, ts); - } else if (msk & VGTOD_COARSE) { - do_coarse(clock, ts); - return 0; - } - return vdso_fallback_gettime(clock, ts); + return __cvdso_clock_gettime32(clock, ts); } -int clock_gettime(clockid_t, struct timespec *) +int clock_gettime(clockid_t, struct old_timespec32 *) __attribute__((weak, alias("__vdso_clock_gettime"))); -notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +int __vdso_clock_gettime64(clockid_t clock, struct __kernel_timespec *ts) { - if (likely(tv != NULL)) { - struct timespec *ts = (struct timespec *) tv; - - do_hres(CLOCK_REALTIME, ts); - tv->tv_usec /= 1000; - } - if (unlikely(tz != NULL)) { - tz->tz_minuteswest = gtod->tz_minuteswest; - tz->tz_dsttime = gtod->tz_dsttime; - } - - return 0; + return __cvdso_clock_gettime(clock, ts); } -int gettimeofday(struct timeval *, struct timezone *) - __attribute__((weak, alias("__vdso_gettimeofday"))); -/* - * This will break when the xtime seconds get inaccurate, but that is - * unlikely - */ -notrace time_t __vdso_time(time_t *t) -{ - /* This is atomic on x86 so we don't need any locks. */ - time_t result = READ_ONCE(gtod->basetime[CLOCK_REALTIME].sec); +int clock_gettime64(clockid_t, struct __kernel_timespec *) + __attribute__((weak, alias("__vdso_clock_gettime64"))); - if (t) - *t = result; - return result; +int __vdso_clock_getres(clockid_t clock, struct old_timespec32 *res) +{ + return __cvdso_clock_getres_time32(clock, res); } -time_t time(time_t *t) - __attribute__((weak, alias("__vdso_time"))); + +int clock_getres(clockid_t, struct old_timespec32 *) + __attribute__((weak, alias("__vdso_clock_getres"))); +#endif diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S index d3a2dce4cfa9..36b644e16272 100644 --- a/arch/x86/entry/vdso/vdso.lds.S +++ b/arch/x86/entry/vdso/vdso.lds.S @@ -25,6 +25,8 @@ VERSION { __vdso_getcpu; time; __vdso_time; + clock_getres; + __vdso_clock_getres; local: *; }; } diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S index 422764a81d32..c7720995ab1a 100644 --- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S +++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S @@ -26,6 +26,8 @@ VERSION __vdso_clock_gettime; __vdso_gettimeofday; __vdso_time; + __vdso_clock_getres; + __vdso_clock_gettime64; }; LINUX_2.5 { diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S index 05cd1c5c4a15..16a8050a4fb6 100644 --- a/arch/x86/entry/vdso/vdsox32.lds.S +++ b/arch/x86/entry/vdso/vdsox32.lds.S @@ -21,6 +21,7 @@ VERSION { __vdso_gettimeofday; __vdso_getcpu; __vdso_time; + __vdso_clock_getres; local: *; }; } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 8db1f594e8b1..349a61d8bf34 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -22,7 +22,7 @@ #include <asm/page.h> #include <asm/desc.h> #include <asm/cpufeature.h> -#include <asm/mshyperv.h> +#include <clocksource/hyperv_timer.h> #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; diff --git a/arch/x86/entry/vsyscall/Makefile b/arch/x86/entry/vsyscall/Makefile index 1ac4dd116c26..93c1b3e949a7 100644 --- a/arch/x86/entry/vsyscall/Makefile +++ b/arch/x86/entry/vsyscall/Makefile @@ -2,7 +2,5 @@ # # Makefile for the x86 low level vsyscall code # -obj-y := vsyscall_gtod.o - obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index d9d81ad7a400..e7c596dea947 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -42,9 +42,11 @@ #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" -static enum { EMULATE, NONE } vsyscall_mode = +static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init = #ifdef CONFIG_LEGACY_VSYSCALL_NONE NONE; +#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY) + XONLY; #else EMULATE; #endif @@ -54,6 +56,8 @@ static int __init vsyscall_setup(char *str) if (str) { if (!strcmp("emulate", str)) vsyscall_mode = EMULATE; + else if (!strcmp("xonly", str)) + vsyscall_mode = XONLY; else if (!strcmp("none", str)) vsyscall_mode = NONE; else @@ -106,14 +110,15 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) thread->cr2 = ptr; thread->trap_nr = X86_TRAP_PF; - force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr, current); + force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)ptr); return false; } else { return true; } } -bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) +bool emulate_vsyscall(unsigned long error_code, + struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; @@ -122,6 +127,22 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) long ret; unsigned long orig_dx; + /* Write faults or kernel-privilege faults never get fixed up. */ + if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER) + return false; + + if (!(error_code & X86_PF_INSTR)) { + /* Failed vsyscall read */ + if (vsyscall_mode == EMULATE) + return false; + + /* + * User code tried and failed to read the vsyscall page. + */ + warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround"); + return false; + } + /* * No point in checking CS -- the only way to get here is a user mode * trap to a high address, which means that we're in 64-bit user code. @@ -268,7 +289,7 @@ do_ret: return true; sigsegv: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return true; } @@ -284,7 +305,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma) static const struct vm_operations_struct gate_vma_ops = { .name = gate_vma_name, }; -static struct vm_area_struct gate_vma = { +static struct vm_area_struct gate_vma __ro_after_init = { .vm_start = VSYSCALL_ADDR, .vm_end = VSYSCALL_ADDR + PAGE_SIZE, .vm_page_prot = PAGE_READONLY_EXEC, @@ -357,12 +378,20 @@ void __init map_vsyscall(void) extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - if (vsyscall_mode != NONE) { + /* + * For full emulation, the page needs to exist for real. In + * execute-only mode, there is no PTE at all backing the vsyscall + * page. + */ + if (vsyscall_mode == EMULATE) { __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, PAGE_KERNEL_VVAR); set_vsyscall_pgtable_user_bits(swapper_pg_dir); } + if (vsyscall_mode == XONLY) + gate_vma.vm_flags = VM_EXEC; + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); } diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c deleted file mode 100644 index cfcdba082feb..000000000000 --- a/arch/x86/entry/vsyscall/vsyscall_gtod.c +++ /dev/null @@ -1,83 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE - * Copyright 2003 Andi Kleen, SuSE Labs. - * - * Modified for x86 32 bit architecture by - * Stefani Seibold <stefani@seibold.net> - * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany - * - * Thanks to hpa@transmeta.com for some useful hint. - * Special thanks to Ingo Molnar for his early experience with - * a different vsyscall implementation for Linux/IA32 and for the name. - * - */ - -#include <linux/timekeeper_internal.h> -#include <asm/vgtod.h> -#include <asm/vvar.h> - -int vclocks_used __read_mostly; - -DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); - -void update_vsyscall_tz(void) -{ - vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest; - vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime; -} - -void update_vsyscall(struct timekeeper *tk) -{ - int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; - struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; - struct vgtod_ts *base; - u64 nsec; - - /* Mark the new vclock used. */ - BUILD_BUG_ON(VCLOCK_MAX >= 32); - WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode)); - - gtod_write_begin(vdata); - - /* copy vsyscall data */ - vdata->vclock_mode = vclock_mode; - vdata->cycle_last = tk->tkr_mono.cycle_last; - vdata->mask = tk->tkr_mono.mask; - vdata->mult = tk->tkr_mono.mult; - vdata->shift = tk->tkr_mono.shift; - - base = &vdata->basetime[CLOCK_REALTIME]; - base->sec = tk->xtime_sec; - base->nsec = tk->tkr_mono.xtime_nsec; - - base = &vdata->basetime[CLOCK_TAI]; - base->sec = tk->xtime_sec + (s64)tk->tai_offset; - base->nsec = tk->tkr_mono.xtime_nsec; - - base = &vdata->basetime[CLOCK_MONOTONIC]; - base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsec = tk->tkr_mono.xtime_nsec; - nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); - while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { - nsec -= ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; - base->sec++; - } - base->nsec = nsec; - - base = &vdata->basetime[CLOCK_REALTIME_COARSE]; - base->sec = tk->xtime_sec; - base->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; - - base = &vdata->basetime[CLOCK_MONOTONIC_COARSE]; - base->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; - nsec += tk->wall_to_monotonic.tv_nsec; - while (nsec >= NSEC_PER_SEC) { - nsec -= NSEC_PER_SEC; - base->sec++; - } - base->nsec = nsec; - - gtod_write_end(vdata); -} diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 9cbfd34042d5..9e07f554333f 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += core.o +obj-y += core.o probe.o obj-y += amd/ obj-$(CONFIG_X86_LOCAL_APIC) += msr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/ diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index 58a6993d7eb3..fb616203ce42 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2013 Advanced Micro Devices, Inc. * @@ -5,10 +6,6 @@ * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com> * * Perf: amd_iommu - AMD IOMMU Performance Counter PMU implementation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #define pr_fmt(fmt) "perf/amd_iommu: " fmt diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h index 62e0702c4374..0e5c036fd7be 100644 --- a/arch/x86/events/amd/iommu.h +++ b/arch/x86/events/amd/iommu.h @@ -1,12 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2013 Advanced Micro Devices, Inc. * * Author: Steven Kinney <Steven.Kinney@amd.com> * Author: Suravee Suthikulpanit <Suraveee.Suthikulpanit@amd.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef _PERF_EVENT_AMD_IOMMU_H_ diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c index c5ff084551c6..abef51320e3a 100644 --- a/arch/x86/events/amd/power.c +++ b/arch/x86/events/amd/power.c @@ -1,13 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Performance events - AMD Processor Power Reporting Mechanism * * Copyright (C) 2016 Advanced Micro Devices, Inc. * * Author: Huang Rui <ray.huang@amd.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/module.h> diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 79cfd3b30ceb..85e6984c560b 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -1,11 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2013 Advanced Micro Devices, Inc. * * Author: Jacob Shin <jacob.shin@amd.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/perf_event.h> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index f315425d8468..81b005e4c7d9 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -561,14 +561,14 @@ int x86_pmu_hw_config(struct perf_event *event) } /* sample_regs_user never support XMM registers */ - if (unlikely(event->attr.sample_regs_user & PEBS_XMM_REGS)) + if (unlikely(event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK)) return -EINVAL; /* * Besides the general purpose registers, XMM registers may * be collected in PEBS on some platforms, e.g. Icelake */ - if (unlikely(event->attr.sample_regs_intr & PEBS_XMM_REGS)) { - if (x86_pmu.pebs_no_xmm_regs) + if (unlikely(event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK)) { + if (!(event->pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS)) return -EINVAL; if (!event->attr.precise_ip) @@ -1618,68 +1618,6 @@ static struct attribute_group x86_pmu_format_group __ro_after_init = { .attrs = NULL, }; -/* - * Remove all undefined events (x86_pmu.event_map(id) == 0) - * out of events_attr attributes. - */ -static void __init filter_events(struct attribute **attrs) -{ - struct device_attribute *d; - struct perf_pmu_events_attr *pmu_attr; - int offset = 0; - int i, j; - - for (i = 0; attrs[i]; i++) { - d = (struct device_attribute *)attrs[i]; - pmu_attr = container_of(d, struct perf_pmu_events_attr, attr); - /* str trumps id */ - if (pmu_attr->event_str) - continue; - if (x86_pmu.event_map(i + offset)) - continue; - - for (j = i; attrs[j]; j++) - attrs[j] = attrs[j + 1]; - - /* Check the shifted attr. */ - i--; - - /* - * event_map() is index based, the attrs array is organized - * by increasing event index. If we shift the events, then - * we need to compensate for the event_map(), otherwise - * we are looking up the wrong event in the map - */ - offset++; - } -} - -/* Merge two pointer arrays */ -__init struct attribute **merge_attr(struct attribute **a, struct attribute **b) -{ - struct attribute **new; - int j, i; - - for (j = 0; a && a[j]; j++) - ; - for (i = 0; b && b[i]; i++) - j++; - j++; - - new = kmalloc_array(j, sizeof(struct attribute *), GFP_KERNEL); - if (!new) - return NULL; - - j = 0; - for (i = 0; a && a[i]; i++) - new[j++] = a[i]; - for (i = 0; b && b[i]; i++) - new[j++] = b[i]; - new[j] = NULL; - - return new; -} - ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = \ @@ -1744,9 +1682,24 @@ static struct attribute *events_attr[] = { NULL, }; +/* + * Remove all undefined events (x86_pmu.event_map(id) == 0) + * out of events_attr attributes. + */ +static umode_t +is_visible(struct kobject *kobj, struct attribute *attr, int idx) +{ + struct perf_pmu_events_attr *pmu_attr; + + pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr.attr); + /* str trumps id */ + return pmu_attr->event_str || x86_pmu.event_map(idx) ? attr->mode : 0; +} + static struct attribute_group x86_pmu_events_group __ro_after_init = { .name = "events", .attrs = events_attr, + .is_visible = is_visible, }; ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) @@ -1842,37 +1795,10 @@ static int __init init_hw_perf_events(void) x86_pmu_format_group.attrs = x86_pmu.format_attrs; - if (x86_pmu.caps_attrs) { - struct attribute **tmp; - - tmp = merge_attr(x86_pmu_caps_group.attrs, x86_pmu.caps_attrs); - if (!WARN_ON(!tmp)) - x86_pmu_caps_group.attrs = tmp; - } - - if (x86_pmu.event_attrs) - x86_pmu_events_group.attrs = x86_pmu.event_attrs; - if (!x86_pmu.events_sysfs_show) x86_pmu_events_group.attrs = &empty_attrs; - else - filter_events(x86_pmu_events_group.attrs); - - if (x86_pmu.cpu_events) { - struct attribute **tmp; - - tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events); - if (!WARN_ON(!tmp)) - x86_pmu_events_group.attrs = tmp; - } - - if (x86_pmu.attrs) { - struct attribute **tmp; - tmp = merge_attr(x86_pmu_attr_group.attrs, x86_pmu.attrs); - if (!WARN_ON(!tmp)) - x86_pmu_attr_group.attrs = tmp; - } + pmu.attr_update = x86_pmu.attr_update; pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); @@ -2179,7 +2105,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) * For now, this can't happen because all callers hold mmap_sem * for write. If this changes, we'll need a different solution. */ - lockdep_assert_held_exclusive(&mm->mmap_sem); + lockdep_assert_held_write(&mm->mmap_sem); if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); @@ -2402,13 +2328,13 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re return; } - if (perf_hw_regs(regs)) { - if (perf_callchain_store(entry, regs->ip)) - return; + if (perf_callchain_store(entry, regs->ip)) + return; + + if (perf_hw_regs(regs)) unwind_start(&state, current, regs, NULL); - } else { + else unwind_start(&state, current, NULL, (void *)regs->sp); - } for (; !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a5436cee20b1..bda450ff51ee 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -20,6 +20,7 @@ #include <asm/intel-family.h> #include <asm/apic.h> #include <asm/cpu_device_id.h> +#include <asm/hypervisor.h> #include "../perf_event.h" @@ -3897,8 +3898,6 @@ static __initconst const struct x86_pmu core_pmu = { .check_period = intel_pmu_check_period, }; -static struct attribute *intel_pmu_attrs[]; - static __initconst const struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -3930,8 +3929,6 @@ static __initconst const struct x86_pmu intel_pmu = { .format_attrs = intel_arch3_formats_attr, .events_sysfs_show = intel_event_sysfs_show, - .attrs = intel_pmu_attrs, - .cpu_prepare = intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, @@ -4055,6 +4052,13 @@ static bool check_msr(unsigned long msr, u64 mask) u64 val_old, val_new, val_tmp; /* + * Disable the check for real HW, so we don't + * mess with potentionaly enabled registers: + */ + if (hypervisor_is_type(X86_HYPER_NATIVE)) + return true; + + /* * Read the current value, change it and read it back to see if it * matches, this is needed to detect certain hardware emulators * (qemu/kvm) that don't trap on the MSR access and always return 0s. @@ -4274,13 +4278,6 @@ static struct attribute *icl_tsx_events_attrs[] = { NULL, }; -static __init struct attribute **get_icl_events_attrs(void) -{ - return boot_cpu_has(X86_FEATURE_RTM) ? - merge_attr(icl_events_attrs, icl_tsx_events_attrs) : - icl_events_attrs; -} - static ssize_t freeze_on_smi_show(struct device *cdev, struct device_attribute *attr, char *buf) @@ -4402,43 +4399,111 @@ static DEVICE_ATTR(allow_tsx_force_abort, 0644, static struct attribute *intel_pmu_attrs[] = { &dev_attr_freeze_on_smi.attr, - NULL, /* &dev_attr_allow_tsx_force_abort.attr.attr */ + &dev_attr_allow_tsx_force_abort.attr, NULL, }; -static __init struct attribute ** -get_events_attrs(struct attribute **base, - struct attribute **mem, - struct attribute **tsx) +static umode_t +tsx_is_visible(struct kobject *kobj, struct attribute *attr, int i) { - struct attribute **attrs = base; - struct attribute **old; + return boot_cpu_has(X86_FEATURE_RTM) ? attr->mode : 0; +} - if (mem && x86_pmu.pebs) - attrs = merge_attr(attrs, mem); +static umode_t +pebs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return x86_pmu.pebs ? attr->mode : 0; +} - if (tsx && boot_cpu_has(X86_FEATURE_RTM)) { - old = attrs; - attrs = merge_attr(attrs, tsx); - if (old != base) - kfree(old); - } +static umode_t +lbr_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return x86_pmu.lbr_nr ? attr->mode : 0; +} - return attrs; +static umode_t +exra_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return x86_pmu.version >= 2 ? attr->mode : 0; } +static umode_t +default_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + if (attr == &dev_attr_allow_tsx_force_abort.attr) + return x86_pmu.flags & PMU_FL_TFA ? attr->mode : 0; + + return attr->mode; +} + +static struct attribute_group group_events_td = { + .name = "events", +}; + +static struct attribute_group group_events_mem = { + .name = "events", + .is_visible = pebs_is_visible, +}; + +static struct attribute_group group_events_tsx = { + .name = "events", + .is_visible = tsx_is_visible, +}; + +static struct attribute_group group_caps_gen = { + .name = "caps", + .attrs = intel_pmu_caps_attrs, +}; + +static struct attribute_group group_caps_lbr = { + .name = "caps", + .attrs = lbr_attrs, + .is_visible = lbr_is_visible, +}; + +static struct attribute_group group_format_extra = { + .name = "format", + .is_visible = exra_is_visible, +}; + +static struct attribute_group group_format_extra_skl = { + .name = "format", + .is_visible = exra_is_visible, +}; + +static struct attribute_group group_default = { + .attrs = intel_pmu_attrs, + .is_visible = default_is_visible, +}; + +static const struct attribute_group *attr_update[] = { + &group_events_td, + &group_events_mem, + &group_events_tsx, + &group_caps_gen, + &group_caps_lbr, + &group_format_extra, + &group_format_extra_skl, + &group_default, + NULL, +}; + +static struct attribute *empty_attrs; + __init int intel_pmu_init(void) { - struct attribute **extra_attr = NULL; - struct attribute **mem_attr = NULL; - struct attribute **tsx_attr = NULL; - struct attribute **to_free = NULL; + struct attribute **extra_skl_attr = &empty_attrs; + struct attribute **extra_attr = &empty_attrs; + struct attribute **td_attr = &empty_attrs; + struct attribute **mem_attr = &empty_attrs; + struct attribute **tsx_attr = &empty_attrs; union cpuid10_edx edx; union cpuid10_eax eax; union cpuid10_ebx ebx; struct event_constraint *c; unsigned int unused; struct extra_reg *er; + bool pmem = false; int version, i; char *name; @@ -4596,7 +4661,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; x86_pmu.extra_regs = intel_slm_extra_regs; x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.cpu_events = slm_events_attrs; + td_attr = slm_events_attrs; extra_attr = slm_format_attr; pr_cont("Silvermont events, "); name = "silvermont"; @@ -4624,7 +4689,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_prec_dist = true; x86_pmu.lbr_pt_coexist = true; x86_pmu.flags |= PMU_FL_HAS_RSP_1; - x86_pmu.cpu_events = glm_events_attrs; + td_attr = glm_events_attrs; extra_attr = slm_format_attr; pr_cont("Goldmont events, "); name = "goldmont"; @@ -4651,7 +4716,7 @@ __init int intel_pmu_init(void) x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_PEBS_ALL; x86_pmu.get_event_constraints = glp_get_event_constraints; - x86_pmu.cpu_events = glm_events_attrs; + td_attr = glm_events_attrs; /* Goldmont Plus has 4-wide pipeline */ event_attr_td_total_slots_scale_glm.event_str = "4"; extra_attr = slm_format_attr; @@ -4740,7 +4805,7 @@ __init int intel_pmu_init(void) x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - x86_pmu.cpu_events = snb_events_attrs; + td_attr = snb_events_attrs; mem_attr = snb_mem_events_attrs; /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ @@ -4781,7 +4846,7 @@ __init int intel_pmu_init(void) x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_NO_HT_SHARING; - x86_pmu.cpu_events = snb_events_attrs; + td_attr = snb_events_attrs; mem_attr = snb_mem_events_attrs; /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ @@ -4818,10 +4883,10 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.cpu_events = hsw_events_attrs; x86_pmu.lbr_double_abort = true; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; + td_attr = hsw_events_attrs; mem_attr = hsw_mem_events_attrs; tsx_attr = hsw_tsx_events_attrs; pr_cont("Haswell events, "); @@ -4860,10 +4925,10 @@ __init int intel_pmu_init(void) x86_pmu.hw_config = hsw_hw_config; x86_pmu.get_event_constraints = hsw_get_event_constraints; - x86_pmu.cpu_events = hsw_events_attrs; x86_pmu.limit_period = bdw_limit_period; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; + td_attr = hsw_events_attrs; mem_attr = hsw_mem_events_attrs; tsx_attr = hsw_tsx_events_attrs; pr_cont("Broadwell events, "); @@ -4890,9 +4955,10 @@ __init int intel_pmu_init(void) name = "knights-landing"; break; + case INTEL_FAM6_SKYLAKE_X: + pmem = true; case INTEL_FAM6_SKYLAKE_MOBILE: case INTEL_FAM6_SKYLAKE_DESKTOP: - case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_KABYLAKE_MOBILE: case INTEL_FAM6_KABYLAKE_DESKTOP: x86_add_quirk(intel_pebs_isolation_quirk); @@ -4920,27 +4986,28 @@ __init int intel_pmu_init(void) x86_pmu.get_event_constraints = hsw_get_event_constraints; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; - extra_attr = merge_attr(extra_attr, skl_format_attr); - to_free = extra_attr; - x86_pmu.cpu_events = hsw_events_attrs; + extra_skl_attr = skl_format_attr; + td_attr = hsw_events_attrs; mem_attr = hsw_mem_events_attrs; tsx_attr = hsw_tsx_events_attrs; - intel_pmu_pebs_data_source_skl( - boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X); + intel_pmu_pebs_data_source_skl(pmem); if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { x86_pmu.flags |= PMU_FL_TFA; x86_pmu.get_event_constraints = tfa_get_event_constraints; x86_pmu.enable_all = intel_tfa_pmu_enable_all; x86_pmu.commit_scheduling = intel_tfa_commit_scheduling; - intel_pmu_attrs[1] = &dev_attr_allow_tsx_force_abort.attr; } pr_cont("Skylake events, "); name = "skylake"; break; + case INTEL_FAM6_ICELAKE_X: + case INTEL_FAM6_ICELAKE_XEON_D: + pmem = true; case INTEL_FAM6_ICELAKE_MOBILE: + case INTEL_FAM6_ICELAKE_DESKTOP: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); @@ -4959,11 +5026,12 @@ __init int intel_pmu_init(void) x86_pmu.get_event_constraints = icl_get_event_constraints; extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? hsw_format_attr : nhm_format_attr; - extra_attr = merge_attr(extra_attr, skl_format_attr); - x86_pmu.cpu_events = get_icl_events_attrs(); + extra_skl_attr = skl_format_attr; + mem_attr = icl_events_attrs; + tsx_attr = icl_tsx_events_attrs; x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xca, .umask=0x02); x86_pmu.lbr_pt_coexist = true; - intel_pmu_pebs_data_source_skl(false); + intel_pmu_pebs_data_source_skl(pmem); pr_cont("Icelake events, "); name = "icelake"; break; @@ -4988,14 +5056,14 @@ __init int intel_pmu_init(void) snprintf(pmu_name_str, sizeof(pmu_name_str), "%s", name); - if (version >= 2 && extra_attr) { - x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr, - extra_attr); - WARN_ON(!x86_pmu.format_attrs); - } - x86_pmu.cpu_events = get_events_attrs(x86_pmu.cpu_events, - mem_attr, tsx_attr); + group_events_td.attrs = td_attr; + group_events_mem.attrs = mem_attr; + group_events_tsx.attrs = tsx_attr; + group_format_extra.attrs = extra_attr; + group_format_extra_skl.attrs = extra_skl_attr; + + x86_pmu.attr_update = attr_update; if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", @@ -5043,12 +5111,8 @@ __init int intel_pmu_init(void) x86_pmu.lbr_nr = 0; } - x86_pmu.caps_attrs = intel_pmu_caps_attrs; - - if (x86_pmu.lbr_nr) { - x86_pmu.caps_attrs = merge_attr(x86_pmu.caps_attrs, lbr_attrs); + if (x86_pmu.lbr_nr) pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); - } /* * Access extra MSR may cause #GP under certain circumstances. @@ -5078,7 +5142,6 @@ __init int intel_pmu_init(void) if (x86_pmu.counter_freezing) x86_pmu.handle_irq = intel_pmu_handle_irq_v4; - kfree(to_free); return 0; } diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 6072f92cb8ea..688592b34564 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -96,6 +96,7 @@ #include <asm/cpu_device_id.h> #include <asm/intel-family.h> #include "../perf_event.h" +#include "../probe.h" MODULE_LICENSE("GPL"); @@ -144,25 +145,42 @@ enum perf_cstate_core_events { PERF_CSTATE_CORE_EVENT_MAX, }; -PMU_EVENT_ATTR_STRING(c1-residency, evattr_cstate_core_c1, "event=0x00"); -PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_core_c3, "event=0x01"); -PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_core_c6, "event=0x02"); -PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_core_c7, "event=0x03"); +PMU_EVENT_ATTR_STRING(c1-residency, attr_cstate_core_c1, "event=0x00"); +PMU_EVENT_ATTR_STRING(c3-residency, attr_cstate_core_c3, "event=0x01"); +PMU_EVENT_ATTR_STRING(c6-residency, attr_cstate_core_c6, "event=0x02"); +PMU_EVENT_ATTR_STRING(c7-residency, attr_cstate_core_c7, "event=0x03"); -static struct perf_cstate_msr core_msr[] = { - [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &evattr_cstate_core_c1 }, - [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &evattr_cstate_core_c3 }, - [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &evattr_cstate_core_c6 }, - [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &evattr_cstate_core_c7 }, +static unsigned long core_msr_mask; + +PMU_EVENT_GROUP(events, cstate_core_c1); +PMU_EVENT_GROUP(events, cstate_core_c3); +PMU_EVENT_GROUP(events, cstate_core_c6); +PMU_EVENT_GROUP(events, cstate_core_c7); + +static bool test_msr(int idx, void *data) +{ + return test_bit(idx, (unsigned long *) data); +} + +static struct perf_msr core_msr[] = { + [PERF_CSTATE_CORE_C1_RES] = { MSR_CORE_C1_RES, &group_cstate_core_c1, test_msr }, + [PERF_CSTATE_CORE_C3_RES] = { MSR_CORE_C3_RESIDENCY, &group_cstate_core_c3, test_msr }, + [PERF_CSTATE_CORE_C6_RES] = { MSR_CORE_C6_RESIDENCY, &group_cstate_core_c6, test_msr }, + [PERF_CSTATE_CORE_C7_RES] = { MSR_CORE_C7_RESIDENCY, &group_cstate_core_c7, test_msr }, }; -static struct attribute *core_events_attrs[PERF_CSTATE_CORE_EVENT_MAX + 1] = { +static struct attribute *attrs_empty[] = { NULL, }; +/* + * There are no default events, but we need to create + * "events" group (with empty attrs) before updating + * it with detected events. + */ static struct attribute_group core_events_attr_group = { .name = "events", - .attrs = core_events_attrs, + .attrs = attrs_empty, }; DEFINE_CSTATE_FORMAT_ATTR(core_event, event, "config:0-63"); @@ -211,31 +229,37 @@ enum perf_cstate_pkg_events { PERF_CSTATE_PKG_EVENT_MAX, }; -PMU_EVENT_ATTR_STRING(c2-residency, evattr_cstate_pkg_c2, "event=0x00"); -PMU_EVENT_ATTR_STRING(c3-residency, evattr_cstate_pkg_c3, "event=0x01"); -PMU_EVENT_ATTR_STRING(c6-residency, evattr_cstate_pkg_c6, "event=0x02"); -PMU_EVENT_ATTR_STRING(c7-residency, evattr_cstate_pkg_c7, "event=0x03"); -PMU_EVENT_ATTR_STRING(c8-residency, evattr_cstate_pkg_c8, "event=0x04"); -PMU_EVENT_ATTR_STRING(c9-residency, evattr_cstate_pkg_c9, "event=0x05"); -PMU_EVENT_ATTR_STRING(c10-residency, evattr_cstate_pkg_c10, "event=0x06"); - -static struct perf_cstate_msr pkg_msr[] = { - [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &evattr_cstate_pkg_c2 }, - [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &evattr_cstate_pkg_c3 }, - [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &evattr_cstate_pkg_c6 }, - [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &evattr_cstate_pkg_c7 }, - [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &evattr_cstate_pkg_c8 }, - [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &evattr_cstate_pkg_c9 }, - [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &evattr_cstate_pkg_c10 }, -}; - -static struct attribute *pkg_events_attrs[PERF_CSTATE_PKG_EVENT_MAX + 1] = { - NULL, +PMU_EVENT_ATTR_STRING(c2-residency, attr_cstate_pkg_c2, "event=0x00"); +PMU_EVENT_ATTR_STRING(c3-residency, attr_cstate_pkg_c3, "event=0x01"); +PMU_EVENT_ATTR_STRING(c6-residency, attr_cstate_pkg_c6, "event=0x02"); +PMU_EVENT_ATTR_STRING(c7-residency, attr_cstate_pkg_c7, "event=0x03"); +PMU_EVENT_ATTR_STRING(c8-residency, attr_cstate_pkg_c8, "event=0x04"); +PMU_EVENT_ATTR_STRING(c9-residency, attr_cstate_pkg_c9, "event=0x05"); +PMU_EVENT_ATTR_STRING(c10-residency, attr_cstate_pkg_c10, "event=0x06"); + +static unsigned long pkg_msr_mask; + +PMU_EVENT_GROUP(events, cstate_pkg_c2); +PMU_EVENT_GROUP(events, cstate_pkg_c3); +PMU_EVENT_GROUP(events, cstate_pkg_c6); +PMU_EVENT_GROUP(events, cstate_pkg_c7); +PMU_EVENT_GROUP(events, cstate_pkg_c8); +PMU_EVENT_GROUP(events, cstate_pkg_c9); +PMU_EVENT_GROUP(events, cstate_pkg_c10); + +static struct perf_msr pkg_msr[] = { + [PERF_CSTATE_PKG_C2_RES] = { MSR_PKG_C2_RESIDENCY, &group_cstate_pkg_c2, test_msr }, + [PERF_CSTATE_PKG_C3_RES] = { MSR_PKG_C3_RESIDENCY, &group_cstate_pkg_c3, test_msr }, + [PERF_CSTATE_PKG_C6_RES] = { MSR_PKG_C6_RESIDENCY, &group_cstate_pkg_c6, test_msr }, + [PERF_CSTATE_PKG_C7_RES] = { MSR_PKG_C7_RESIDENCY, &group_cstate_pkg_c7, test_msr }, + [PERF_CSTATE_PKG_C8_RES] = { MSR_PKG_C8_RESIDENCY, &group_cstate_pkg_c8, test_msr }, + [PERF_CSTATE_PKG_C9_RES] = { MSR_PKG_C9_RESIDENCY, &group_cstate_pkg_c9, test_msr }, + [PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &group_cstate_pkg_c10, test_msr }, }; static struct attribute_group pkg_events_attr_group = { .name = "events", - .attrs = pkg_events_attrs, + .attrs = attrs_empty, }; DEFINE_CSTATE_FORMAT_ATTR(pkg_event, event, "config:0-63"); @@ -289,7 +313,8 @@ static int cstate_pmu_event_init(struct perf_event *event) if (event->pmu == &cstate_core_pmu) { if (cfg >= PERF_CSTATE_CORE_EVENT_MAX) return -EINVAL; - if (!core_msr[cfg].attr) + cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_CORE_EVENT_MAX); + if (!(core_msr_mask & (1 << cfg))) return -EINVAL; event->hw.event_base = core_msr[cfg].msr; cpu = cpumask_any_and(&cstate_core_cpu_mask, @@ -298,11 +323,11 @@ static int cstate_pmu_event_init(struct perf_event *event) if (cfg >= PERF_CSTATE_PKG_EVENT_MAX) return -EINVAL; cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX); - if (!pkg_msr[cfg].attr) + if (!(pkg_msr_mask & (1 << cfg))) return -EINVAL; event->hw.event_base = pkg_msr[cfg].msr; cpu = cpumask_any_and(&cstate_pkg_cpu_mask, - topology_core_cpumask(event->cpu)); + topology_die_cpumask(event->cpu)); } else { return -ENOENT; } @@ -385,7 +410,7 @@ static int cstate_cpu_exit(unsigned int cpu) if (has_cstate_pkg && cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask)) { - target = cpumask_any_but(topology_core_cpumask(cpu), cpu); + target = cpumask_any_but(topology_die_cpumask(cpu), cpu); /* Migrate events if there is a valid target */ if (target < nr_cpu_ids) { cpumask_set_cpu(target, &cstate_pkg_cpu_mask); @@ -414,15 +439,35 @@ static int cstate_cpu_init(unsigned int cpu) * in the package cpu mask as the designated reader. */ target = cpumask_any_and(&cstate_pkg_cpu_mask, - topology_core_cpumask(cpu)); + topology_die_cpumask(cpu)); if (has_cstate_pkg && target >= nr_cpu_ids) cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask); return 0; } +const struct attribute_group *core_attr_update[] = { + &group_cstate_core_c1, + &group_cstate_core_c3, + &group_cstate_core_c6, + &group_cstate_core_c7, + NULL, +}; + +const struct attribute_group *pkg_attr_update[] = { + &group_cstate_pkg_c2, + &group_cstate_pkg_c3, + &group_cstate_pkg_c6, + &group_cstate_pkg_c7, + &group_cstate_pkg_c8, + &group_cstate_pkg_c9, + &group_cstate_pkg_c10, + NULL, +}; + static struct pmu cstate_core_pmu = { .attr_groups = core_attr_groups, + .attr_update = core_attr_update, .name = "cstate_core", .task_ctx_nr = perf_invalid_context, .event_init = cstate_pmu_event_init, @@ -437,6 +482,7 @@ static struct pmu cstate_core_pmu = { static struct pmu cstate_pkg_pmu = { .attr_groups = pkg_attr_groups, + .attr_update = pkg_attr_update, .name = "cstate_pkg", .task_ctx_nr = perf_invalid_context, .event_init = cstate_pmu_event_init, @@ -580,35 +626,11 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_MOBILE, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_DESKTOP, snb_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); -/* - * Probe the cstate events and insert the available one into sysfs attrs - * Return false if there are no available events. - */ -static bool __init cstate_probe_msr(const unsigned long evmsk, int max, - struct perf_cstate_msr *msr, - struct attribute **attrs) -{ - bool found = false; - unsigned int bit; - u64 val; - - for (bit = 0; bit < max; bit++) { - if (test_bit(bit, &evmsk) && !rdmsrl_safe(msr[bit].msr, &val)) { - *attrs++ = &msr[bit].attr->attr.attr; - found = true; - } else { - msr[bit].attr = NULL; - } - } - *attrs = NULL; - - return found; -} - static int __init cstate_probe(const struct cstate_model *cm) { /* SLM has different MSR for PKG C6 */ @@ -620,13 +642,14 @@ static int __init cstate_probe(const struct cstate_model *cm) pkg_msr[PERF_CSTATE_CORE_C6_RES].msr = MSR_KNL_CORE_C6_RESIDENCY; - has_cstate_core = cstate_probe_msr(cm->core_events, - PERF_CSTATE_CORE_EVENT_MAX, - core_msr, core_events_attrs); + core_msr_mask = perf_msr_probe(core_msr, PERF_CSTATE_CORE_EVENT_MAX, + true, (void *) &cm->core_events); - has_cstate_pkg = cstate_probe_msr(cm->pkg_events, - PERF_CSTATE_PKG_EVENT_MAX, - pkg_msr, pkg_events_attrs); + pkg_msr_mask = perf_msr_probe(pkg_msr, PERF_CSTATE_PKG_EVENT_MAX, + true, (void *) &cm->pkg_events); + + has_cstate_core = !!core_msr_mask; + has_cstate_pkg = !!pkg_msr_mask; return (has_cstate_core || has_cstate_pkg) ? 0 : -ENODEV; } @@ -663,7 +686,13 @@ static int __init cstate_init(void) } if (has_cstate_pkg) { - err = perf_pmu_register(&cstate_pkg_pmu, cstate_pkg_pmu.name, -1); + if (topology_max_die_per_package() > 1) { + err = perf_pmu_register(&cstate_pkg_pmu, + "cstate_die", -1); + } else { + err = perf_pmu_register(&cstate_pkg_pmu, + cstate_pkg_pmu.name, -1); + } if (err) { has_cstate_pkg = false; pr_info("Failed to register cstate pkg pmu\n"); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 7acc526b4ad2..2c8db2c19328 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -337,7 +337,7 @@ static int alloc_pebs_buffer(int cpu) struct debug_store *ds = hwev->ds; size_t bsiz = x86_pmu.pebs_buffer_size; int max, node = cpu_to_node(cpu); - void *buffer, *ibuffer, *cea; + void *buffer, *insn_buff, *cea; if (!x86_pmu.pebs) return 0; @@ -351,12 +351,12 @@ static int alloc_pebs_buffer(int cpu) * buffer then. */ if (x86_pmu.intel_cap.pebs_format < 2) { - ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); - if (!ibuffer) { + insn_buff = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); + if (!insn_buff) { dsfree_pages(buffer, bsiz); return -ENOMEM; } - per_cpu(insn_buffer, cpu) = ibuffer; + per_cpu(insn_buffer, cpu) = insn_buff; } hwev->ds_pebs_vaddr = buffer; /* Update the cpu entry area mapping */ @@ -987,7 +987,7 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event) pebs_data_cfg |= PEBS_DATACFG_GP; if ((sample_type & PERF_SAMPLE_REGS_INTR) && - (attr->sample_regs_intr & PEBS_XMM_REGS)) + (attr->sample_regs_intr & PERF_REG_EXTENDED_MASK)) pebs_data_cfg |= PEBS_DATACFG_XMMS; if (sample_type & PERF_SAMPLE_BRANCH_STACK) { @@ -1964,10 +1964,9 @@ void __init intel_ds_init(void) x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; - if (x86_pmu.version <= 4) { + if (x86_pmu.version <= 4) x86_pmu.pebs_no_isolation = 1; - x86_pmu.pebs_no_xmm_regs = 1; - } + if (x86_pmu.pebs) { char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; char *pebs_qual = ""; @@ -2020,9 +2019,9 @@ void __init intel_ds_init(void) PERF_SAMPLE_TIME; x86_pmu.flags |= PMU_FL_PEBS_ALL; pebs_qual = "-baseline"; + x86_get_pmu()->capabilities |= PERF_PMU_CAP_EXTENDED_REGS; } else { /* Only basic record supported */ - x86_pmu.pebs_no_xmm_regs = 1; x86_pmu.large_pebs_flags &= ~(PERF_SAMPLE_ADDR | PERF_SAMPLE_TIME | diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 26c03f5adfb9..64ab51ffdf06 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -55,27 +55,28 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/perf_event.h> +#include <linux/nospec.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> #include "../perf_event.h" +#include "../probe.h" MODULE_LICENSE("GPL"); /* * RAPL energy status counters */ -#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */ -#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */ -#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */ -#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ -#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ -#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ -#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ -#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ -#define RAPL_IDX_PSYS_NRG_STAT 4 /* psys */ -#define INTEL_RAPL_PSYS 0x5 /* pseudo-encoding */ - -#define NR_RAPL_DOMAINS 0x5 +enum perf_rapl_events { + PERF_RAPL_PP0 = 0, /* all cores */ + PERF_RAPL_PKG, /* entire package */ + PERF_RAPL_RAM, /* DRAM */ + PERF_RAPL_PP1, /* gpu */ + PERF_RAPL_PSYS, /* psys */ + + PERF_RAPL_MAX, + NR_RAPL_DOMAINS = PERF_RAPL_MAX, +}; + static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { "pp0-core", "package", @@ -84,33 +85,6 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { "psys", }; -/* Clients have PP0, PKG */ -#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\ - 1<<RAPL_IDX_PKG_NRG_STAT|\ - 1<<RAPL_IDX_PP1_NRG_STAT) - -/* Servers have PP0, PKG, RAM */ -#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\ - 1<<RAPL_IDX_PKG_NRG_STAT|\ - 1<<RAPL_IDX_RAM_NRG_STAT) - -/* Servers have PP0, PKG, RAM, PP1 */ -#define RAPL_IDX_HSW (1<<RAPL_IDX_PP0_NRG_STAT|\ - 1<<RAPL_IDX_PKG_NRG_STAT|\ - 1<<RAPL_IDX_RAM_NRG_STAT|\ - 1<<RAPL_IDX_PP1_NRG_STAT) - -/* SKL clients have PP0, PKG, RAM, PP1, PSYS */ -#define RAPL_IDX_SKL_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\ - 1<<RAPL_IDX_PKG_NRG_STAT|\ - 1<<RAPL_IDX_RAM_NRG_STAT|\ - 1<<RAPL_IDX_PP1_NRG_STAT|\ - 1<<RAPL_IDX_PSYS_NRG_STAT) - -/* Knights Landing has PKG, RAM */ -#define RAPL_IDX_KNL (1<<RAPL_IDX_PKG_NRG_STAT|\ - 1<<RAPL_IDX_RAM_NRG_STAT) - /* * event code: LSB 8 bits, passed in attr->config * any other bit is reserved @@ -149,26 +123,32 @@ struct rapl_pmu { struct rapl_pmus { struct pmu pmu; - unsigned int maxpkg; + unsigned int maxdie; struct rapl_pmu *pmus[]; }; +struct rapl_model { + unsigned long events; + bool apply_quirk; +}; + /* 1/2^hw_unit Joule */ static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; static struct rapl_pmus *rapl_pmus; static cpumask_t rapl_cpu_mask; static unsigned int rapl_cntr_mask; static u64 rapl_timer_ms; +static struct perf_msr rapl_msrs[]; static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) { - unsigned int pkgid = topology_logical_package_id(cpu); + unsigned int dieid = topology_logical_die_id(cpu); /* * The unsigned check also catches the '-1' return value for non * existent mappings in the topology map. */ - return pkgid < rapl_pmus->maxpkg ? rapl_pmus->pmus[pkgid] : NULL; + return dieid < rapl_pmus->maxdie ? rapl_pmus->pmus[dieid] : NULL; } static inline u64 rapl_read_counter(struct perf_event *event) @@ -350,7 +330,7 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags) static int rapl_pmu_event_init(struct perf_event *event) { u64 cfg = event->attr.config & RAPL_EVENT_MASK; - int bit, msr, ret = 0; + int bit, ret = 0; struct rapl_pmu *pmu; /* only look at RAPL events */ @@ -366,33 +346,12 @@ static int rapl_pmu_event_init(struct perf_event *event) event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; - /* - * check event is known (determines counter) - */ - switch (cfg) { - case INTEL_RAPL_PP0: - bit = RAPL_IDX_PP0_NRG_STAT; - msr = MSR_PP0_ENERGY_STATUS; - break; - case INTEL_RAPL_PKG: - bit = RAPL_IDX_PKG_NRG_STAT; - msr = MSR_PKG_ENERGY_STATUS; - break; - case INTEL_RAPL_RAM: - bit = RAPL_IDX_RAM_NRG_STAT; - msr = MSR_DRAM_ENERGY_STATUS; - break; - case INTEL_RAPL_PP1: - bit = RAPL_IDX_PP1_NRG_STAT; - msr = MSR_PP1_ENERGY_STATUS; - break; - case INTEL_RAPL_PSYS: - bit = RAPL_IDX_PSYS_NRG_STAT; - msr = MSR_PLATFORM_ENERGY_STATUS; - break; - default: + if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) return -EINVAL; - } + + cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); + bit = cfg - 1; + /* check event supported */ if (!(rapl_cntr_mask & (1 << bit))) return -EINVAL; @@ -407,7 +366,7 @@ static int rapl_pmu_event_init(struct perf_event *event) return -EINVAL; event->cpu = pmu->cpu; event->pmu_private = pmu; - event->hw.event_base = msr; + event->hw.event_base = rapl_msrs[bit].msr; event->hw.config = cfg; event->hw.idx = bit; @@ -457,110 +416,111 @@ RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890 RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10"); -static struct attribute *rapl_events_srv_attr[] = { - EVENT_PTR(rapl_cores), - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_ram), +/* + * There are no default events, but we need to create + * "events" group (with empty attrs) before updating + * it with detected events. + */ +static struct attribute *attrs_empty[] = { + NULL, +}; - EVENT_PTR(rapl_cores_unit), - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_ram_unit), +static struct attribute_group rapl_pmu_events_group = { + .name = "events", + .attrs = attrs_empty, +}; - EVENT_PTR(rapl_cores_scale), - EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_ram_scale), +DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); +static struct attribute *rapl_formats_attr[] = { + &format_attr_event.attr, NULL, }; -static struct attribute *rapl_events_cln_attr[] = { - EVENT_PTR(rapl_cores), - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_gpu), - - EVENT_PTR(rapl_cores_unit), - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_gpu_unit), +static struct attribute_group rapl_pmu_format_group = { + .name = "format", + .attrs = rapl_formats_attr, +}; - EVENT_PTR(rapl_cores_scale), - EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_gpu_scale), +static const struct attribute_group *rapl_attr_groups[] = { + &rapl_pmu_attr_group, + &rapl_pmu_format_group, + &rapl_pmu_events_group, NULL, }; -static struct attribute *rapl_events_hsw_attr[] = { +static struct attribute *rapl_events_cores[] = { EVENT_PTR(rapl_cores), - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_gpu), - EVENT_PTR(rapl_ram), - EVENT_PTR(rapl_cores_unit), - EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_gpu_unit), - EVENT_PTR(rapl_ram_unit), - EVENT_PTR(rapl_cores_scale), - EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_gpu_scale), - EVENT_PTR(rapl_ram_scale), NULL, }; -static struct attribute *rapl_events_skl_attr[] = { - EVENT_PTR(rapl_cores), - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_gpu), - EVENT_PTR(rapl_ram), - EVENT_PTR(rapl_psys), +static struct attribute_group rapl_events_cores_group = { + .name = "events", + .attrs = rapl_events_cores, +}; - EVENT_PTR(rapl_cores_unit), +static struct attribute *rapl_events_pkg[] = { + EVENT_PTR(rapl_pkg), EVENT_PTR(rapl_pkg_unit), - EVENT_PTR(rapl_gpu_unit), - EVENT_PTR(rapl_ram_unit), - EVENT_PTR(rapl_psys_unit), - - EVENT_PTR(rapl_cores_scale), EVENT_PTR(rapl_pkg_scale), - EVENT_PTR(rapl_gpu_scale), - EVENT_PTR(rapl_ram_scale), - EVENT_PTR(rapl_psys_scale), NULL, }; -static struct attribute *rapl_events_knl_attr[] = { - EVENT_PTR(rapl_pkg), - EVENT_PTR(rapl_ram), +static struct attribute_group rapl_events_pkg_group = { + .name = "events", + .attrs = rapl_events_pkg, +}; - EVENT_PTR(rapl_pkg_unit), +static struct attribute *rapl_events_ram[] = { + EVENT_PTR(rapl_ram), EVENT_PTR(rapl_ram_unit), - - EVENT_PTR(rapl_pkg_scale), EVENT_PTR(rapl_ram_scale), NULL, }; -static struct attribute_group rapl_pmu_events_group = { - .name = "events", - .attrs = NULL, /* patched at runtime */ +static struct attribute_group rapl_events_ram_group = { + .name = "events", + .attrs = rapl_events_ram, }; -DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); -static struct attribute *rapl_formats_attr[] = { - &format_attr_event.attr, +static struct attribute *rapl_events_gpu[] = { + EVENT_PTR(rapl_gpu), + EVENT_PTR(rapl_gpu_unit), + EVENT_PTR(rapl_gpu_scale), NULL, }; -static struct attribute_group rapl_pmu_format_group = { - .name = "format", - .attrs = rapl_formats_attr, +static struct attribute_group rapl_events_gpu_group = { + .name = "events", + .attrs = rapl_events_gpu, }; -static const struct attribute_group *rapl_attr_groups[] = { - &rapl_pmu_attr_group, - &rapl_pmu_format_group, - &rapl_pmu_events_group, +static struct attribute *rapl_events_psys[] = { + EVENT_PTR(rapl_psys), + EVENT_PTR(rapl_psys_unit), + EVENT_PTR(rapl_psys_scale), NULL, }; +static struct attribute_group rapl_events_psys_group = { + .name = "events", + .attrs = rapl_events_psys, +}; + +static bool test_msr(int idx, void *data) +{ + return test_bit(idx, (unsigned long *) data); +} + +static struct perf_msr rapl_msrs[] = { + [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr }, + [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, + [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr }, + [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr }, + [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr }, +}; + static int rapl_cpu_offline(unsigned int cpu) { struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); @@ -572,7 +532,7 @@ static int rapl_cpu_offline(unsigned int cpu) pmu->cpu = -1; /* Find a new cpu to collect rapl events */ - target = cpumask_any_but(topology_core_cpumask(cpu), cpu); + target = cpumask_any_but(topology_die_cpumask(cpu), cpu); /* Migrate rapl events to the new target */ if (target < nr_cpu_ids) { @@ -599,14 +559,14 @@ static int rapl_cpu_online(unsigned int cpu) pmu->timer_interval = ms_to_ktime(rapl_timer_ms); rapl_hrtimer_init(pmu); - rapl_pmus->pmus[topology_logical_package_id(cpu)] = pmu; + rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu; } /* * Check if there is an online cpu in the package which collects rapl * events already. */ - target = cpumask_any_and(&rapl_cpu_mask, topology_core_cpumask(cpu)); + target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu)); if (target < nr_cpu_ids) return 0; @@ -633,7 +593,7 @@ static int rapl_check_hw_unit(bool apply_quirk) * of 2. Datasheet, September 2014, Reference Number: 330784-001 " */ if (apply_quirk) - rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; + rapl_hw_unit[PERF_RAPL_RAM] = 16; /* * Calculate the timer rate: @@ -669,23 +629,33 @@ static void cleanup_rapl_pmus(void) { int i; - for (i = 0; i < rapl_pmus->maxpkg; i++) + for (i = 0; i < rapl_pmus->maxdie; i++) kfree(rapl_pmus->pmus[i]); kfree(rapl_pmus); } +const struct attribute_group *rapl_attr_update[] = { + &rapl_events_cores_group, + &rapl_events_pkg_group, + &rapl_events_ram_group, + &rapl_events_gpu_group, + &rapl_events_gpu_group, + NULL, +}; + static int __init init_rapl_pmus(void) { - int maxpkg = topology_max_packages(); + int maxdie = topology_max_packages() * topology_max_die_per_package(); size_t size; - size = sizeof(*rapl_pmus) + maxpkg * sizeof(struct rapl_pmu *); + size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *); rapl_pmus = kzalloc(size, GFP_KERNEL); if (!rapl_pmus) return -ENOMEM; - rapl_pmus->maxpkg = maxpkg; + rapl_pmus->maxdie = maxdie; rapl_pmus->pmu.attr_groups = rapl_attr_groups; + rapl_pmus->pmu.attr_update = rapl_attr_update; rapl_pmus->pmu.task_ctx_nr = perf_invalid_context; rapl_pmus->pmu.event_init = rapl_pmu_event_init; rapl_pmus->pmu.add = rapl_pmu_event_add; @@ -701,105 +671,96 @@ static int __init init_rapl_pmus(void) #define X86_RAPL_MODEL_MATCH(model, init) \ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init } -struct intel_rapl_init_fun { - bool apply_quirk; - int cntr_mask; - struct attribute **attrs; -}; - -static const struct intel_rapl_init_fun snb_rapl_init __initconst = { - .apply_quirk = false, - .cntr_mask = RAPL_IDX_CLN, - .attrs = rapl_events_cln_attr, +static struct rapl_model model_snb = { + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_PP1), + .apply_quirk = false, }; -static const struct intel_rapl_init_fun hsx_rapl_init __initconst = { - .apply_quirk = true, - .cntr_mask = RAPL_IDX_SRV, - .attrs = rapl_events_srv_attr, +static struct rapl_model model_snbep = { + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .apply_quirk = false, }; -static const struct intel_rapl_init_fun hsw_rapl_init __initconst = { - .apply_quirk = false, - .cntr_mask = RAPL_IDX_HSW, - .attrs = rapl_events_hsw_attr, +static struct rapl_model model_hsw = { + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1), + .apply_quirk = false, }; -static const struct intel_rapl_init_fun snbep_rapl_init __initconst = { - .apply_quirk = false, - .cntr_mask = RAPL_IDX_SRV, - .attrs = rapl_events_srv_attr, +static struct rapl_model model_hsx = { + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .apply_quirk = true, }; -static const struct intel_rapl_init_fun knl_rapl_init __initconst = { - .apply_quirk = true, - .cntr_mask = RAPL_IDX_KNL, - .attrs = rapl_events_knl_attr, +static struct rapl_model model_knl = { + .events = BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .apply_quirk = true, }; -static const struct intel_rapl_init_fun skl_rapl_init __initconst = { - .apply_quirk = false, - .cntr_mask = RAPL_IDX_SKL_CLN, - .attrs = rapl_events_skl_attr, +static struct rapl_model model_skl = { + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1) | + BIT(PERF_RAPL_PSYS), + .apply_quirk = false, }; -static const struct x86_cpu_id rapl_cpu_match[] __initconst = { - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, snb_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, snbep_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, snb_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, snbep_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hsx_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, hsw_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsx_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsx_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, knl_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, skl_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, hsx_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_MOBILE, skl_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, hsw_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, hsw_rapl_init), - - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, skl_rapl_init), +static const struct x86_cpu_id rapl_model_match[] __initconst = { + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, model_snb), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, model_snbep), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, model_snb), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, model_snbep), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_CORE, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, model_hsx), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_ULT, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_GT3E, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, model_hsx), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, model_hsx), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, model_knl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, model_knl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, model_hsx), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_MOBILE, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, model_hsw), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, model_skl), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_DESKTOP, model_skl), {}, }; -MODULE_DEVICE_TABLE(x86cpu, rapl_cpu_match); +MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); static int __init rapl_pmu_init(void) { const struct x86_cpu_id *id; - struct intel_rapl_init_fun *rapl_init; - bool apply_quirk; + struct rapl_model *rm; int ret; - id = x86_match_cpu(rapl_cpu_match); + id = x86_match_cpu(rapl_model_match); if (!id) return -ENODEV; - rapl_init = (struct intel_rapl_init_fun *)id->driver_data; - apply_quirk = rapl_init->apply_quirk; - rapl_cntr_mask = rapl_init->cntr_mask; - rapl_pmu_events_group.attrs = rapl_init->attrs; + rm = (struct rapl_model *) id->driver_data; + rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, + false, (void *) &rm->events); - ret = rapl_check_hw_unit(apply_quirk); + ret = rapl_check_hw_unit(rm->apply_quirk); if (ret) return ret; diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 9e3fbd47cb56..3694a5d0703d 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -8,6 +8,7 @@ static struct intel_uncore_type *empty_uncore[] = { NULL, }; struct intel_uncore_type **uncore_msr_uncores = empty_uncore; struct intel_uncore_type **uncore_pci_uncores = empty_uncore; +struct intel_uncore_type **uncore_mmio_uncores = empty_uncore; static bool pcidrv_registered; struct pci_driver *uncore_pci_driver; @@ -15,7 +16,7 @@ struct pci_driver *uncore_pci_driver; DEFINE_RAW_SPINLOCK(pci2phy_map_lock); struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); struct pci_extra_dev *uncore_extra_pci_dev; -static int max_packages; +static int max_dies; /* mask of cpus that collect uncore events */ static cpumask_t uncore_cpu_mask; @@ -28,7 +29,7 @@ struct event_constraint uncore_constraint_empty = MODULE_LICENSE("GPL"); -static int uncore_pcibus_to_physid(struct pci_bus *bus) +int uncore_pcibus_to_physid(struct pci_bus *bus) { struct pci2phy_map *map; int phys_id = -1; @@ -101,13 +102,13 @@ ssize_t uncore_event_show(struct kobject *kobj, struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) { - unsigned int pkgid = topology_logical_package_id(cpu); + unsigned int dieid = topology_logical_die_id(cpu); /* * The unsigned check also catches the '-1' return value for non * existent mappings in the topology map. */ - return pkgid < max_packages ? pmu->boxes[pkgid] : NULL; + return dieid < max_dies ? pmu->boxes[dieid] : NULL; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -119,6 +120,21 @@ u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *eve return count; } +void uncore_mmio_exit_box(struct intel_uncore_box *box) +{ + if (box->io_addr) + iounmap(box->io_addr); +} + +u64 uncore_mmio_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + if (!box->io_addr) + return 0; + + return readq(box->io_addr + event->hw.event_base); +} + /* * generic get constraint function for shared match/mask registers. */ @@ -312,7 +328,7 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, uncore_pmu_init_hrtimer(box); box->cpu = -1; box->pci_phys_id = -1; - box->pkgid = -1; + box->dieid = -1; /* set default hrtimer timeout */ box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL; @@ -827,10 +843,10 @@ static void uncore_pmu_unregister(struct intel_uncore_pmu *pmu) static void uncore_free_boxes(struct intel_uncore_pmu *pmu) { - int pkg; + int die; - for (pkg = 0; pkg < max_packages; pkg++) - kfree(pmu->boxes[pkg]); + for (die = 0; die < max_dies; die++) + kfree(pmu->boxes[die]); kfree(pmu->boxes); } @@ -867,7 +883,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) if (!pmus) return -ENOMEM; - size = max_packages * sizeof(struct intel_uncore_box *); + size = max_dies * sizeof(struct intel_uncore_box *); for (i = 0; i < type->num_boxes; i++) { pmus[i].func_id = setid ? i : -1; @@ -937,20 +953,21 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id struct intel_uncore_type *type; struct intel_uncore_pmu *pmu = NULL; struct intel_uncore_box *box; - int phys_id, pkg, ret; + int phys_id, die, ret; phys_id = uncore_pcibus_to_physid(pdev->bus); if (phys_id < 0) return -ENODEV; - pkg = topology_phys_to_logical_pkg(phys_id); - if (pkg < 0) + die = (topology_max_die_per_package() > 1) ? phys_id : + topology_phys_to_logical_pkg(phys_id); + if (die < 0) return -EINVAL; if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { int idx = UNCORE_PCI_DEV_IDX(id->driver_data); - uncore_extra_pci_dev[pkg].dev[idx] = pdev; + uncore_extra_pci_dev[die].dev[idx] = pdev; pci_set_drvdata(pdev, NULL); return 0; } @@ -989,7 +1006,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; } - if (WARN_ON_ONCE(pmu->boxes[pkg] != NULL)) + if (WARN_ON_ONCE(pmu->boxes[die] != NULL)) return -EINVAL; box = uncore_alloc_box(type, NUMA_NO_NODE); @@ -1003,13 +1020,13 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id atomic_inc(&box->refcnt); box->pci_phys_id = phys_id; - box->pkgid = pkg; + box->dieid = die; box->pci_dev = pdev; box->pmu = pmu; uncore_box_init(box); pci_set_drvdata(pdev, box); - pmu->boxes[pkg] = box; + pmu->boxes[die] = box; if (atomic_inc_return(&pmu->activeboxes) > 1) return 0; @@ -1017,7 +1034,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id ret = uncore_pmu_register(pmu); if (ret) { pci_set_drvdata(pdev, NULL); - pmu->boxes[pkg] = NULL; + pmu->boxes[die] = NULL; uncore_box_exit(box); kfree(box); } @@ -1028,16 +1045,17 @@ static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box; struct intel_uncore_pmu *pmu; - int i, phys_id, pkg; + int i, phys_id, die; phys_id = uncore_pcibus_to_physid(pdev->bus); box = pci_get_drvdata(pdev); if (!box) { - pkg = topology_phys_to_logical_pkg(phys_id); + die = (topology_max_die_per_package() > 1) ? phys_id : + topology_phys_to_logical_pkg(phys_id); for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { - if (uncore_extra_pci_dev[pkg].dev[i] == pdev) { - uncore_extra_pci_dev[pkg].dev[i] = NULL; + if (uncore_extra_pci_dev[die].dev[i] == pdev) { + uncore_extra_pci_dev[die].dev[i] = NULL; break; } } @@ -1050,7 +1068,7 @@ static void uncore_pci_remove(struct pci_dev *pdev) return; pci_set_drvdata(pdev, NULL); - pmu->boxes[box->pkgid] = NULL; + pmu->boxes[box->dieid] = NULL; if (atomic_dec_return(&pmu->activeboxes) == 0) uncore_pmu_unregister(pmu); uncore_box_exit(box); @@ -1062,7 +1080,7 @@ static int __init uncore_pci_init(void) size_t size; int ret; - size = max_packages * sizeof(struct pci_extra_dev); + size = max_dies * sizeof(struct pci_extra_dev); uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); if (!uncore_extra_pci_dev) { ret = -ENOMEM; @@ -1109,11 +1127,11 @@ static void uncore_change_type_ctx(struct intel_uncore_type *type, int old_cpu, { struct intel_uncore_pmu *pmu = type->pmus; struct intel_uncore_box *box; - int i, pkg; + int i, die; - pkg = topology_logical_package_id(old_cpu < 0 ? new_cpu : old_cpu); + die = topology_logical_die_id(old_cpu < 0 ? new_cpu : old_cpu); for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; + box = pmu->boxes[die]; if (!box) continue; @@ -1141,18 +1159,33 @@ static void uncore_change_context(struct intel_uncore_type **uncores, uncore_change_type_ctx(*uncores, old_cpu, new_cpu); } -static int uncore_event_cpu_offline(unsigned int cpu) +static void uncore_box_unref(struct intel_uncore_type **types, int id) { - struct intel_uncore_type *type, **types = uncore_msr_uncores; + struct intel_uncore_type *type; struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; - int i, pkg, target; + int i; + + for (; *types; types++) { + type = *types; + pmu = type->pmus; + for (i = 0; i < type->num_boxes; i++, pmu++) { + box = pmu->boxes[id]; + if (box && atomic_dec_return(&box->refcnt) == 0) + uncore_box_exit(box); + } + } +} + +static int uncore_event_cpu_offline(unsigned int cpu) +{ + int die, target; /* Check if exiting cpu is used for collecting uncore events */ if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) goto unref; /* Find a new cpu to collect uncore events */ - target = cpumask_any_but(topology_core_cpumask(cpu), cpu); + target = cpumask_any_but(topology_die_cpumask(cpu), cpu); /* Migrate uncore events to the new target */ if (target < nr_cpu_ids) @@ -1161,25 +1194,19 @@ static int uncore_event_cpu_offline(unsigned int cpu) target = -1; uncore_change_context(uncore_msr_uncores, cpu, target); + uncore_change_context(uncore_mmio_uncores, cpu, target); uncore_change_context(uncore_pci_uncores, cpu, target); unref: /* Clear the references */ - pkg = topology_logical_package_id(cpu); - for (; *types; types++) { - type = *types; - pmu = type->pmus; - for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; - if (box && atomic_dec_return(&box->refcnt) == 0) - uncore_box_exit(box); - } - } + die = topology_logical_die_id(cpu); + uncore_box_unref(uncore_msr_uncores, die); + uncore_box_unref(uncore_mmio_uncores, die); return 0; } static int allocate_boxes(struct intel_uncore_type **types, - unsigned int pkg, unsigned int cpu) + unsigned int die, unsigned int cpu) { struct intel_uncore_box *box, *tmp; struct intel_uncore_type *type; @@ -1192,20 +1219,20 @@ static int allocate_boxes(struct intel_uncore_type **types, type = *types; pmu = type->pmus; for (i = 0; i < type->num_boxes; i++, pmu++) { - if (pmu->boxes[pkg]) + if (pmu->boxes[die]) continue; box = uncore_alloc_box(type, cpu_to_node(cpu)); if (!box) goto cleanup; box->pmu = pmu; - box->pkgid = pkg; + box->dieid = die; list_add(&box->active_list, &allocated); } } /* Install them in the pmus */ list_for_each_entry_safe(box, tmp, &allocated, active_list) { list_del_init(&box->active_list); - box->pmu->boxes[pkg] = box; + box->pmu->boxes[die] = box; } return 0; @@ -1217,15 +1244,15 @@ cleanup: return -ENOMEM; } -static int uncore_event_cpu_online(unsigned int cpu) +static int uncore_box_ref(struct intel_uncore_type **types, + int id, unsigned int cpu) { - struct intel_uncore_type *type, **types = uncore_msr_uncores; + struct intel_uncore_type *type; struct intel_uncore_pmu *pmu; struct intel_uncore_box *box; - int i, ret, pkg, target; + int i, ret; - pkg = topology_logical_package_id(cpu); - ret = allocate_boxes(types, pkg, cpu); + ret = allocate_boxes(types, id, cpu); if (ret) return ret; @@ -1233,23 +1260,38 @@ static int uncore_event_cpu_online(unsigned int cpu) type = *types; pmu = type->pmus; for (i = 0; i < type->num_boxes; i++, pmu++) { - box = pmu->boxes[pkg]; + box = pmu->boxes[id]; if (box && atomic_inc_return(&box->refcnt) == 1) uncore_box_init(box); } } + return 0; +} + +static int uncore_event_cpu_online(unsigned int cpu) +{ + int die, target, msr_ret, mmio_ret; + + die = topology_logical_die_id(cpu); + msr_ret = uncore_box_ref(uncore_msr_uncores, die, cpu); + mmio_ret = uncore_box_ref(uncore_mmio_uncores, die, cpu); + if (msr_ret && mmio_ret) + return -ENOMEM; /* * Check if there is an online cpu in the package * which collects uncore events already. */ - target = cpumask_any_and(&uncore_cpu_mask, topology_core_cpumask(cpu)); + target = cpumask_any_and(&uncore_cpu_mask, topology_die_cpumask(cpu)); if (target < nr_cpu_ids) return 0; cpumask_set_cpu(cpu, &uncore_cpu_mask); - uncore_change_context(uncore_msr_uncores, -1, cpu); + if (!msr_ret) + uncore_change_context(uncore_msr_uncores, -1, cpu); + if (!mmio_ret) + uncore_change_context(uncore_mmio_uncores, -1, cpu); uncore_change_context(uncore_pci_uncores, -1, cpu); return 0; } @@ -1297,12 +1339,35 @@ err: return ret; } +static int __init uncore_mmio_init(void) +{ + struct intel_uncore_type **types = uncore_mmio_uncores; + int ret; + + ret = uncore_types_init(types, true); + if (ret) + goto err; + + for (; *types; types++) { + ret = type_pmu_register(*types); + if (ret) + goto err; + } + return 0; +err: + uncore_types_exit(uncore_mmio_uncores); + uncore_mmio_uncores = empty_uncore; + return ret; +} + + #define X86_UNCORE_MODEL_MATCH(model, init) \ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init } struct intel_uncore_init_fun { void (*cpu_init)(void); int (*pci_init)(void); + void (*mmio_init)(void); }; static const struct intel_uncore_init_fun nhm_uncore_init __initconst = { @@ -1373,6 +1438,12 @@ static const struct intel_uncore_init_fun icl_uncore_init __initconst = { .pci_init = skl_uncore_pci_init, }; +static const struct intel_uncore_init_fun snr_uncore_init __initconst = { + .cpu_init = snr_uncore_cpu_init, + .pci_init = snr_uncore_pci_init, + .mmio_init = snr_uncore_mmio_init, +}; + static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP, nhm_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM, nhm_uncore_init), @@ -1400,6 +1471,9 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, icl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_NNPI, icl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_DESKTOP, icl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ATOM_TREMONT_X, snr_uncore_init), {}, }; @@ -1409,7 +1483,7 @@ static int __init intel_uncore_init(void) { const struct x86_cpu_id *id; struct intel_uncore_init_fun *uncore_init; - int pret = 0, cret = 0, ret; + int pret = 0, cret = 0, mret = 0, ret; id = x86_match_cpu(intel_uncore_match); if (!id) @@ -1418,7 +1492,7 @@ static int __init intel_uncore_init(void) if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return -ENODEV; - max_packages = topology_max_packages(); + max_dies = topology_max_packages() * topology_max_die_per_package(); uncore_init = (struct intel_uncore_init_fun *)id->driver_data; if (uncore_init->pci_init) { @@ -1432,7 +1506,12 @@ static int __init intel_uncore_init(void) cret = uncore_cpu_init(); } - if (cret && pret) + if (uncore_init->mmio_init) { + uncore_init->mmio_init(); + mret = uncore_mmio_init(); + } + + if (cret && pret && mret) return -ENODEV; /* Install hotplug callbacks to setup the targets for each package */ @@ -1446,6 +1525,7 @@ static int __init intel_uncore_init(void) err: uncore_types_exit(uncore_msr_uncores); + uncore_types_exit(uncore_mmio_uncores); uncore_pci_exit(); return ret; } @@ -1455,6 +1535,7 @@ static void __exit intel_uncore_exit(void) { cpuhp_remove_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE); uncore_types_exit(uncore_msr_uncores); + uncore_types_exit(uncore_mmio_uncores); uncore_pci_exit(); } module_exit(intel_uncore_exit); diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 79eb2e21e4f0..f36f7bebbc1b 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -2,6 +2,7 @@ #include <linux/slab.h> #include <linux/pci.h> #include <asm/apicdef.h> +#include <linux/io-64-nonatomic-lo-hi.h> #include <linux/perf_event.h> #include "../perf_event.h" @@ -56,7 +57,10 @@ struct intel_uncore_type { unsigned fixed_ctr; unsigned fixed_ctl; unsigned box_ctl; - unsigned msr_offset; + union { + unsigned msr_offset; + unsigned mmio_offset; + }; unsigned num_shared_regs:8; unsigned single_fixed:1; unsigned pair_ctr_ctl:1; @@ -108,7 +112,7 @@ struct intel_uncore_extra_reg { struct intel_uncore_box { int pci_phys_id; - int pkgid; /* Logical package ID */ + int dieid; /* Logical die ID */ int n_active; /* number of active events */ int n_events; int cpu; /* cpu to collect events */ @@ -125,7 +129,7 @@ struct intel_uncore_box { struct hrtimer hrtimer; struct list_head list; struct list_head active_list; - void *io_addr; + void __iomem *io_addr; struct intel_uncore_extra_reg shared_regs[0]; }; @@ -159,6 +163,7 @@ struct pci2phy_map { }; struct pci2phy_map *__find_pci2phy_map(int segment); +int uncore_pcibus_to_physid(struct pci_bus *bus); ssize_t uncore_event_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); @@ -190,6 +195,13 @@ static inline bool uncore_pmc_freerunning(int idx) return idx == UNCORE_PMC_IDX_FREERUNNING; } +static inline +unsigned int uncore_mmio_box_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->box_ctl + + box->pmu->type->mmio_offset * box->pmu->pmu_idx; +} + static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box) { return box->pmu->type->box_ctl; @@ -330,7 +342,7 @@ unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) static inline unsigned uncore_fixed_ctl(struct intel_uncore_box *box) { - if (box->pci_dev) + if (box->pci_dev || box->io_addr) return uncore_pci_fixed_ctl(box); else return uncore_msr_fixed_ctl(box); @@ -339,7 +351,7 @@ unsigned uncore_fixed_ctl(struct intel_uncore_box *box) static inline unsigned uncore_fixed_ctr(struct intel_uncore_box *box) { - if (box->pci_dev) + if (box->pci_dev || box->io_addr) return uncore_pci_fixed_ctr(box); else return uncore_msr_fixed_ctr(box); @@ -348,7 +360,7 @@ unsigned uncore_fixed_ctr(struct intel_uncore_box *box) static inline unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx) { - if (box->pci_dev) + if (box->pci_dev || box->io_addr) return uncore_pci_event_ctl(box, idx); else return uncore_msr_event_ctl(box, idx); @@ -357,7 +369,7 @@ unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx) static inline unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx) { - if (box->pci_dev) + if (box->pci_dev || box->io_addr) return uncore_pci_perf_ctr(box, idx); else return uncore_msr_perf_ctr(box, idx); @@ -419,6 +431,16 @@ static inline bool is_freerunning_event(struct perf_event *event) (((cfg >> 8) & 0xff) >= UNCORE_FREERUNNING_UMASK_START); } +/* Check and reject invalid config */ +static inline int uncore_freerunning_hw_config(struct intel_uncore_box *box, + struct perf_event *event) +{ + if (is_freerunning_event(event)) + return 0; + + return -EINVAL; +} + static inline void uncore_disable_box(struct intel_uncore_box *box) { if (box->pmu->type->ops->disable_box) @@ -467,7 +489,7 @@ static inline void uncore_box_exit(struct intel_uncore_box *box) static inline bool uncore_box_is_fake(struct intel_uncore_box *box) { - return (box->pkgid < 0); + return (box->dieid < 0); } static inline struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) @@ -482,6 +504,9 @@ static inline struct intel_uncore_box *uncore_event_to_box(struct perf_event *ev struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu); u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event); +void uncore_mmio_exit_box(struct intel_uncore_box *box); +u64 uncore_mmio_read_counter(struct intel_uncore_box *box, + struct perf_event *event); void uncore_pmu_start_hrtimer(struct intel_uncore_box *box); void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box); void uncore_pmu_event_start(struct perf_event *event, int flags); @@ -497,6 +522,7 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx); extern struct intel_uncore_type **uncore_msr_uncores; extern struct intel_uncore_type **uncore_pci_uncores; +extern struct intel_uncore_type **uncore_mmio_uncores; extern struct pci_driver *uncore_pci_driver; extern raw_spinlock_t pci2phy_map_lock; extern struct list_head pci2phy_map_head; @@ -528,6 +554,9 @@ int knl_uncore_pci_init(void); void knl_uncore_cpu_init(void); int skx_uncore_pci_init(void); void skx_uncore_cpu_init(void); +int snr_uncore_pci_init(void); +void snr_uncore_cpu_init(void); +void snr_uncore_mmio_init(void); /* uncore_nhmex.c */ void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index f8431819b3e1..dbaa1b088a30 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -3,27 +3,29 @@ #include "uncore.h" /* Uncore IMC PCI IDs */ -#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 -#define PCI_DEVICE_ID_INTEL_IVB_IMC 0x0154 -#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150 -#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 -#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 -#define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 -#define PCI_DEVICE_ID_INTEL_SKL_U_IMC 0x1904 -#define PCI_DEVICE_ID_INTEL_SKL_Y_IMC 0x190c -#define PCI_DEVICE_ID_INTEL_SKL_HD_IMC 0x1900 -#define PCI_DEVICE_ID_INTEL_SKL_HQ_IMC 0x1910 -#define PCI_DEVICE_ID_INTEL_SKL_SD_IMC 0x190f -#define PCI_DEVICE_ID_INTEL_SKL_SQ_IMC 0x191f -#define PCI_DEVICE_ID_INTEL_KBL_Y_IMC 0x590c -#define PCI_DEVICE_ID_INTEL_KBL_U_IMC 0x5904 -#define PCI_DEVICE_ID_INTEL_KBL_UQ_IMC 0x5914 -#define PCI_DEVICE_ID_INTEL_KBL_SD_IMC 0x590f -#define PCI_DEVICE_ID_INTEL_KBL_SQ_IMC 0x591f -#define PCI_DEVICE_ID_INTEL_CFL_2U_IMC 0x3ecc -#define PCI_DEVICE_ID_INTEL_CFL_4U_IMC 0x3ed0 -#define PCI_DEVICE_ID_INTEL_CFL_4H_IMC 0x3e10 -#define PCI_DEVICE_ID_INTEL_CFL_6H_IMC 0x3ec4 +#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 +#define PCI_DEVICE_ID_INTEL_IVB_IMC 0x0154 +#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150 +#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 +#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 +#define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 +#define PCI_DEVICE_ID_INTEL_SKL_U_IMC 0x1904 +#define PCI_DEVICE_ID_INTEL_SKL_Y_IMC 0x190c +#define PCI_DEVICE_ID_INTEL_SKL_HD_IMC 0x1900 +#define PCI_DEVICE_ID_INTEL_SKL_HQ_IMC 0x1910 +#define PCI_DEVICE_ID_INTEL_SKL_SD_IMC 0x190f +#define PCI_DEVICE_ID_INTEL_SKL_SQ_IMC 0x191f +#define PCI_DEVICE_ID_INTEL_KBL_Y_IMC 0x590c +#define PCI_DEVICE_ID_INTEL_KBL_U_IMC 0x5904 +#define PCI_DEVICE_ID_INTEL_KBL_UQ_IMC 0x5914 +#define PCI_DEVICE_ID_INTEL_KBL_SD_IMC 0x590f +#define PCI_DEVICE_ID_INTEL_KBL_SQ_IMC 0x591f +#define PCI_DEVICE_ID_INTEL_KBL_HQ_IMC 0x5910 +#define PCI_DEVICE_ID_INTEL_KBL_WQ_IMC 0x5918 +#define PCI_DEVICE_ID_INTEL_CFL_2U_IMC 0x3ecc +#define PCI_DEVICE_ID_INTEL_CFL_4U_IMC 0x3ed0 +#define PCI_DEVICE_ID_INTEL_CFL_4H_IMC 0x3e10 +#define PCI_DEVICE_ID_INTEL_CFL_6H_IMC 0x3ec4 #define PCI_DEVICE_ID_INTEL_CFL_2S_D_IMC 0x3e0f #define PCI_DEVICE_ID_INTEL_CFL_4S_D_IMC 0x3e1f #define PCI_DEVICE_ID_INTEL_CFL_6S_D_IMC 0x3ec2 @@ -34,9 +36,15 @@ #define PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC 0x3e33 #define PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC 0x3eca #define PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC 0x3e32 +#define PCI_DEVICE_ID_INTEL_AML_YD_IMC 0x590c +#define PCI_DEVICE_ID_INTEL_AML_YQ_IMC 0x590d +#define PCI_DEVICE_ID_INTEL_WHL_UQ_IMC 0x3ed0 +#define PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC 0x3e34 +#define PCI_DEVICE_ID_INTEL_WHL_UD_IMC 0x3e35 #define PCI_DEVICE_ID_INTEL_ICL_U_IMC 0x8a02 #define PCI_DEVICE_ID_INTEL_ICL_U2_IMC 0x8a12 + /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff #define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 @@ -420,11 +428,6 @@ static void snb_uncore_imc_init_box(struct intel_uncore_box *box) box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; } -static void snb_uncore_imc_exit_box(struct intel_uncore_box *box) -{ - iounmap(box->io_addr); -} - static void snb_uncore_imc_enable_box(struct intel_uncore_box *box) {} @@ -437,13 +440,6 @@ static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct per static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event) {} -static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - return (u64)*(unsigned int *)(box->io_addr + hwc->event_base); -} - /* * Keep the custom event_init() function compatible with old event * encoding for free running counters. @@ -570,13 +566,13 @@ static struct pmu snb_uncore_imc_pmu = { static struct intel_uncore_ops snb_uncore_imc_ops = { .init_box = snb_uncore_imc_init_box, - .exit_box = snb_uncore_imc_exit_box, + .exit_box = uncore_mmio_exit_box, .enable_box = snb_uncore_imc_enable_box, .disable_box = snb_uncore_imc_disable_box, .disable_event = snb_uncore_imc_disable_event, .enable_event = snb_uncore_imc_enable_event, .hw_config = snb_uncore_imc_hw_config, - .read_counter = snb_uncore_imc_read_counter, + .read_counter = uncore_mmio_read_counter, }; static struct intel_uncore_type snb_uncore_imc = { @@ -682,6 +678,14 @@ static const struct pci_device_id skl_uncore_pci_ids[] = { .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_HQ_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_WQ_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2U_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, @@ -737,6 +741,26 @@ static const struct pci_device_id skl_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AML_YD_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AML_YQ_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UQ_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UD_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, { /* end: all zeroes */ }, }; @@ -807,6 +831,8 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(KBL_UQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core U Quad Core */ IMC_DEV(KBL_SD_IMC, &skl_uncore_pci_driver), /* 7th Gen Core S Dual Core */ IMC_DEV(KBL_SQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core S Quad Core */ + IMC_DEV(KBL_HQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core H Quad Core */ + IMC_DEV(KBL_WQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core S 4 cores Work Station */ IMC_DEV(CFL_2U_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U 2 Cores */ IMC_DEV(CFL_4U_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U 4 Cores */ IMC_DEV(CFL_4H_IMC, &skl_uncore_pci_driver), /* 8th Gen Core H 4 Cores */ @@ -821,6 +847,11 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(CFL_4S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Server */ IMC_DEV(CFL_6S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Server */ IMC_DEV(CFL_8S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Server */ + IMC_DEV(AML_YD_IMC, &skl_uncore_pci_driver), /* 8th Gen Core Y Mobile Dual Core */ + IMC_DEV(AML_YQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core Y Mobile Quad Core */ + IMC_DEV(WHL_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */ + IMC_DEV(WHL_4_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */ + IMC_DEV(WHL_UD_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Dual Core */ IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ { /* end marker */ } diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index b10e04387f38..b10a5ec79e48 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -324,12 +324,77 @@ #define SKX_M2M_PCI_PMON_CTR0 0x200 #define SKX_M2M_PCI_PMON_BOX_CTL 0x258 +/* SNR Ubox */ +#define SNR_U_MSR_PMON_CTR0 0x1f98 +#define SNR_U_MSR_PMON_CTL0 0x1f91 +#define SNR_U_MSR_PMON_UCLK_FIXED_CTL 0x1f93 +#define SNR_U_MSR_PMON_UCLK_FIXED_CTR 0x1f94 + +/* SNR CHA */ +#define SNR_CHA_RAW_EVENT_MASK_EXT 0x3ffffff +#define SNR_CHA_MSR_PMON_CTL0 0x1c01 +#define SNR_CHA_MSR_PMON_CTR0 0x1c08 +#define SNR_CHA_MSR_PMON_BOX_CTL 0x1c00 +#define SNR_C0_MSR_PMON_BOX_FILTER0 0x1c05 + + +/* SNR IIO */ +#define SNR_IIO_MSR_PMON_CTL0 0x1e08 +#define SNR_IIO_MSR_PMON_CTR0 0x1e01 +#define SNR_IIO_MSR_PMON_BOX_CTL 0x1e00 +#define SNR_IIO_MSR_OFFSET 0x10 +#define SNR_IIO_PMON_RAW_EVENT_MASK_EXT 0x7ffff + +/* SNR IRP */ +#define SNR_IRP0_MSR_PMON_CTL0 0x1ea8 +#define SNR_IRP0_MSR_PMON_CTR0 0x1ea1 +#define SNR_IRP0_MSR_PMON_BOX_CTL 0x1ea0 +#define SNR_IRP_MSR_OFFSET 0x10 + +/* SNR M2PCIE */ +#define SNR_M2PCIE_MSR_PMON_CTL0 0x1e58 +#define SNR_M2PCIE_MSR_PMON_CTR0 0x1e51 +#define SNR_M2PCIE_MSR_PMON_BOX_CTL 0x1e50 +#define SNR_M2PCIE_MSR_OFFSET 0x10 + +/* SNR PCU */ +#define SNR_PCU_MSR_PMON_CTL0 0x1ef1 +#define SNR_PCU_MSR_PMON_CTR0 0x1ef8 +#define SNR_PCU_MSR_PMON_BOX_CTL 0x1ef0 +#define SNR_PCU_MSR_PMON_BOX_FILTER 0x1efc + +/* SNR M2M */ +#define SNR_M2M_PCI_PMON_CTL0 0x468 +#define SNR_M2M_PCI_PMON_CTR0 0x440 +#define SNR_M2M_PCI_PMON_BOX_CTL 0x438 +#define SNR_M2M_PCI_PMON_UMASK_EXT 0xff + +/* SNR PCIE3 */ +#define SNR_PCIE3_PCI_PMON_CTL0 0x508 +#define SNR_PCIE3_PCI_PMON_CTR0 0x4e8 +#define SNR_PCIE3_PCI_PMON_BOX_CTL 0x4e4 + +/* SNR IMC */ +#define SNR_IMC_MMIO_PMON_FIXED_CTL 0x54 +#define SNR_IMC_MMIO_PMON_FIXED_CTR 0x38 +#define SNR_IMC_MMIO_PMON_CTL0 0x40 +#define SNR_IMC_MMIO_PMON_CTR0 0x8 +#define SNR_IMC_MMIO_PMON_BOX_CTL 0x22800 +#define SNR_IMC_MMIO_OFFSET 0x4000 +#define SNR_IMC_MMIO_SIZE 0x4000 +#define SNR_IMC_MMIO_BASE_OFFSET 0xd0 +#define SNR_IMC_MMIO_BASE_MASK 0x1FFFFFFF +#define SNR_IMC_MMIO_MEM0_OFFSET 0xd8 +#define SNR_IMC_MMIO_MEM0_MASK 0x7FF + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6"); DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-43,45-55"); +DEFINE_UNCORE_FORMAT_ATTR(umask_ext2, umask, "config:8-15,32-57"); +DEFINE_UNCORE_FORMAT_ATTR(umask_ext3, umask, "config:8-15,32-39"); DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); @@ -343,11 +408,14 @@ DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31"); DEFINE_UNCORE_FORMAT_ATTR(ch_mask, ch_mask, "config:36-43"); +DEFINE_UNCORE_FORMAT_ATTR(ch_mask2, ch_mask, "config:36-47"); DEFINE_UNCORE_FORMAT_ATTR(fc_mask, fc_mask, "config:44-46"); +DEFINE_UNCORE_FORMAT_ATTR(fc_mask2, fc_mask, "config:48-50"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid4, filter_tid, "config1:0-8"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid5, filter_tid, "config1:0-9"); DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5"); DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8"); DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8"); @@ -1058,8 +1126,8 @@ static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_eve if (reg1->idx != EXTRA_REG_NONE) { int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER; - int pkg = box->pkgid; - struct pci_dev *filter_pdev = uncore_extra_pci_dev[pkg].dev[idx]; + int die = box->dieid; + struct pci_dev *filter_pdev = uncore_extra_pci_dev[die].dev[idx]; if (filter_pdev) { pci_write_config_dword(filter_pdev, reg1->reg, @@ -3585,6 +3653,7 @@ static struct uncore_event_desc skx_uncore_iio_freerunning_events[] = { static struct intel_uncore_ops skx_uncore_iio_freerunning_ops = { .read_counter = uncore_msr_read_counter, + .hw_config = uncore_freerunning_hw_config, }; static struct attribute *skx_uncore_iio_freerunning_formats_attr[] = { @@ -3967,3 +4036,535 @@ int skx_uncore_pci_init(void) } /* end of SKX uncore support */ + +/* SNR uncore support */ + +static struct intel_uncore_type snr_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = SNR_U_MSR_PMON_CTR0, + .event_ctl = SNR_U_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .fixed_ctr = SNR_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = SNR_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_format_group, +}; + +static struct attribute *snr_uncore_cha_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask_ext2.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid5.attr, + NULL, +}; +static const struct attribute_group snr_uncore_chabox_format_group = { + .name = "format", + .attrs = snr_uncore_cha_formats_attr, +}; + +static int snr_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + + reg1->reg = SNR_C0_MSR_PMON_BOX_FILTER0 + + box->pmu->type->msr_offset * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & SKX_CHA_MSR_PMON_BOX_FILTER_TID; + reg1->idx = 0; + + return 0; +} + +static void snr_cha_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, reg1->config); + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops snr_uncore_chabox_ops = { + .init_box = ivbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = snr_cha_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = snr_cha_hw_config, +}; + +static struct intel_uncore_type snr_uncore_chabox = { + .name = "cha", + .num_counters = 4, + .num_boxes = 6, + .perf_ctr_bits = 48, + .event_ctl = SNR_CHA_MSR_PMON_CTL0, + .perf_ctr = SNR_CHA_MSR_PMON_CTR0, + .box_ctl = SNR_CHA_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_CBO_MSR_OFFSET, + .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, + .event_mask_ext = SNR_CHA_RAW_EVENT_MASK_EXT, + .ops = &snr_uncore_chabox_ops, + .format_group = &snr_uncore_chabox_format_group, +}; + +static struct attribute *snr_uncore_iio_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh9.attr, + &format_attr_ch_mask2.attr, + &format_attr_fc_mask2.attr, + NULL, +}; + +static const struct attribute_group snr_uncore_iio_format_group = { + .name = "format", + .attrs = snr_uncore_iio_formats_attr, +}; + +static struct intel_uncore_type snr_uncore_iio = { + .name = "iio", + .num_counters = 4, + .num_boxes = 5, + .perf_ctr_bits = 48, + .event_ctl = SNR_IIO_MSR_PMON_CTL0, + .perf_ctr = SNR_IIO_MSR_PMON_CTR0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT, + .box_ctl = SNR_IIO_MSR_PMON_BOX_CTL, + .msr_offset = SNR_IIO_MSR_OFFSET, + .ops = &ivbep_uncore_msr_ops, + .format_group = &snr_uncore_iio_format_group, +}; + +static struct intel_uncore_type snr_uncore_irp = { + .name = "irp", + .num_counters = 2, + .num_boxes = 5, + .perf_ctr_bits = 48, + .event_ctl = SNR_IRP0_MSR_PMON_CTL0, + .perf_ctr = SNR_IRP0_MSR_PMON_CTR0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNR_IRP0_MSR_PMON_BOX_CTL, + .msr_offset = SNR_IRP_MSR_OFFSET, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_format_group, +}; + +static struct intel_uncore_type snr_uncore_m2pcie = { + .name = "m2pcie", + .num_counters = 4, + .num_boxes = 5, + .perf_ctr_bits = 48, + .event_ctl = SNR_M2PCIE_MSR_PMON_CTL0, + .perf_ctr = SNR_M2PCIE_MSR_PMON_CTR0, + .box_ctl = SNR_M2PCIE_MSR_PMON_BOX_CTL, + .msr_offset = SNR_M2PCIE_MSR_OFFSET, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_format_group, +}; + +static int snr_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK; + + if (ev_sel >= 0xb && ev_sel <= 0xe) { + reg1->reg = SNR_PCU_MSR_PMON_BOX_FILTER; + reg1->idx = ev_sel - 0xb; + reg1->config = event->attr.config1 & (0xff << reg1->idx); + } + return 0; +} + +static struct intel_uncore_ops snr_uncore_pcu_ops = { + IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = snr_pcu_hw_config, + .get_constraint = snbep_pcu_get_constraint, + .put_constraint = snbep_pcu_put_constraint, +}; + +static struct intel_uncore_type snr_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNR_PCU_MSR_PMON_CTR0, + .event_ctl = SNR_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNR_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snr_uncore_pcu_ops, + .format_group = &skx_uncore_pcu_format_group, +}; + +enum perf_uncore_snr_iio_freerunning_type_id { + SNR_IIO_MSR_IOCLK, + SNR_IIO_MSR_BW_IN, + + SNR_IIO_FREERUNNING_TYPE_MAX, +}; + +static struct freerunning_counters snr_iio_freerunning[] = { + [SNR_IIO_MSR_IOCLK] = { 0x1eac, 0x1, 0x10, 1, 48 }, + [SNR_IIO_MSR_BW_IN] = { 0x1f00, 0x1, 0x10, 8, 48 }, +}; + +static struct uncore_event_desc snr_uncore_iio_freerunning_events[] = { + /* Free-Running IIO CLOCKS Counter */ + INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"), + /* Free-Running IIO BANDWIDTH IN Counters */ + INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"), + INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4, "event=0xff,umask=0x24"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5, "event=0xff,umask=0x25"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6, "event=0xff,umask=0x26"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port7, "event=0xff,umask=0x27"), + INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port7.unit, "MiB"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type snr_uncore_iio_free_running = { + .name = "iio_free_running", + .num_counters = 9, + .num_boxes = 5, + .num_freerunning_types = SNR_IIO_FREERUNNING_TYPE_MAX, + .freerunning = snr_iio_freerunning, + .ops = &skx_uncore_iio_freerunning_ops, + .event_descs = snr_uncore_iio_freerunning_events, + .format_group = &skx_uncore_iio_freerunning_format_group, +}; + +static struct intel_uncore_type *snr_msr_uncores[] = { + &snr_uncore_ubox, + &snr_uncore_chabox, + &snr_uncore_iio, + &snr_uncore_irp, + &snr_uncore_m2pcie, + &snr_uncore_pcu, + &snr_uncore_iio_free_running, + NULL, +}; + +void snr_uncore_cpu_init(void) +{ + uncore_msr_uncores = snr_msr_uncores; +} + +static void snr_m2m_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags); + pci_write_config_dword(pdev, box_ctl, IVBEP_PMON_BOX_CTL_INT); +} + +static struct intel_uncore_ops snr_m2m_uncore_pci_ops = { + .init_box = snr_m2m_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +static struct attribute *snr_m2m_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask_ext3.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static const struct attribute_group snr_m2m_uncore_format_group = { + .name = "format", + .attrs = snr_m2m_uncore_formats_attr, +}; + +static struct intel_uncore_type snr_uncore_m2m = { + .name = "m2m", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNR_M2M_PCI_PMON_CTR0, + .event_ctl = SNR_M2M_PCI_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .event_mask_ext = SNR_M2M_PCI_PMON_UMASK_EXT, + .box_ctl = SNR_M2M_PCI_PMON_BOX_CTL, + .ops = &snr_m2m_uncore_pci_ops, + .format_group = &snr_m2m_uncore_format_group, +}; + +static struct intel_uncore_type snr_uncore_pcie3 = { + .name = "pcie3", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNR_PCIE3_PCI_PMON_CTR0, + .event_ctl = SNR_PCIE3_PCI_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNR_PCIE3_PCI_PMON_BOX_CTL, + .ops = &ivbep_uncore_pci_ops, + .format_group = &ivbep_uncore_format_group, +}; + +enum { + SNR_PCI_UNCORE_M2M, + SNR_PCI_UNCORE_PCIE3, +}; + +static struct intel_uncore_type *snr_pci_uncores[] = { + [SNR_PCI_UNCORE_M2M] = &snr_uncore_m2m, + [SNR_PCI_UNCORE_PCIE3] = &snr_uncore_pcie3, + NULL, +}; + +static const struct pci_device_id snr_uncore_pci_ids[] = { + { /* M2M */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 0, SNR_PCI_UNCORE_M2M, 0), + }, + { /* PCIe3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x334a), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(4, 0, SNR_PCI_UNCORE_PCIE3, 0), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver snr_uncore_pci_driver = { + .name = "snr_uncore", + .id_table = snr_uncore_pci_ids, +}; + +int snr_uncore_pci_init(void) +{ + /* SNR UBOX DID */ + int ret = snbep_pci2phy_map_init(0x3460, SKX_CPUNODEID, + SKX_GIDNIDMAP, true); + + if (ret) + return ret; + + uncore_pci_uncores = snr_pci_uncores; + uncore_pci_driver = &snr_uncore_pci_driver; + return 0; +} + +static struct pci_dev *snr_uncore_get_mc_dev(int id) +{ + struct pci_dev *mc_dev = NULL; + int phys_id, pkg; + + while (1) { + mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3451, mc_dev); + if (!mc_dev) + break; + phys_id = uncore_pcibus_to_physid(mc_dev->bus); + if (phys_id < 0) + continue; + pkg = topology_phys_to_logical_pkg(phys_id); + if (pkg < 0) + continue; + else if (pkg == id) + break; + } + return mc_dev; +} + +static void snr_uncore_mmio_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = snr_uncore_get_mc_dev(box->dieid); + unsigned int box_ctl = uncore_mmio_box_ctl(box); + resource_size_t addr; + u32 pci_dword; + + if (!pdev) + return; + + pci_read_config_dword(pdev, SNR_IMC_MMIO_BASE_OFFSET, &pci_dword); + addr = (pci_dword & SNR_IMC_MMIO_BASE_MASK) << 23; + + pci_read_config_dword(pdev, SNR_IMC_MMIO_MEM0_OFFSET, &pci_dword); + addr |= (pci_dword & SNR_IMC_MMIO_MEM0_MASK) << 12; + + addr += box_ctl; + + box->io_addr = ioremap(addr, SNR_IMC_MMIO_SIZE); + if (!box->io_addr) + return; + + writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr); +} + +static void snr_uncore_mmio_disable_box(struct intel_uncore_box *box) +{ + u32 config; + + if (!box->io_addr) + return; + + config = readl(box->io_addr); + config |= SNBEP_PMON_BOX_CTL_FRZ; + writel(config, box->io_addr); +} + +static void snr_uncore_mmio_enable_box(struct intel_uncore_box *box) +{ + u32 config; + + if (!box->io_addr) + return; + + config = readl(box->io_addr); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + writel(config, box->io_addr); +} + +static void snr_uncore_mmio_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!box->io_addr) + return; + + writel(hwc->config | SNBEP_PMON_CTL_EN, + box->io_addr + hwc->config_base); +} + +static void snr_uncore_mmio_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!box->io_addr) + return; + + writel(hwc->config, box->io_addr + hwc->config_base); +} + +static struct intel_uncore_ops snr_uncore_mmio_ops = { + .init_box = snr_uncore_mmio_init_box, + .exit_box = uncore_mmio_exit_box, + .disable_box = snr_uncore_mmio_disable_box, + .enable_box = snr_uncore_mmio_enable_box, + .disable_event = snr_uncore_mmio_disable_event, + .enable_event = snr_uncore_mmio_enable_event, + .read_counter = uncore_mmio_read_counter, +}; + +static struct uncore_event_desc snr_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x00,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_read.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x30"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(cas_count_write.unit, "MiB"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type snr_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, + .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL, + .event_descs = snr_uncore_imc_events, + .perf_ctr = SNR_IMC_MMIO_PMON_CTR0, + .event_ctl = SNR_IMC_MMIO_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL, + .mmio_offset = SNR_IMC_MMIO_OFFSET, + .ops = &snr_uncore_mmio_ops, + .format_group = &skx_uncore_format_group, +}; + +enum perf_uncore_snr_imc_freerunning_type_id { + SNR_IMC_DCLK, + SNR_IMC_DDR, + + SNR_IMC_FREERUNNING_TYPE_MAX, +}; + +static struct freerunning_counters snr_imc_freerunning[] = { + [SNR_IMC_DCLK] = { 0x22b0, 0x0, 0, 1, 48 }, + [SNR_IMC_DDR] = { 0x2290, 0x8, 0, 2, 48 }, +}; + +static struct uncore_event_desc snr_uncore_imc_freerunning_events[] = { + INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"), + + INTEL_UNCORE_EVENT_DESC(read, "event=0xff,umask=0x20"), + INTEL_UNCORE_EVENT_DESC(read.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(read.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(write, "event=0xff,umask=0x21"), + INTEL_UNCORE_EVENT_DESC(write.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"), +}; + +static struct intel_uncore_ops snr_uncore_imc_freerunning_ops = { + .init_box = snr_uncore_mmio_init_box, + .exit_box = uncore_mmio_exit_box, + .read_counter = uncore_mmio_read_counter, + .hw_config = uncore_freerunning_hw_config, +}; + +static struct intel_uncore_type snr_uncore_imc_free_running = { + .name = "imc_free_running", + .num_counters = 3, + .num_boxes = 1, + .num_freerunning_types = SNR_IMC_FREERUNNING_TYPE_MAX, + .freerunning = snr_imc_freerunning, + .ops = &snr_uncore_imc_freerunning_ops, + .event_descs = snr_uncore_imc_freerunning_events, + .format_group = &skx_uncore_iio_freerunning_format_group, +}; + +static struct intel_uncore_type *snr_mmio_uncores[] = { + &snr_uncore_imc, + &snr_uncore_imc_free_running, + NULL, +}; + +void snr_uncore_mmio_init(void) +{ + uncore_mmio_uncores = snr_mmio_uncores; +} + +/* end of SNR uncore support */ diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index f3f4c2263501..9431447541e9 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/perf_event.h> +#include <linux/sysfs.h> #include <linux/nospec.h> #include <asm/intel-family.h> +#include "probe.h" enum perf_msr_id { PERF_MSR_TSC = 0, @@ -12,32 +14,30 @@ enum perf_msr_id { PERF_MSR_PTSC = 5, PERF_MSR_IRPERF = 6, PERF_MSR_THERM = 7, - PERF_MSR_THERM_SNAP = 8, - PERF_MSR_THERM_UNIT = 9, PERF_MSR_EVENT_MAX, }; -static bool test_aperfmperf(int idx) +static bool test_aperfmperf(int idx, void *data) { return boot_cpu_has(X86_FEATURE_APERFMPERF); } -static bool test_ptsc(int idx) +static bool test_ptsc(int idx, void *data) { return boot_cpu_has(X86_FEATURE_PTSC); } -static bool test_irperf(int idx) +static bool test_irperf(int idx, void *data) { return boot_cpu_has(X86_FEATURE_IRPERF); } -static bool test_therm_status(int idx) +static bool test_therm_status(int idx, void *data) { return boot_cpu_has(X86_FEATURE_DTHERM); } -static bool test_intel(int idx) +static bool test_intel(int idx, void *data) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || boot_cpu_data.x86 != 6) @@ -98,37 +98,51 @@ static bool test_intel(int idx) return false; } -struct perf_msr { - u64 msr; - struct perf_pmu_events_attr *attr; - bool (*test)(int idx); +PMU_EVENT_ATTR_STRING(tsc, attr_tsc, "event=0x00" ); +PMU_EVENT_ATTR_STRING(aperf, attr_aperf, "event=0x01" ); +PMU_EVENT_ATTR_STRING(mperf, attr_mperf, "event=0x02" ); +PMU_EVENT_ATTR_STRING(pperf, attr_pperf, "event=0x03" ); +PMU_EVENT_ATTR_STRING(smi, attr_smi, "event=0x04" ); +PMU_EVENT_ATTR_STRING(ptsc, attr_ptsc, "event=0x05" ); +PMU_EVENT_ATTR_STRING(irperf, attr_irperf, "event=0x06" ); +PMU_EVENT_ATTR_STRING(cpu_thermal_margin, attr_therm, "event=0x07" ); +PMU_EVENT_ATTR_STRING(cpu_thermal_margin.snapshot, attr_therm_snap, "1" ); +PMU_EVENT_ATTR_STRING(cpu_thermal_margin.unit, attr_therm_unit, "C" ); + +static unsigned long msr_mask; + +PMU_EVENT_GROUP(events, aperf); +PMU_EVENT_GROUP(events, mperf); +PMU_EVENT_GROUP(events, pperf); +PMU_EVENT_GROUP(events, smi); +PMU_EVENT_GROUP(events, ptsc); +PMU_EVENT_GROUP(events, irperf); + +static struct attribute *attrs_therm[] = { + &attr_therm.attr.attr, + &attr_therm_snap.attr.attr, + &attr_therm_unit.attr.attr, + NULL, }; -PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00" ); -PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01" ); -PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02" ); -PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03" ); -PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04" ); -PMU_EVENT_ATTR_STRING(ptsc, evattr_ptsc, "event=0x05" ); -PMU_EVENT_ATTR_STRING(irperf, evattr_irperf, "event=0x06" ); -PMU_EVENT_ATTR_STRING(cpu_thermal_margin, evattr_therm, "event=0x07" ); -PMU_EVENT_ATTR_STRING(cpu_thermal_margin.snapshot, evattr_therm_snap, "1" ); -PMU_EVENT_ATTR_STRING(cpu_thermal_margin.unit, evattr_therm_unit, "C" ); +static struct attribute_group group_therm = { + .name = "events", + .attrs = attrs_therm, +}; static struct perf_msr msr[] = { - [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, }, - [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, }, - [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, }, - [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, }, - [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, }, - [PERF_MSR_PTSC] = { MSR_F15H_PTSC, &evattr_ptsc, test_ptsc, }, - [PERF_MSR_IRPERF] = { MSR_F17H_IRPERF, &evattr_irperf, test_irperf, }, - [PERF_MSR_THERM] = { MSR_IA32_THERM_STATUS, &evattr_therm, test_therm_status, }, - [PERF_MSR_THERM_SNAP] = { MSR_IA32_THERM_STATUS, &evattr_therm_snap, test_therm_status, }, - [PERF_MSR_THERM_UNIT] = { MSR_IA32_THERM_STATUS, &evattr_therm_unit, test_therm_status, }, + [PERF_MSR_TSC] = { .no_check = true, }, + [PERF_MSR_APERF] = { MSR_IA32_APERF, &group_aperf, test_aperfmperf, }, + [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &group_mperf, test_aperfmperf, }, + [PERF_MSR_PPERF] = { MSR_PPERF, &group_pperf, test_intel, }, + [PERF_MSR_SMI] = { MSR_SMI_COUNT, &group_smi, test_intel, }, + [PERF_MSR_PTSC] = { MSR_F15H_PTSC, &group_ptsc, test_ptsc, }, + [PERF_MSR_IRPERF] = { MSR_F17H_IRPERF, &group_irperf, test_irperf, }, + [PERF_MSR_THERM] = { MSR_IA32_THERM_STATUS, &group_therm, test_therm_status, }, }; -static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = { +static struct attribute *events_attrs[] = { + &attr_tsc.attr.attr, NULL, }; @@ -153,6 +167,17 @@ static const struct attribute_group *attr_groups[] = { NULL, }; +const struct attribute_group *attr_update[] = { + &group_aperf, + &group_mperf, + &group_pperf, + &group_smi, + &group_ptsc, + &group_irperf, + &group_therm, + NULL, +}; + static int msr_event_init(struct perf_event *event) { u64 cfg = event->attr.config; @@ -169,7 +194,7 @@ static int msr_event_init(struct perf_event *event) cfg = array_index_nospec((unsigned long)cfg, PERF_MSR_EVENT_MAX); - if (!msr[cfg].attr) + if (!(msr_mask & (1 << cfg))) return -EINVAL; event->hw.idx = -1; @@ -252,32 +277,17 @@ static struct pmu pmu_msr = { .stop = msr_event_stop, .read = msr_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE, + .attr_update = attr_update, }; static int __init msr_init(void) { - int i, j = 0; - if (!boot_cpu_has(X86_FEATURE_TSC)) { pr_cont("no MSR PMU driver.\n"); return 0; } - /* Probe the MSRs. */ - for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) { - u64 val; - - /* Virt sucks; you cannot tell if a R/O MSR is present :/ */ - if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val)) - msr[i].attr = NULL; - } - - /* List remaining MSRs in the sysfs attrs. */ - for (i = 0; i < PERF_MSR_EVENT_MAX; i++) { - if (msr[i].attr) - events_attrs[j++] = &msr[i].attr->attr.attr; - } - events_attrs[j] = NULL; + msr_mask = perf_msr_probe(msr, PERF_MSR_EVENT_MAX, true, NULL); perf_pmu_register(&pmu_msr, "msr", -1); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index a6ac2f4f76fc..8751008fc170 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -121,24 +121,6 @@ struct amd_nb { (1ULL << PERF_REG_X86_R14) | \ (1ULL << PERF_REG_X86_R15)) -#define PEBS_XMM_REGS \ - ((1ULL << PERF_REG_X86_XMM0) | \ - (1ULL << PERF_REG_X86_XMM1) | \ - (1ULL << PERF_REG_X86_XMM2) | \ - (1ULL << PERF_REG_X86_XMM3) | \ - (1ULL << PERF_REG_X86_XMM4) | \ - (1ULL << PERF_REG_X86_XMM5) | \ - (1ULL << PERF_REG_X86_XMM6) | \ - (1ULL << PERF_REG_X86_XMM7) | \ - (1ULL << PERF_REG_X86_XMM8) | \ - (1ULL << PERF_REG_X86_XMM9) | \ - (1ULL << PERF_REG_X86_XMM10) | \ - (1ULL << PERF_REG_X86_XMM11) | \ - (1ULL << PERF_REG_X86_XMM12) | \ - (1ULL << PERF_REG_X86_XMM13) | \ - (1ULL << PERF_REG_X86_XMM14) | \ - (1ULL << PERF_REG_X86_XMM15)) - /* * Per register state. */ @@ -631,14 +613,11 @@ struct x86_pmu { int attr_rdpmc_broken; int attr_rdpmc; struct attribute **format_attrs; - struct attribute **event_attrs; - struct attribute **caps_attrs; ssize_t (*events_sysfs_show)(char *page, u64 config); - struct attribute **cpu_events; + const struct attribute_group **attr_update; unsigned long attr_freeze_on_smi; - struct attribute **attrs; /* * CPU Hotplug hooks @@ -668,8 +647,7 @@ struct x86_pmu { pebs_broken :1, pebs_prec_dist :1, pebs_no_tlb :1, - pebs_no_isolation :1, - pebs_no_xmm_regs :1; + pebs_no_isolation :1; int pebs_record_size; int pebs_buffer_size; int max_pebs_events; @@ -905,8 +883,6 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); ssize_t intel_event_sysfs_show(char *page, u64 config); -struct attribute **merge_attr(struct attribute **a, struct attribute **b); - ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, diff --git a/arch/x86/events/probe.c b/arch/x86/events/probe.c new file mode 100644 index 000000000000..c2ede2f3b277 --- /dev/null +++ b/arch/x86/events/probe.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/export.h> +#include <linux/types.h> +#include <linux/bits.h> +#include "probe.h" + +static umode_t +not_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return 0; +} + +unsigned long +perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) +{ + unsigned long avail = 0; + unsigned int bit; + u64 val; + + if (cnt >= BITS_PER_LONG) + return 0; + + for (bit = 0; bit < cnt; bit++) { + if (!msr[bit].no_check) { + struct attribute_group *grp = msr[bit].grp; + + grp->is_visible = not_visible; + + if (msr[bit].test && !msr[bit].test(bit, data)) + continue; + /* Virt sucks; you cannot tell if a R/O MSR is present :/ */ + if (rdmsrl_safe(msr[bit].msr, &val)) + continue; + /* Disable zero counters if requested. */ + if (!zero && !val) + continue; + + grp->is_visible = NULL; + } + avail |= BIT(bit); + } + + return avail; +} +EXPORT_SYMBOL_GPL(perf_msr_probe); diff --git a/arch/x86/events/probe.h b/arch/x86/events/probe.h new file mode 100644 index 000000000000..4c8e0afc5fb5 --- /dev/null +++ b/arch/x86/events/probe.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARCH_X86_EVENTS_PROBE_H__ +#define __ARCH_X86_EVENTS_PROBE_H__ +#include <linux/sysfs.h> + +struct perf_msr { + u64 msr; + struct attribute_group *grp; + bool (*test)(int idx, void *data); + bool no_check; +}; + +unsigned long +perf_msr_probe(struct perf_msr *msr, int cnt, bool no_zero, void *data); + +#define __PMU_EVENT_GROUP(_name) \ +static struct attribute *attrs_##_name[] = { \ + &attr_##_name.attr.attr, \ + NULL, \ +} + +#define PMU_EVENT_GROUP(_grp, _name) \ +__PMU_EVENT_GROUP(_name); \ +static struct attribute_group group_##_name = { \ + .name = #_grp, \ + .attrs = attrs_##_name, \ +} + +#endif /* __ARCH_X86_EVENTS_PROBE_H__ */ diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 1608050e9df9..0e033ef11a9f 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -17,64 +17,13 @@ #include <linux/version.h> #include <linux/vmalloc.h> #include <linux/mm.h> -#include <linux/clockchips.h> #include <linux/hyperv.h> #include <linux/slab.h> #include <linux/cpuhotplug.h> - -#ifdef CONFIG_HYPERV_TSCPAGE - -static struct ms_hyperv_tsc_page *tsc_pg; - -struct ms_hyperv_tsc_page *hv_get_tsc_page(void) -{ - return tsc_pg; -} -EXPORT_SYMBOL_GPL(hv_get_tsc_page); - -static u64 read_hv_clock_tsc(struct clocksource *arg) -{ - u64 current_tick = hv_read_tsc_page(tsc_pg); - - if (current_tick == U64_MAX) - rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); - - return current_tick; -} - -static struct clocksource hyperv_cs_tsc = { - .name = "hyperv_clocksource_tsc_page", - .rating = 400, - .read = read_hv_clock_tsc, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; -#endif - -static u64 read_hv_clock_msr(struct clocksource *arg) -{ - u64 current_tick; - /* - * Read the partition counter to get the current tick count. This count - * is set to 0 when the partition is created and is incremented in - * 100 nanosecond units. - */ - rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); - return current_tick; -} - -static struct clocksource hyperv_cs_msr = { - .name = "hyperv_clocksource_msr", - .rating = 400, - .read = read_hv_clock_msr, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; +#include <clocksource/hyperv_timer.h> void *hv_hypercall_pg; EXPORT_SYMBOL_GPL(hv_hypercall_pg); -struct clocksource *hyperv_cs; -EXPORT_SYMBOL_GPL(hyperv_cs); u32 *hv_vp_index; EXPORT_SYMBOL_GPL(hv_vp_index); @@ -343,42 +292,8 @@ void __init hyperv_init(void) x86_init.pci.arch_init = hv_pci_init; - /* - * Register Hyper-V specific clocksource. - */ -#ifdef CONFIG_HYPERV_TSCPAGE - if (ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE) { - union hv_x64_msr_hypercall_contents tsc_msr; - - tsc_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); - if (!tsc_pg) - goto register_msr_cs; - - hyperv_cs = &hyperv_cs_tsc; - - rdmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); - - tsc_msr.enable = 1; - tsc_msr.guest_physical_address = vmalloc_to_pfn(tsc_pg); - - wrmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); - - hyperv_cs_tsc.archdata.vclock_mode = VCLOCK_HVCLOCK; - - clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); - return; - } -register_msr_cs: -#endif - /* - * For 32 bit guests just use the MSR based mechanism for reading - * the partition counter. - */ - - hyperv_cs = &hyperv_cs_msr; - if (ms_hyperv.features & HV_MSR_TIME_REF_COUNT_AVAILABLE) - clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); - + /* Register Hyper-V specific clocksource */ + hv_init_clocksource(); return; remove_cpuhp_state: diff --git a/arch/x86/include/asm/acenv.h b/arch/x86/include/asm/acenv.h index 1b010a859b8b..9aff97f0de7f 100644 --- a/arch/x86/include/asm/acenv.h +++ b/arch/x86/include/asm/acenv.h @@ -1,12 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * X86 specific ACPICA environments and implementation * * Copyright (C) 2014, Intel Corporation * Author: Lv Zheng <lv.zheng@intel.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef _ASM_X86_ACENV_H diff --git a/arch/x86/include/asm/acrn.h b/arch/x86/include/asm/acrn.h new file mode 100644 index 000000000000..4adb13f08af7 --- /dev/null +++ b/arch/x86/include/asm/acrn.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_ACRN_H +#define _ASM_X86_ACRN_H + +extern void acrn_hv_callback_vector(void); +#ifdef CONFIG_TRACING +#define trace_acrn_hv_callback_vector acrn_hv_callback_vector +#endif + +extern void acrn_hv_vector_handler(struct pt_regs *regs); +#endif /* _ASM_X86_ACRN_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 1340fa53b575..050e5f9ebf81 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -53,7 +53,7 @@ extern unsigned int apic_verbosity; extern int local_apic_timer_c2_ok; extern int disable_apic; -extern unsigned int lapic_timer_frequency; +extern unsigned int lapic_timer_period; extern enum apic_intr_mode_id apic_intr_mode; enum apic_intr_mode_id { @@ -155,7 +155,6 @@ static inline int apic_force_enable(unsigned long addr) extern int apic_force_enable(unsigned long addr); #endif -extern void apic_bsp_setup(bool upmode); extern void apic_ap_setup(void); /* @@ -175,6 +174,7 @@ extern void lapic_assign_system_vectors(void); extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace); extern void lapic_online(void); extern void lapic_offline(void); +extern bool apic_needs_pit(void); #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } @@ -188,6 +188,7 @@ static inline void init_bsp_APIC(void) { } static inline void apic_intr_mode_init(void) { } static inline void lapic_assign_system_vectors(void) { } static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { } +static inline bool apic_needs_pit(void) { return true; } #endif /* !CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_X2APIC diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index ea3d95275b43..115127c7ad28 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -54,7 +54,7 @@ static __always_inline void arch_atomic_add(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "addl %1,%0" : "+m" (v->counter) - : "ir" (i)); + : "ir" (i) : "memory"); } /** @@ -68,7 +68,7 @@ static __always_inline void arch_atomic_sub(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "subl %1,%0" : "+m" (v->counter) - : "ir" (i)); + : "ir" (i) : "memory"); } /** @@ -95,7 +95,7 @@ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) static __always_inline void arch_atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" - : "+m" (v->counter)); + : "+m" (v->counter) :: "memory"); } #define arch_atomic_inc arch_atomic_inc @@ -108,7 +108,7 @@ static __always_inline void arch_atomic_inc(atomic_t *v) static __always_inline void arch_atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" - : "+m" (v->counter)); + : "+m" (v->counter) :: "memory"); } #define arch_atomic_dec arch_atomic_dec diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 6a5b0ec460da..52cfaecb13f9 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -9,7 +9,7 @@ /* An 64bit atomic type */ typedef struct { - u64 __aligned(8) counter; + s64 __aligned(8) counter; } atomic64_t; #define ATOMIC64_INIT(val) { (val) } @@ -71,8 +71,7 @@ ATOMIC64_DECL(add_unless); * the old value. */ -static inline long long arch_atomic64_cmpxchg(atomic64_t *v, long long o, - long long n) +static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n) { return arch_cmpxchg64(&v->counter, o, n); } @@ -85,9 +84,9 @@ static inline long long arch_atomic64_cmpxchg(atomic64_t *v, long long o, * Atomically xchgs the value of @v to @n and returns * the old value. */ -static inline long long arch_atomic64_xchg(atomic64_t *v, long long n) +static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n) { - long long o; + s64 o; unsigned high = (unsigned)(n >> 32); unsigned low = (unsigned)n; alternative_atomic64(xchg, "=&A" (o), @@ -103,7 +102,7 @@ static inline long long arch_atomic64_xchg(atomic64_t *v, long long n) * * Atomically sets the value of @v to @n. */ -static inline void arch_atomic64_set(atomic64_t *v, long long i) +static inline void arch_atomic64_set(atomic64_t *v, s64 i) { unsigned high = (unsigned)(i >> 32); unsigned low = (unsigned)i; @@ -118,9 +117,9 @@ static inline void arch_atomic64_set(atomic64_t *v, long long i) * * Atomically reads the value of @v and returns it. */ -static inline long long arch_atomic64_read(const atomic64_t *v) +static inline s64 arch_atomic64_read(const atomic64_t *v) { - long long r; + s64 r; alternative_atomic64(read, "=&A" (r), "c" (v) : "memory"); return r; } @@ -132,7 +131,7 @@ static inline long long arch_atomic64_read(const atomic64_t *v) * * Atomically adds @i to @v and returns @i + *@v */ -static inline long long arch_atomic64_add_return(long long i, atomic64_t *v) +static inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { alternative_atomic64(add_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -143,7 +142,7 @@ static inline long long arch_atomic64_add_return(long long i, atomic64_t *v) /* * Other variants with different arithmetic operators: */ -static inline long long arch_atomic64_sub_return(long long i, atomic64_t *v) +static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) { alternative_atomic64(sub_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -151,18 +150,18 @@ static inline long long arch_atomic64_sub_return(long long i, atomic64_t *v) return i; } -static inline long long arch_atomic64_inc_return(atomic64_t *v) +static inline s64 arch_atomic64_inc_return(atomic64_t *v) { - long long a; + s64 a; alternative_atomic64(inc_return, "=&A" (a), "S" (v) : "memory", "ecx"); return a; } #define arch_atomic64_inc_return arch_atomic64_inc_return -static inline long long arch_atomic64_dec_return(atomic64_t *v) +static inline s64 arch_atomic64_dec_return(atomic64_t *v) { - long long a; + s64 a; alternative_atomic64(dec_return, "=&A" (a), "S" (v) : "memory", "ecx"); return a; @@ -176,7 +175,7 @@ static inline long long arch_atomic64_dec_return(atomic64_t *v) * * Atomically adds @i to @v. */ -static inline long long arch_atomic64_add(long long i, atomic64_t *v) +static inline s64 arch_atomic64_add(s64 i, atomic64_t *v) { __alternative_atomic64(add, add_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -191,7 +190,7 @@ static inline long long arch_atomic64_add(long long i, atomic64_t *v) * * Atomically subtracts @i from @v. */ -static inline long long arch_atomic64_sub(long long i, atomic64_t *v) +static inline s64 arch_atomic64_sub(s64 i, atomic64_t *v) { __alternative_atomic64(sub, sub_return, ASM_OUTPUT2("+A" (i), "+c" (v)), @@ -234,8 +233,7 @@ static inline void arch_atomic64_dec(atomic64_t *v) * Atomically adds @a to @v, so long as it was not @u. * Returns non-zero if the add was done, zero otherwise. */ -static inline int arch_atomic64_add_unless(atomic64_t *v, long long a, - long long u) +static inline int arch_atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { unsigned low = (unsigned)u; unsigned high = (unsigned)(u >> 32); @@ -254,9 +252,9 @@ static inline int arch_atomic64_inc_not_zero(atomic64_t *v) } #define arch_atomic64_inc_not_zero arch_atomic64_inc_not_zero -static inline long long arch_atomic64_dec_if_positive(atomic64_t *v) +static inline s64 arch_atomic64_dec_if_positive(atomic64_t *v) { - long long r; + s64 r; alternative_atomic64(dec_if_positive, "=&A" (r), "S" (v) : "ecx", "memory"); return r; @@ -266,17 +264,17 @@ static inline long long arch_atomic64_dec_if_positive(atomic64_t *v) #undef alternative_atomic64 #undef __alternative_atomic64 -static inline void arch_atomic64_and(long long i, atomic64_t *v) +static inline void arch_atomic64_and(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c) c = old; } -static inline long long arch_atomic64_fetch_and(long long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c) c = old; @@ -284,17 +282,17 @@ static inline long long arch_atomic64_fetch_and(long long i, atomic64_t *v) return old; } -static inline void arch_atomic64_or(long long i, atomic64_t *v) +static inline void arch_atomic64_or(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c) c = old; } -static inline long long arch_atomic64_fetch_or(long long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c) c = old; @@ -302,17 +300,17 @@ static inline long long arch_atomic64_fetch_or(long long i, atomic64_t *v) return old; } -static inline void arch_atomic64_xor(long long i, atomic64_t *v) +static inline void arch_atomic64_xor(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c) c = old; } -static inline long long arch_atomic64_fetch_xor(long long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c) c = old; @@ -320,9 +318,9 @@ static inline long long arch_atomic64_fetch_xor(long long i, atomic64_t *v) return old; } -static inline long long arch_atomic64_fetch_add(long long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { - long long old, c = 0; + s64 old, c = 0; while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c) c = old; diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index dadc20adba21..95c6ceac66b9 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -17,7 +17,7 @@ * Atomically reads the value of @v. * Doesn't imply a read memory barrier. */ -static inline long arch_atomic64_read(const atomic64_t *v) +static inline s64 arch_atomic64_read(const atomic64_t *v) { return READ_ONCE((v)->counter); } @@ -29,7 +29,7 @@ static inline long arch_atomic64_read(const atomic64_t *v) * * Atomically sets the value of @v to @i. */ -static inline void arch_atomic64_set(atomic64_t *v, long i) +static inline void arch_atomic64_set(atomic64_t *v, s64 i) { WRITE_ONCE(v->counter, i); } @@ -41,11 +41,11 @@ static inline void arch_atomic64_set(atomic64_t *v, long i) * * Atomically adds @i to @v. */ -static __always_inline void arch_atomic64_add(long i, atomic64_t *v) +static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "addq %1,%0" : "=m" (v->counter) - : "er" (i), "m" (v->counter)); + : "er" (i), "m" (v->counter) : "memory"); } /** @@ -55,11 +55,11 @@ static __always_inline void arch_atomic64_add(long i, atomic64_t *v) * * Atomically subtracts @i from @v. */ -static inline void arch_atomic64_sub(long i, atomic64_t *v) +static inline void arch_atomic64_sub(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "subq %1,%0" : "=m" (v->counter) - : "er" (i), "m" (v->counter)); + : "er" (i), "m" (v->counter) : "memory"); } /** @@ -71,7 +71,7 @@ static inline void arch_atomic64_sub(long i, atomic64_t *v) * true if the result is zero, or false for all * other cases. */ -static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v) +static inline bool arch_atomic64_sub_and_test(s64 i, atomic64_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i); } @@ -87,7 +87,7 @@ static __always_inline void arch_atomic64_inc(atomic64_t *v) { asm volatile(LOCK_PREFIX "incq %0" : "=m" (v->counter) - : "m" (v->counter)); + : "m" (v->counter) : "memory"); } #define arch_atomic64_inc arch_atomic64_inc @@ -101,7 +101,7 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v) { asm volatile(LOCK_PREFIX "decq %0" : "=m" (v->counter) - : "m" (v->counter)); + : "m" (v->counter) : "memory"); } #define arch_atomic64_dec arch_atomic64_dec @@ -142,7 +142,7 @@ static inline bool arch_atomic64_inc_and_test(atomic64_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline bool arch_atomic64_add_negative(long i, atomic64_t *v) +static inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, s, "er", i); } @@ -155,43 +155,43 @@ static inline bool arch_atomic64_add_negative(long i, atomic64_t *v) * * Atomically adds @i to @v and returns @i + @v */ -static __always_inline long arch_atomic64_add_return(long i, atomic64_t *v) +static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v) { return i + xadd(&v->counter, i); } -static inline long arch_atomic64_sub_return(long i, atomic64_t *v) +static inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v) { return arch_atomic64_add_return(-i, v); } -static inline long arch_atomic64_fetch_add(long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v) { return xadd(&v->counter, i); } -static inline long arch_atomic64_fetch_sub(long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v) { return xadd(&v->counter, -i); } -static inline long arch_atomic64_cmpxchg(atomic64_t *v, long old, long new) +static inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { return arch_cmpxchg(&v->counter, old, new); } #define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg -static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, long new) +static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { return try_cmpxchg(&v->counter, old, new); } -static inline long arch_atomic64_xchg(atomic64_t *v, long new) +static inline s64 arch_atomic64_xchg(atomic64_t *v, s64 new) { return arch_xchg(&v->counter, new); } -static inline void arch_atomic64_and(long i, atomic64_t *v) +static inline void arch_atomic64_and(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "andq %1,%0" : "+m" (v->counter) @@ -199,7 +199,7 @@ static inline void arch_atomic64_and(long i, atomic64_t *v) : "memory"); } -static inline long arch_atomic64_fetch_and(long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); @@ -208,7 +208,7 @@ static inline long arch_atomic64_fetch_and(long i, atomic64_t *v) return val; } -static inline void arch_atomic64_or(long i, atomic64_t *v) +static inline void arch_atomic64_or(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "orq %1,%0" : "+m" (v->counter) @@ -216,7 +216,7 @@ static inline void arch_atomic64_or(long i, atomic64_t *v) : "memory"); } -static inline long arch_atomic64_fetch_or(long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); @@ -225,7 +225,7 @@ static inline long arch_atomic64_fetch_or(long i, atomic64_t *v) return val; } -static inline void arch_atomic64_xor(long i, atomic64_t *v) +static inline void arch_atomic64_xor(s64 i, atomic64_t *v) { asm volatile(LOCK_PREFIX "xorq %1,%0" : "+m" (v->counter) @@ -233,7 +233,7 @@ static inline void arch_atomic64_xor(long i, atomic64_t *v) : "memory"); } -static inline long arch_atomic64_fetch_xor(long i, atomic64_t *v) +static inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v) { s64 val = arch_atomic64_read(v); diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 14de0432d288..84f848c2541a 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -80,8 +80,8 @@ do { \ }) /* Atomic operations are already serializing on x86 */ -#define __smp_mb__before_atomic() barrier() -#define __smp_mb__after_atomic() barrier() +#define __smp_mb__before_atomic() do { } while (0) +#define __smp_mb__after_atomic() do { } while (0) #include <asm-generic/barrier.h> diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 1d337c51f7e6..58acda503817 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -22,8 +22,8 @@ enum cpuid_leafs CPUID_LNX_3, CPUID_7_0_EBX, CPUID_D_1_EAX, - CPUID_F_0_EDX, - CPUID_F_1_EDX, + CPUID_LNX_4, + CPUID_7_1_EAX, CPUID_8000_0008_EBX, CPUID_6_EAX, CPUID_8000_000A_EDX, diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 75f27ee2c263..998c2cc08363 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -239,12 +239,14 @@ #define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */ #define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ #define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_FDP_EXCPTN_ONLY ( 9*32+ 6) /* "" FPU data pointer updated only on x87 exceptions */ #define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ #define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ #define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */ #define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ +#define X86_FEATURE_ZERO_FCS_FDS ( 9*32+13) /* "" Zero out FPU CS and FPU DS */ #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ #define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ @@ -269,13 +271,19 @@ #define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ #define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 11 */ -#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ +/* + * Extended auxiliary flags: Linux defined - for features scattered in various + * CPUID levels like 0xf, etc. + * + * Reuse free bits when adding new feature flags! + */ +#define X86_FEATURE_CQM_LLC (11*32+ 0) /* LLC QoS if 1 */ +#define X86_FEATURE_CQM_OCCUP_LLC (11*32+ 1) /* LLC occupancy monitoring */ +#define X86_FEATURE_CQM_MBM_TOTAL (11*32+ 2) /* LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (11*32+ 3) /* LLC Local MBM monitoring */ -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */ -#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring */ -#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ -#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ +/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ +#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ @@ -322,6 +330,7 @@ #define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_WAITPKG (16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */ #define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */ #define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */ #define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 9e27fa05a7ae..4c95c365058a 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -536,7 +536,7 @@ static inline void __fpregs_load_activate(void) struct fpu *fpu = ¤t->thread.fpu; int cpu = smp_processor_id(); - if (WARN_ON_ONCE(current->mm == NULL)) + if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) return; if (!fpregs_state_valid(fpu, cpu)) { @@ -567,11 +567,11 @@ static inline void __fpregs_load_activate(void) * otherwise. * * The FPU context is only stored/restored for a user task and - * ->mm is used to distinguish between kernel and user threads. + * PF_KTHREAD is used to distinguish between kernel and user threads. */ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - if (static_cpu_has(X86_FEATURE_FPU) && current->mm) { + if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 7e42b285c856..c6136d79f8c0 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -47,7 +47,6 @@ extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask); -void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); const void *get_xsave_field_ptr(int xfeature_nr); int using_compacted_format(void); diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h index 5cbce6fbb534..296b346184b2 100644 --- a/arch/x86/include/asm/frame.h +++ b/arch/x86/include/asm/frame.h @@ -22,6 +22,35 @@ pop %_ASM_BP .endm +#ifdef CONFIG_X86_64 +/* + * This is a sneaky trick to help the unwinder find pt_regs on the stack. The + * frame pointer is replaced with an encoded pointer to pt_regs. The encoding + * is just setting the LSB, which makes it an invalid stack address and is also + * a signal to the unwinder that it's a pt_regs pointer in disguise. + * + * NOTE: This macro must be used *after* PUSH_AND_CLEAR_REGS because it corrupts + * the original rbp. + */ +.macro ENCODE_FRAME_POINTER ptregs_offset=0 + leaq 1+\ptregs_offset(%rsp), %rbp +.endm +#else /* !CONFIG_X86_64 */ +/* + * This is a sneaky trick to help the unwinder find pt_regs on the stack. The + * frame pointer is replaced with an encoded pointer to pt_regs. The encoding + * is just clearing the MSB, which makes it an invalid stack address and is also + * a signal to the unwinder that it's a pt_regs pointer in disguise. + * + * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the + * original ebp. + */ +.macro ENCODE_FRAME_POINTER + mov %esp, %ebp + andl $0x7fffffff, %ebp +.endm +#endif /* CONFIG_X86_64 */ + #else /* !__ASSEMBLY__ */ #define FRAME_BEGIN \ @@ -30,12 +59,32 @@ #define FRAME_END "pop %" _ASM_BP "\n" +#ifdef CONFIG_X86_64 +#define ENCODE_FRAME_POINTER \ + "lea 1(%rsp), %rbp\n\t" +#else /* !CONFIG_X86_64 */ +#define ENCODE_FRAME_POINTER \ + "movl %esp, %ebp\n\t" \ + "andl $0x7fffffff, %ebp\n\t" +#endif /* CONFIG_X86_64 */ + #endif /* __ASSEMBLY__ */ #define FRAME_OFFSET __ASM_SEL(4, 8) #else /* !CONFIG_FRAME_POINTER */ +#ifdef __ASSEMBLY__ + +.macro ENCODE_FRAME_POINTER ptregs_offset=0 +.endm + +#else /* !__ASSEMBLY */ + +#define ENCODE_FRAME_POINTER + +#endif + #define FRAME_BEGIN #define FRAME_END #define FRAME_OFFSET 0 diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index d9069bb26c7f..07533795b8d2 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -37,7 +37,7 @@ typedef struct { #ifdef CONFIG_X86_MCE_AMD unsigned int irq_deferred_error_count; #endif -#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) +#ifdef CONFIG_X86_HV_CALLBACK_VECTOR unsigned int irq_hv_callback_count; #endif #if IS_ENABLED(CONFIG_HYPERV) diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 67385d56d4f4..6352dee37cda 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -75,16 +75,15 @@ extern unsigned int hpet_readl(unsigned int a); extern void force_hpet_resume(void); struct irq_data; -struct hpet_dev; +struct hpet_channel; struct irq_domain; extern void hpet_msi_unmask(struct irq_data *data); extern void hpet_msi_mask(struct irq_data *data); -extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg); -extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg); +extern void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg); extern struct irq_domain *hpet_create_irq_domain(int hpet_id); extern int hpet_assign_irq(struct irq_domain *domain, - struct hpet_dev *dev, int dev_num); + struct hpet_channel *hc, int dev_num); #ifdef CONFIG_HPET_EMULATE_RTC diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 32e666e1231e..cbd97e22d2f3 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -150,8 +150,11 @@ extern char irq_entries_start[]; #define trace_irq_entries_start irq_entries_start #endif +extern char spurious_entries_start[]; + #define VECTOR_UNUSED NULL -#define VECTOR_RETRIGGERED ((void *)~0UL) +#define VECTOR_SHUTDOWN ((void *)~0UL) +#define VECTOR_RETRIGGERED ((void *)~1UL) typedef struct irq_desc* vector_irq_t[NR_VECTORS]; DECLARE_PER_CPU(vector_irq_t, vector_irq); diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index cdf44aa9a501..af78cd72b8f3 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -401,6 +401,12 @@ enum HV_GENERIC_SET_FORMAT { #define HV_STATUS_INVALID_CONNECTION_ID 18 #define HV_STATUS_INSUFFICIENT_BUFFERS 19 +/* + * The Hyper-V TimeRefCount register and the TSC + * page provide a guest VM clock with 100ns tick rate + */ +#define HV_CLOCK_HZ (NSEC_PER_SEC/100) + typedef struct _HV_REFERENCE_TSC_PAGE { __u32 tsc_sequence; __u32 res1; diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 8c5aaba6633f..50a30f6c668b 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -29,6 +29,7 @@ enum x86_hypervisor_type { X86_HYPER_XEN_HVM, X86_HYPER_KVM, X86_HYPER_JAILHOUSE, + X86_HYPER_ACRN, }; #ifdef CONFIG_HYPERVISOR_GUEST diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 9f15384c504a..0278aa66ef62 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -52,7 +52,11 @@ #define INTEL_FAM6_CANNONLAKE_MOBILE 0x66 +#define INTEL_FAM6_ICELAKE_X 0x6A +#define INTEL_FAM6_ICELAKE_XEON_D 0x6C +#define INTEL_FAM6_ICELAKE_DESKTOP 0x7D #define INTEL_FAM6_ICELAKE_MOBILE 0x7E +#define INTEL_FAM6_ICELAKE_NNPI 0x9D /* "Small Core" Processors (Atom) */ @@ -73,6 +77,7 @@ #define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */ #define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */ #define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */ + #define INTEL_FAM6_ATOM_TREMONT_X 0x86 /* Jacobsville */ /* Xeon Phi */ diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h index 8f3bee821e6c..187ce59aea28 100644 --- a/arch/x86/include/asm/irq_regs.h +++ b/arch/x86/include/asm/irq_regs.h @@ -16,7 +16,7 @@ DECLARE_PER_CPU(struct pt_regs *, irq_regs); static inline struct pt_regs *get_irq_regs(void) { - return this_cpu_read(irq_regs); + return __this_cpu_read(irq_regs); } static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) @@ -24,7 +24,7 @@ static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs) struct pt_regs *old_regs; old_regs = get_irq_regs(); - this_cpu_write(irq_regs, new_regs); + __this_cpu_write(irq_regs, new_regs); return old_regs; } diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 65191ce8e1cf..06c3cc22a058 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_JUMP_LABEL_H #define _ASM_X86_JUMP_LABEL_H +#define HAVE_JUMP_LABEL_BATCH + #define JUMP_LABEL_NOP_SIZE 5 #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 003f2daa3b0f..5e7d6b46de97 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -71,22 +71,6 @@ struct kimage; #define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */ /* - * CPU does not save ss and sp on stack if execution is already - * running in kernel mode at the time of NMI occurrence. This code - * fixes it. - */ -static inline void crash_fixup_ss_esp(struct pt_regs *newregs, - struct pt_regs *oldregs) -{ -#ifdef CONFIG_X86_32 - newregs->sp = (unsigned long)&(oldregs->sp); - asm volatile("xorl %%eax, %%eax\n\t" - "movw %%ss, %%ax\n\t" - :"=a"(newregs->ss)); -#endif -} - -/* * This function is responsible for capturing register states if coming * via panic otherwise just fix up the ss and sp if coming via kernel * mode exception. @@ -96,7 +80,6 @@ static inline void crash_setup_regs(struct pt_regs *newregs, { if (oldregs) { memcpy(newregs, oldregs, sizeof(*newregs)); - crash_fixup_ss_esp(newregs, oldregs); } else { #ifdef CONFIG_X86_32 asm volatile("movl %%ebx,%0" : "=m"(newregs->bx)); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 450d69a1e6fa..26d1eb83f72a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1,11 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Kernel-based Virtual Machine driver for Linux * * This header defines architecture specific interfaces, x86 version - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * */ #ifndef _ASM_X86_KVM_HOST_H diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 616f8e637bc3..0c196c47d621 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -1,13 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * AMD Memory Encryption Support * * Copyright (C) 2016 Advanced Micro Devices, Inc. * * Author: Tom Lendacky <thomas.lendacky@amd.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef __X86_MEM_ENCRYPT_H__ diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index cc60e617931c..f4fa8a9d5d0b 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -105,6 +105,17 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) #define hv_get_crash_ctl(val) \ rdmsrl(HV_X64_MSR_CRASH_CTL, val) +#define hv_get_time_ref_count(val) \ + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, val) + +#define hv_get_reference_tsc(val) \ + rdmsrl(HV_X64_MSR_REFERENCE_TSC, val) +#define hv_set_reference_tsc(val) \ + wrmsrl(HV_X64_MSR_REFERENCE_TSC, val) +#define hv_set_clocksource_vdso(val) \ + ((val).archdata.vclock_mode = VCLOCK_HVCLOCK) +#define hv_get_raw_timer() rdtsc_ordered() + void hyperv_callback_vector(void); void hyperv_reenlightenment_vector(void); #ifdef CONFIG_TRACING @@ -133,7 +144,6 @@ static inline void hv_disable_stimer0_percpu_irq(int irq) {} #if IS_ENABLED(CONFIG_HYPERV) -extern struct clocksource *hyperv_cs; extern void *hv_hypercall_pg; extern void __percpu **hyperv_pcpu_input_arg; @@ -387,73 +397,4 @@ static inline int hyperv_flush_guest_mapping_range(u64 as, } #endif /* CONFIG_HYPERV */ -#ifdef CONFIG_HYPERV_TSCPAGE -struct ms_hyperv_tsc_page *hv_get_tsc_page(void); -static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, - u64 *cur_tsc) -{ - u64 scale, offset; - u32 sequence; - - /* - * The protocol for reading Hyper-V TSC page is specified in Hypervisor - * Top-Level Functional Specification ver. 3.0 and above. To get the - * reference time we must do the following: - * - READ ReferenceTscSequence - * A special '0' value indicates the time source is unreliable and we - * need to use something else. The currently published specification - * versions (up to 4.0b) contain a mistake and wrongly claim '-1' - * instead of '0' as the special value, see commit c35b82ef0294. - * - ReferenceTime = - * ((RDTSC() * ReferenceTscScale) >> 64) + ReferenceTscOffset - * - READ ReferenceTscSequence again. In case its value has changed - * since our first reading we need to discard ReferenceTime and repeat - * the whole sequence as the hypervisor was updating the page in - * between. - */ - do { - sequence = READ_ONCE(tsc_pg->tsc_sequence); - if (!sequence) - return U64_MAX; - /* - * Make sure we read sequence before we read other values from - * TSC page. - */ - smp_rmb(); - - scale = READ_ONCE(tsc_pg->tsc_scale); - offset = READ_ONCE(tsc_pg->tsc_offset); - *cur_tsc = rdtsc_ordered(); - - /* - * Make sure we read sequence after we read all other values - * from TSC page. - */ - smp_rmb(); - - } while (READ_ONCE(tsc_pg->tsc_sequence) != sequence); - - return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset; -} - -static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) -{ - u64 cur_tsc; - - return hv_read_tsc_page_tsc(tsc_pg, &cur_tsc); -} - -#else -static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void) -{ - return NULL; -} - -static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, - u64 *cur_tsc) -{ - BUG(); - return U64_MAX; -} -#endif #endif diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 979ef971cc78..6b4fc2788078 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -61,6 +61,15 @@ #define MSR_PLATFORM_INFO_CPUID_FAULT_BIT 31 #define MSR_PLATFORM_INFO_CPUID_FAULT BIT_ULL(MSR_PLATFORM_INFO_CPUID_FAULT_BIT) +#define MSR_IA32_UMWAIT_CONTROL 0xe1 +#define MSR_IA32_UMWAIT_CONTROL_C02_DISABLE BIT(0) +#define MSR_IA32_UMWAIT_CONTROL_RESERVED BIT(1) +/* + * The time field is bit[31:2], but representing a 32bit value with + * bit[1:0] zero. + */ +#define MSR_IA32_UMWAIT_CONTROL_TIME_MASK (~0x03U) + #define MSR_PKG_CST_CONFIG_CONTROL 0x000000e2 #define NHM_C3_AUTO_DEMOTE (1UL << 25) #define NHM_C1_AUTO_DEMOTE (1UL << 26) diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index eb0f80ce8524..e28f8b723b5c 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -86,9 +86,9 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx, static inline void __sti_mwait(unsigned long eax, unsigned long ecx) { - mds_idle_clear_cpu_buffers(); - trace_hardirqs_on(); + + mds_idle_clear_cpu_buffers(); /* "mwait %eax, %ecx;" */ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" :: "a" (eax), "c" (ecx)); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 2474e434a6f7..946f8f1f1efc 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -88,7 +88,7 @@ struct pv_init_ops { * the number of bytes of code generated, as we nop pad the * rest in generic code. */ - unsigned (*patch)(u8 type, void *insnbuf, + unsigned (*patch)(u8 type, void *insn_buff, unsigned long addr, unsigned len); } __no_randomize_layout; @@ -370,18 +370,11 @@ extern struct paravirt_patch_template pv_ops; /* Simple instruction patching code. */ #define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t" -#define DEF_NATIVE(ops, name, code) \ - __visible extern const char start_##ops##_##name[], end_##ops##_##name[]; \ - asm(NATIVE_LABEL("start_", ops, name) code NATIVE_LABEL("end_", ops, name)) +unsigned paravirt_patch_ident_64(void *insn_buff, unsigned len); +unsigned paravirt_patch_default(u8 type, void *insn_buff, unsigned long addr, unsigned len); +unsigned paravirt_patch_insns(void *insn_buff, unsigned len, const char *start, const char *end); -unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len); -unsigned paravirt_patch_default(u8 type, void *insnbuf, - unsigned long addr, unsigned len); - -unsigned paravirt_patch_insns(void *insnbuf, unsigned len, - const char *start, const char *end); - -unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len); +unsigned native_patch(u8 type, void *insn_buff, unsigned long addr, unsigned len); int paravirt_disable_iospace(void); @@ -679,8 +672,8 @@ u64 _paravirt_ident_64(u64); /* These all sit in the .parainstructions section to tell us what to patch. */ struct paravirt_patch_site { - u8 *instr; /* original instructions */ - u8 instrtype; /* type of this instruction */ + u8 *instr; /* original instructions */ + u8 type; /* type of this instruction */ u8 len; /* length of original instruction */ }; diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 1a19d11cfbbd..2278797c769d 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -87,7 +87,7 @@ * don't give an lvalue though). */ extern void __bad_percpu_size(void); -#define percpu_to_op(op, var, val) \ +#define percpu_to_op(qual, op, var, val) \ do { \ typedef typeof(var) pto_T__; \ if (0) { \ @@ -97,22 +97,22 @@ do { \ } \ switch (sizeof(var)) { \ case 1: \ - asm(op "b %1,"__percpu_arg(0) \ + asm qual (op "b %1,"__percpu_arg(0) \ : "+m" (var) \ : "qi" ((pto_T__)(val))); \ break; \ case 2: \ - asm(op "w %1,"__percpu_arg(0) \ + asm qual (op "w %1,"__percpu_arg(0) \ : "+m" (var) \ : "ri" ((pto_T__)(val))); \ break; \ case 4: \ - asm(op "l %1,"__percpu_arg(0) \ + asm qual (op "l %1,"__percpu_arg(0) \ : "+m" (var) \ : "ri" ((pto_T__)(val))); \ break; \ case 8: \ - asm(op "q %1,"__percpu_arg(0) \ + asm qual (op "q %1,"__percpu_arg(0) \ : "+m" (var) \ : "re" ((pto_T__)(val))); \ break; \ @@ -124,7 +124,7 @@ do { \ * Generate a percpu add to memory instruction and optimize code * if one is added or subtracted. */ -#define percpu_add_op(var, val) \ +#define percpu_add_op(qual, var, val) \ do { \ typedef typeof(var) pao_T__; \ const int pao_ID__ = (__builtin_constant_p(val) && \ @@ -138,41 +138,41 @@ do { \ switch (sizeof(var)) { \ case 1: \ if (pao_ID__ == 1) \ - asm("incb "__percpu_arg(0) : "+m" (var)); \ + asm qual ("incb "__percpu_arg(0) : "+m" (var)); \ else if (pao_ID__ == -1) \ - asm("decb "__percpu_arg(0) : "+m" (var)); \ + asm qual ("decb "__percpu_arg(0) : "+m" (var)); \ else \ - asm("addb %1, "__percpu_arg(0) \ + asm qual ("addb %1, "__percpu_arg(0) \ : "+m" (var) \ : "qi" ((pao_T__)(val))); \ break; \ case 2: \ if (pao_ID__ == 1) \ - asm("incw "__percpu_arg(0) : "+m" (var)); \ + asm qual ("incw "__percpu_arg(0) : "+m" (var)); \ else if (pao_ID__ == -1) \ - asm("decw "__percpu_arg(0) : "+m" (var)); \ + asm qual ("decw "__percpu_arg(0) : "+m" (var)); \ else \ - asm("addw %1, "__percpu_arg(0) \ + asm qual ("addw %1, "__percpu_arg(0) \ : "+m" (var) \ : "ri" ((pao_T__)(val))); \ break; \ case 4: \ if (pao_ID__ == 1) \ - asm("incl "__percpu_arg(0) : "+m" (var)); \ + asm qual ("incl "__percpu_arg(0) : "+m" (var)); \ else if (pao_ID__ == -1) \ - asm("decl "__percpu_arg(0) : "+m" (var)); \ + asm qual ("decl "__percpu_arg(0) : "+m" (var)); \ else \ - asm("addl %1, "__percpu_arg(0) \ + asm qual ("addl %1, "__percpu_arg(0) \ : "+m" (var) \ : "ri" ((pao_T__)(val))); \ break; \ case 8: \ if (pao_ID__ == 1) \ - asm("incq "__percpu_arg(0) : "+m" (var)); \ + asm qual ("incq "__percpu_arg(0) : "+m" (var)); \ else if (pao_ID__ == -1) \ - asm("decq "__percpu_arg(0) : "+m" (var)); \ + asm qual ("decq "__percpu_arg(0) : "+m" (var)); \ else \ - asm("addq %1, "__percpu_arg(0) \ + asm qual ("addq %1, "__percpu_arg(0) \ : "+m" (var) \ : "re" ((pao_T__)(val))); \ break; \ @@ -180,27 +180,27 @@ do { \ } \ } while (0) -#define percpu_from_op(op, var) \ +#define percpu_from_op(qual, op, var) \ ({ \ typeof(var) pfo_ret__; \ switch (sizeof(var)) { \ case 1: \ - asm volatile(op "b "__percpu_arg(1)",%0"\ + asm qual (op "b "__percpu_arg(1)",%0" \ : "=q" (pfo_ret__) \ : "m" (var)); \ break; \ case 2: \ - asm volatile(op "w "__percpu_arg(1)",%0"\ + asm qual (op "w "__percpu_arg(1)",%0" \ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ case 4: \ - asm volatile(op "l "__percpu_arg(1)",%0"\ + asm qual (op "l "__percpu_arg(1)",%0" \ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ case 8: \ - asm volatile(op "q "__percpu_arg(1)",%0"\ + asm qual (op "q "__percpu_arg(1)",%0" \ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ @@ -238,23 +238,23 @@ do { \ pfo_ret__; \ }) -#define percpu_unary_op(op, var) \ +#define percpu_unary_op(qual, op, var) \ ({ \ switch (sizeof(var)) { \ case 1: \ - asm(op "b "__percpu_arg(0) \ + asm qual (op "b "__percpu_arg(0) \ : "+m" (var)); \ break; \ case 2: \ - asm(op "w "__percpu_arg(0) \ + asm qual (op "w "__percpu_arg(0) \ : "+m" (var)); \ break; \ case 4: \ - asm(op "l "__percpu_arg(0) \ + asm qual (op "l "__percpu_arg(0) \ : "+m" (var)); \ break; \ case 8: \ - asm(op "q "__percpu_arg(0) \ + asm qual (op "q "__percpu_arg(0) \ : "+m" (var)); \ break; \ default: __bad_percpu_size(); \ @@ -264,27 +264,27 @@ do { \ /* * Add return operation */ -#define percpu_add_return_op(var, val) \ +#define percpu_add_return_op(qual, var, val) \ ({ \ typeof(var) paro_ret__ = val; \ switch (sizeof(var)) { \ case 1: \ - asm("xaddb %0, "__percpu_arg(1) \ + asm qual ("xaddb %0, "__percpu_arg(1) \ : "+q" (paro_ret__), "+m" (var) \ : : "memory"); \ break; \ case 2: \ - asm("xaddw %0, "__percpu_arg(1) \ + asm qual ("xaddw %0, "__percpu_arg(1) \ : "+r" (paro_ret__), "+m" (var) \ : : "memory"); \ break; \ case 4: \ - asm("xaddl %0, "__percpu_arg(1) \ + asm qual ("xaddl %0, "__percpu_arg(1) \ : "+r" (paro_ret__), "+m" (var) \ : : "memory"); \ break; \ case 8: \ - asm("xaddq %0, "__percpu_arg(1) \ + asm qual ("xaddq %0, "__percpu_arg(1) \ : "+re" (paro_ret__), "+m" (var) \ : : "memory"); \ break; \ @@ -299,13 +299,13 @@ do { \ * expensive due to the implied lock prefix. The processor cannot prefetch * cachelines if xchg is used. */ -#define percpu_xchg_op(var, nval) \ +#define percpu_xchg_op(qual, var, nval) \ ({ \ typeof(var) pxo_ret__; \ typeof(var) pxo_new__ = (nval); \ switch (sizeof(var)) { \ case 1: \ - asm("\n\tmov "__percpu_arg(1)",%%al" \ + asm qual ("\n\tmov "__percpu_arg(1)",%%al" \ "\n1:\tcmpxchgb %2, "__percpu_arg(1) \ "\n\tjnz 1b" \ : "=&a" (pxo_ret__), "+m" (var) \ @@ -313,7 +313,7 @@ do { \ : "memory"); \ break; \ case 2: \ - asm("\n\tmov "__percpu_arg(1)",%%ax" \ + asm qual ("\n\tmov "__percpu_arg(1)",%%ax" \ "\n1:\tcmpxchgw %2, "__percpu_arg(1) \ "\n\tjnz 1b" \ : "=&a" (pxo_ret__), "+m" (var) \ @@ -321,7 +321,7 @@ do { \ : "memory"); \ break; \ case 4: \ - asm("\n\tmov "__percpu_arg(1)",%%eax" \ + asm qual ("\n\tmov "__percpu_arg(1)",%%eax" \ "\n1:\tcmpxchgl %2, "__percpu_arg(1) \ "\n\tjnz 1b" \ : "=&a" (pxo_ret__), "+m" (var) \ @@ -329,7 +329,7 @@ do { \ : "memory"); \ break; \ case 8: \ - asm("\n\tmov "__percpu_arg(1)",%%rax" \ + asm qual ("\n\tmov "__percpu_arg(1)",%%rax" \ "\n1:\tcmpxchgq %2, "__percpu_arg(1) \ "\n\tjnz 1b" \ : "=&a" (pxo_ret__), "+m" (var) \ @@ -345,32 +345,32 @@ do { \ * cmpxchg has no such implied lock semantics as a result it is much * more efficient for cpu local operations. */ -#define percpu_cmpxchg_op(var, oval, nval) \ +#define percpu_cmpxchg_op(qual, var, oval, nval) \ ({ \ typeof(var) pco_ret__; \ typeof(var) pco_old__ = (oval); \ typeof(var) pco_new__ = (nval); \ switch (sizeof(var)) { \ case 1: \ - asm("cmpxchgb %2, "__percpu_arg(1) \ + asm qual ("cmpxchgb %2, "__percpu_arg(1) \ : "=a" (pco_ret__), "+m" (var) \ : "q" (pco_new__), "0" (pco_old__) \ : "memory"); \ break; \ case 2: \ - asm("cmpxchgw %2, "__percpu_arg(1) \ + asm qual ("cmpxchgw %2, "__percpu_arg(1) \ : "=a" (pco_ret__), "+m" (var) \ : "r" (pco_new__), "0" (pco_old__) \ : "memory"); \ break; \ case 4: \ - asm("cmpxchgl %2, "__percpu_arg(1) \ + asm qual ("cmpxchgl %2, "__percpu_arg(1) \ : "=a" (pco_ret__), "+m" (var) \ : "r" (pco_new__), "0" (pco_old__) \ : "memory"); \ break; \ case 8: \ - asm("cmpxchgq %2, "__percpu_arg(1) \ + asm qual ("cmpxchgq %2, "__percpu_arg(1) \ : "=a" (pco_ret__), "+m" (var) \ : "r" (pco_new__), "0" (pco_old__) \ : "memory"); \ @@ -391,58 +391,70 @@ do { \ */ #define this_cpu_read_stable(var) percpu_stable_op("mov", var) -#define raw_cpu_read_1(pcp) percpu_from_op("mov", pcp) -#define raw_cpu_read_2(pcp) percpu_from_op("mov", pcp) -#define raw_cpu_read_4(pcp) percpu_from_op("mov", pcp) - -#define raw_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) -#define raw_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) -#define raw_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) -#define raw_cpu_add_1(pcp, val) percpu_add_op((pcp), val) -#define raw_cpu_add_2(pcp, val) percpu_add_op((pcp), val) -#define raw_cpu_add_4(pcp, val) percpu_add_op((pcp), val) -#define raw_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) -#define raw_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) -#define raw_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) -#define raw_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) -#define raw_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) -#define raw_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) -#define raw_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val) -#define raw_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val) -#define raw_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val) - -#define this_cpu_read_1(pcp) percpu_from_op("mov", pcp) -#define this_cpu_read_2(pcp) percpu_from_op("mov", pcp) -#define this_cpu_read_4(pcp) percpu_from_op("mov", pcp) -#define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_add_1(pcp, val) percpu_add_op((pcp), val) -#define this_cpu_add_2(pcp, val) percpu_add_op((pcp), val) -#define this_cpu_add_4(pcp, val) percpu_add_op((pcp), val) -#define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val) -#define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val) -#define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val) -#define this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val) -#define this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val) -#define this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val) -#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval) -#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval) -#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval) - -#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) -#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) -#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) -#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) - -#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) -#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define raw_cpu_read_1(pcp) percpu_from_op(, "mov", pcp) +#define raw_cpu_read_2(pcp) percpu_from_op(, "mov", pcp) +#define raw_cpu_read_4(pcp) percpu_from_op(, "mov", pcp) + +#define raw_cpu_write_1(pcp, val) percpu_to_op(, "mov", (pcp), val) +#define raw_cpu_write_2(pcp, val) percpu_to_op(, "mov", (pcp), val) +#define raw_cpu_write_4(pcp, val) percpu_to_op(, "mov", (pcp), val) +#define raw_cpu_add_1(pcp, val) percpu_add_op(, (pcp), val) +#define raw_cpu_add_2(pcp, val) percpu_add_op(, (pcp), val) +#define raw_cpu_add_4(pcp, val) percpu_add_op(, (pcp), val) +#define raw_cpu_and_1(pcp, val) percpu_to_op(, "and", (pcp), val) +#define raw_cpu_and_2(pcp, val) percpu_to_op(, "and", (pcp), val) +#define raw_cpu_and_4(pcp, val) percpu_to_op(, "and", (pcp), val) +#define raw_cpu_or_1(pcp, val) percpu_to_op(, "or", (pcp), val) +#define raw_cpu_or_2(pcp, val) percpu_to_op(, "or", (pcp), val) +#define raw_cpu_or_4(pcp, val) percpu_to_op(, "or", (pcp), val) + +/* + * raw_cpu_xchg() can use a load-store since it is not required to be + * IRQ-safe. + */ +#define raw_percpu_xchg_op(var, nval) \ +({ \ + typeof(var) pxo_ret__ = raw_cpu_read(var); \ + raw_cpu_write(var, (nval)); \ + pxo_ret__; \ +}) + +#define raw_cpu_xchg_1(pcp, val) raw_percpu_xchg_op(pcp, val) +#define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val) +#define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val) + +#define this_cpu_read_1(pcp) percpu_from_op(volatile, "mov", pcp) +#define this_cpu_read_2(pcp) percpu_from_op(volatile, "mov", pcp) +#define this_cpu_read_4(pcp) percpu_from_op(volatile, "mov", pcp) +#define this_cpu_write_1(pcp, val) percpu_to_op(volatile, "mov", (pcp), val) +#define this_cpu_write_2(pcp, val) percpu_to_op(volatile, "mov", (pcp), val) +#define this_cpu_write_4(pcp, val) percpu_to_op(volatile, "mov", (pcp), val) +#define this_cpu_add_1(pcp, val) percpu_add_op(volatile, (pcp), val) +#define this_cpu_add_2(pcp, val) percpu_add_op(volatile, (pcp), val) +#define this_cpu_add_4(pcp, val) percpu_add_op(volatile, (pcp), val) +#define this_cpu_and_1(pcp, val) percpu_to_op(volatile, "and", (pcp), val) +#define this_cpu_and_2(pcp, val) percpu_to_op(volatile, "and", (pcp), val) +#define this_cpu_and_4(pcp, val) percpu_to_op(volatile, "and", (pcp), val) +#define this_cpu_or_1(pcp, val) percpu_to_op(volatile, "or", (pcp), val) +#define this_cpu_or_2(pcp, val) percpu_to_op(volatile, "or", (pcp), val) +#define this_cpu_or_4(pcp, val) percpu_to_op(volatile, "or", (pcp), val) +#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(volatile, pcp, nval) +#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(volatile, pcp, nval) +#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(volatile, pcp, nval) + +#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(, pcp, val) +#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(, pcp, val) +#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(, pcp, val) +#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval) +#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval) +#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval) + +#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(volatile, pcp, val) +#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(volatile, pcp, val) +#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(volatile, pcp, val) +#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval) +#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval) +#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval) #ifdef CONFIG_X86_CMPXCHG64 #define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2) \ @@ -466,23 +478,23 @@ do { \ * 32 bit must fall back to generic operations. */ #ifdef CONFIG_X86_64 -#define raw_cpu_read_8(pcp) percpu_from_op("mov", pcp) -#define raw_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -#define raw_cpu_add_8(pcp, val) percpu_add_op((pcp), val) -#define raw_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) -#define raw_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) -#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) -#define raw_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) -#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) - -#define this_cpu_read_8(pcp) percpu_from_op("mov", pcp) -#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) -#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val) -#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) -#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) -#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val) -#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) -#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) +#define raw_cpu_read_8(pcp) percpu_from_op(, "mov", pcp) +#define raw_cpu_write_8(pcp, val) percpu_to_op(, "mov", (pcp), val) +#define raw_cpu_add_8(pcp, val) percpu_add_op(, (pcp), val) +#define raw_cpu_and_8(pcp, val) percpu_to_op(, "and", (pcp), val) +#define raw_cpu_or_8(pcp, val) percpu_to_op(, "or", (pcp), val) +#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(, pcp, val) +#define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval) +#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval) + +#define this_cpu_read_8(pcp) percpu_from_op(volatile, "mov", pcp) +#define this_cpu_write_8(pcp, val) percpu_to_op(volatile, "mov", (pcp), val) +#define this_cpu_add_8(pcp, val) percpu_add_op(volatile, (pcp), val) +#define this_cpu_and_8(pcp, val) percpu_to_op(volatile, "and", (pcp), val) +#define this_cpu_or_8(pcp, val) percpu_to_op(volatile, "or", (pcp), val) +#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(volatile, pcp, val) +#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(volatile, pcp, nval) +#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval) /* * Pretty complex macro to generate cmpxchg16 instruction. The instruction diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c34a35c78618..3eab6ece52b4 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -105,7 +105,7 @@ struct cpuinfo_x86 { int x86_power; unsigned long loops_per_jiffy; /* cpuid returned max cores value: */ - u16 x86_max_cores; + u16 x86_max_cores; u16 apicid; u16 initial_apicid; u16 x86_clflush_size; @@ -117,6 +117,8 @@ struct cpuinfo_x86 { u16 logical_proc_id; /* Core id: */ u16 cpu_core_id; + u16 cpu_die_id; + u16 logical_die_id; /* Index into per_cpu list: */ u16 cpu_index; u32 microcode; @@ -144,7 +146,8 @@ enum cpuid_regs_idx { #define X86_VENDOR_TRANSMETA 7 #define X86_VENDOR_NSC 8 #define X86_VENDOR_HYGON 9 -#define X86_VENDOR_NUM 10 +#define X86_VENDOR_ZHAOXIN 10 +#define X86_VENDOR_NUM 11 #define X86_VENDOR_UNKNOWN 0xff diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 8a7fc0cca2d1..78cf265c5b58 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -102,8 +102,7 @@ extern unsigned long profile_pc(struct pt_regs *regs); extern unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); -extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, - int error_code, int si_code); +extern void send_sigtrap(struct pt_regs *regs, int error_code, int si_code); static inline unsigned long regs_return_value(struct pt_regs *regs) @@ -166,14 +165,10 @@ static inline bool user_64bit_mode(struct pt_regs *regs) #define compat_user_stack_pointer() current_pt_regs()->sp #endif -#ifdef CONFIG_X86_32 -extern unsigned long kernel_stack_pointer(struct pt_regs *regs); -#else static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) { return regs->sp; } -#endif #define GET_IP(regs) ((regs)->ip) #define GET_FP(regs) ((regs)->bp) @@ -201,14 +196,6 @@ static inline unsigned long regs_get_register(struct pt_regs *regs, if (unlikely(offset > MAX_REG_OFFSET)) return 0; #ifdef CONFIG_X86_32 - /* - * Traps from the kernel do not save sp and ss. - * Use the helper function to retrieve sp. - */ - if (offset == offsetof(struct pt_regs, sp) && - regs->cs == __KERNEL_CS) - return kernel_stack_pointer(regs); - /* The selector fields are 16-bit. */ if (offset == offsetof(struct pt_regs, cs) || offset == offsetof(struct pt_regs, ss) || @@ -234,8 +221,7 @@ static inline unsigned long regs_get_register(struct pt_regs *regs, static inline int regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) { - return ((addr & ~(THREAD_SIZE - 1)) == - (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))); + return ((addr & ~(THREAD_SIZE - 1)) == (regs->sp & ~(THREAD_SIZE - 1))); } /** @@ -249,7 +235,7 @@ static inline int regs_within_kernel_stack(struct pt_regs *regs, */ static inline unsigned long *regs_get_kernel_stack_nth_addr(struct pt_regs *regs, unsigned int n) { - unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); + unsigned long *addr = (unsigned long *)regs->sp; addr += n; if (regs_within_kernel_stack(regs, (unsigned long)addr)) diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index b6033680d458..19b695ff2c68 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_PVCLOCK_H #define _ASM_X86_PVCLOCK_H -#include <linux/clocksource.h> +#include <asm/clocksource.h> #include <asm/pvclock-abi.h> /* some helper functions for xen and kvm pv clock sources */ diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 8ea1cfdbeabc..71b32f2570ab 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -13,4 +13,6 @@ extern char __end_rodata_aligned[]; extern char __end_rodata_hpage_align[]; #endif +extern char __end_of_kernel_reserve[]; + #endif /* _ASM_X86_SECTIONS_H */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index da545df207b2..e1356a3b8223 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -23,6 +23,7 @@ extern unsigned int num_processors; DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); +DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); /* cpus sharing the last level cache: */ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id); @@ -162,7 +163,8 @@ __visible void smp_call_function_single_interrupt(struct pt_regs *r); * from the initial startup. We map APIC_BASE very early in page_setup(), * so this is correct in the x86 case. */ -#define raw_smp_processor_id() (this_cpu_read(cpu_number)) +#define raw_smp_processor_id() this_cpu_read(cpu_number) +#define __smp_processor_id() __this_cpu_read(cpu_number) #ifdef CONFIG_X86_32 extern int safe_smp_processor_id(void); diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 0a3c4cab39db..b2e84d113f2a 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -6,6 +6,8 @@ #ifdef __KERNEL__ #include <asm/nops.h> +#include <asm/processor-flags.h> +#include <linux/jump_label.h> /* * Volatile isn't enough to prevent the compiler from reordering the @@ -16,6 +18,10 @@ */ extern unsigned long __force_order; +/* Starts false and gets enabled once CPU feature detection is done. */ +DECLARE_STATIC_KEY_FALSE(cr_pinning); +extern unsigned long cr4_pinned_bits; + static inline unsigned long native_read_cr0(void) { unsigned long val; @@ -25,7 +31,20 @@ static inline unsigned long native_read_cr0(void) static inline void native_write_cr0(unsigned long val) { - asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order)); + unsigned long bits_missing = 0; + +set_register: + asm volatile("mov %0,%%cr0": "+r" (val), "+m" (__force_order)); + + if (static_branch_likely(&cr_pinning)) { + if (unlikely((val & X86_CR0_WP) != X86_CR0_WP)) { + bits_missing = X86_CR0_WP; + val |= bits_missing; + goto set_register; + } + /* Warn after we've set the missing bits. */ + WARN_ONCE(bits_missing, "CR0 WP bit went missing!?\n"); + } } static inline unsigned long native_read_cr2(void) @@ -74,7 +93,21 @@ static inline unsigned long native_read_cr4(void) static inline void native_write_cr4(unsigned long val) { - asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order)); + unsigned long bits_missing = 0; + +set_register: + asm volatile("mov %0,%%cr4": "+r" (val), "+m" (cr4_pinned_bits)); + + if (static_branch_likely(&cr_pinning)) { + if (unlikely((val & cr4_pinned_bits) != cr4_pinned_bits)) { + bits_missing = ~val & cr4_pinned_bits; + val |= bits_missing; + goto set_register; + } + /* Warn after we've set the missing bits. */ + WARN_ONCE(bits_missing, "CR4 bits went missing: %lx!?\n", + bits_missing); + } } #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index a8d0cdf48616..14db05086bbf 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -78,7 +78,7 @@ static inline unsigned long * get_stack_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) - return (unsigned long *)kernel_stack_pointer(regs); + return (unsigned long *)regs->sp; if (task == current) return __builtin_frame_address(0); diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 880b5515b1d6..70c09967a999 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -18,6 +18,20 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, #define __parainstructions_end NULL #endif +/* + * Currently, the max observed size in the kernel code is + * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5. + * Raise it if needed. + */ +#define POKE_MAX_OPCODE_SIZE 5 + +struct text_poke_loc { + void *detour; + void *addr; + size_t len; + const char opcode[POKE_MAX_OPCODE_SIZE]; +}; + extern void text_poke_early(void *addr, const void *opcode, size_t len); /* @@ -38,6 +52,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +extern void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries); extern int after_bootmem; extern __ro_after_init struct mm_struct *poking_mm; extern __ro_after_init unsigned long poking_addr; @@ -51,7 +66,6 @@ static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) #define INT3_INSN_SIZE 1 #define CALL_INSN_SIZE 5 -#ifdef CONFIG_X86_64 static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) { /* @@ -69,7 +83,6 @@ static inline void int3_emulate_call(struct pt_regs *regs, unsigned long func) int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE); int3_emulate_jmp(regs, func); } -#endif /* CONFIG_X86_64 */ #endif /* !CONFIG_UML_X86 */ #endif /* _ASM_X86_TEXT_PATCHING_H */ diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h index cef818b16045..8ac563abb567 100644 --- a/arch/x86/include/asm/time.h +++ b/arch/x86/include/asm/time.h @@ -7,6 +7,7 @@ extern void hpet_time_init(void); extern void time_init(void); +extern bool pit_timer_init(void); extern struct clock_event_device *global_clock_event; diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 453cf38a1c33..4b14d2318251 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -106,15 +106,25 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu); #define topology_logical_package_id(cpu) (cpu_data(cpu).logical_proc_id) #define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id) +#define topology_logical_die_id(cpu) (cpu_data(cpu).logical_die_id) +#define topology_die_id(cpu) (cpu_data(cpu).cpu_die_id) #define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id) #ifdef CONFIG_SMP +#define topology_die_cpumask(cpu) (per_cpu(cpu_die_map, cpu)) #define topology_core_cpumask(cpu) (per_cpu(cpu_core_map, cpu)) #define topology_sibling_cpumask(cpu) (per_cpu(cpu_sibling_map, cpu)) extern unsigned int __max_logical_packages; #define topology_max_packages() (__max_logical_packages) +extern unsigned int __max_die_per_package; + +static inline int topology_max_die_per_package(void) +{ + return __max_die_per_package; +} + extern int __max_smt_threads; static inline int topology_max_smt_threads(void) @@ -123,14 +133,21 @@ static inline int topology_max_smt_threads(void) } int topology_update_package_map(unsigned int apicid, unsigned int cpu); +int topology_update_die_map(unsigned int dieid, unsigned int cpu); int topology_phys_to_logical_pkg(unsigned int pkg); +int topology_phys_to_logical_die(unsigned int die, unsigned int cpu); bool topology_is_primary_thread(unsigned int cpu); bool topology_smt_supported(void); #else #define topology_max_packages() (1) static inline int topology_update_package_map(unsigned int apicid, unsigned int cpu) { return 0; } +static inline int +topology_update_die_map(unsigned int dieid, unsigned int cpu) { return 0; } static inline int topology_phys_to_logical_pkg(unsigned int pkg) { return 0; } +static inline int topology_phys_to_logical_die(unsigned int die, + unsigned int cpu) { return 0; } +static inline int topology_max_die_per_package(void) { return 1; } static inline int topology_max_smt_threads(void) { return 1; } static inline bool topology_is_primary_thread(unsigned int cpu) { return true; } static inline bool topology_smt_supported(void) { return false; } diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h new file mode 100644 index 000000000000..ae91429129a6 --- /dev/null +++ b/arch/x86/include/asm/vdso/gettimeofday.h @@ -0,0 +1,261 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Fast user context implementation of clock_gettime, gettimeofday, and time. + * + * Copyright (C) 2019 ARM Limited. + * Copyright 2006 Andi Kleen, SUSE Labs. + * 32 Bit compat layer by Stefani Seibold <stefani@seibold.net> + * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany + */ +#ifndef __ASM_VDSO_GETTIMEOFDAY_H +#define __ASM_VDSO_GETTIMEOFDAY_H + +#ifndef __ASSEMBLY__ + +#include <uapi/linux/time.h> +#include <asm/vgtod.h> +#include <asm/vvar.h> +#include <asm/unistd.h> +#include <asm/msr.h> +#include <asm/pvclock.h> +#include <clocksource/hyperv_timer.h> + +#define __vdso_data (VVAR(_vdso_data)) + +#define VDSO_HAS_TIME 1 + +#define VDSO_HAS_CLOCK_GETRES 1 + +/* + * Declare the memory-mapped vclock data pages. These come from hypervisors. + * If we ever reintroduce something like direct access to an MMIO clock like + * the HPET again, it will go here as well. + * + * A load from any of these pages will segfault if the clock in question is + * disabled, so appropriate compiler barriers and checks need to be used + * to prevent stray loads. + * + * These declarations MUST NOT be const. The compiler will assume that + * an extern const variable has genuinely constant contents, and the + * resulting code won't work, since the whole point is that these pages + * change over time, possibly while we're accessing them. + */ + +#ifdef CONFIG_PARAVIRT_CLOCK +/* + * This is the vCPU 0 pvclock page. We only use pvclock from the vDSO + * if the hypervisor tells us that all vCPUs can get valid data from the + * vCPU 0 page. + */ +extern struct pvclock_vsyscall_time_info pvclock_page + __attribute__((visibility("hidden"))); +#endif + +#ifdef CONFIG_HYPERV_TSCPAGE +extern struct ms_hyperv_tsc_page hvclock_page + __attribute__((visibility("hidden"))); +#endif + +#ifndef BUILD_VDSO32 + +static __always_inline +long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) +{ + long ret; + + asm ("syscall" : "=a" (ret), "=m" (*_ts) : + "0" (__NR_clock_gettime), "D" (_clkid), "S" (_ts) : + "rcx", "r11"); + + return ret; +} + +static __always_inline +long gettimeofday_fallback(struct __kernel_old_timeval *_tv, + struct timezone *_tz) +{ + long ret; + + asm("syscall" : "=a" (ret) : + "0" (__NR_gettimeofday), "D" (_tv), "S" (_tz) : "memory"); + + return ret; +} + +static __always_inline +long clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) +{ + long ret; + + asm ("syscall" : "=a" (ret), "=m" (*_ts) : + "0" (__NR_clock_getres), "D" (_clkid), "S" (_ts) : + "rcx", "r11"); + + return ret; +} + +#else + +static __always_inline +long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) +{ + long ret; + + asm ( + "mov %%ebx, %%edx \n" + "mov %[clock], %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret), "=m" (*_ts) + : "0" (__NR_clock_gettime64), [clock] "g" (_clkid), "c" (_ts) + : "edx"); + + return ret; +} + +static __always_inline +long gettimeofday_fallback(struct __kernel_old_timeval *_tv, + struct timezone *_tz) +{ + long ret; + + asm( + "mov %%ebx, %%edx \n" + "mov %2, %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret) + : "0" (__NR_gettimeofday), "g" (_tv), "c" (_tz) + : "memory", "edx"); + + return ret; +} + +static __always_inline long +clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) +{ + long ret; + + asm ( + "mov %%ebx, %%edx \n" + "mov %[clock], %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret), "=m" (*_ts) + : "0" (__NR_clock_getres_time64), [clock] "g" (_clkid), "c" (_ts) + : "edx"); + + return ret; +} + +#endif + +#ifdef CONFIG_PARAVIRT_CLOCK +static u64 vread_pvclock(void) +{ + const struct pvclock_vcpu_time_info *pvti = &pvclock_page.pvti; + u32 version; + u64 ret; + + /* + * Note: The kernel and hypervisor must guarantee that cpu ID + * number maps 1:1 to per-CPU pvclock time info. + * + * Because the hypervisor is entirely unaware of guest userspace + * preemption, it cannot guarantee that per-CPU pvclock time + * info is updated if the underlying CPU changes or that that + * version is increased whenever underlying CPU changes. + * + * On KVM, we are guaranteed that pvti updates for any vCPU are + * atomic as seen by *all* vCPUs. This is an even stronger + * guarantee than we get with a normal seqlock. + * + * On Xen, we don't appear to have that guarantee, but Xen still + * supplies a valid seqlock using the version field. + * + * We only do pvclock vdso timing at all if + * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to + * mean that all vCPUs have matching pvti and that the TSC is + * synced, so we can just look at vCPU 0's pvti. + */ + + do { + version = pvclock_read_begin(pvti); + + if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) + return U64_MAX; + + ret = __pvclock_read_cycles(pvti, rdtsc_ordered()); + } while (pvclock_read_retry(pvti, version)); + + return ret; +} +#endif + +#ifdef CONFIG_HYPERV_TSCPAGE +static u64 vread_hvclock(void) +{ + return hv_read_tsc_page(&hvclock_page); +} +#endif + +static inline u64 __arch_get_hw_counter(s32 clock_mode) +{ + if (clock_mode == VCLOCK_TSC) + return (u64)rdtsc_ordered(); + /* + * For any memory-mapped vclock type, we need to make sure that gcc + * doesn't cleverly hoist a load before the mode check. Otherwise we + * might end up touching the memory-mapped page even if the vclock in + * question isn't enabled, which will segfault. Hence the barriers. + */ +#ifdef CONFIG_PARAVIRT_CLOCK + if (clock_mode == VCLOCK_PVCLOCK) { + barrier(); + return vread_pvclock(); + } +#endif +#ifdef CONFIG_HYPERV_TSCPAGE + if (clock_mode == VCLOCK_HVCLOCK) { + barrier(); + return vread_hvclock(); + } +#endif + return U64_MAX; +} + +static __always_inline const struct vdso_data *__arch_get_vdso_data(void) +{ + return __vdso_data; +} + +/* + * x86 specific delta calculation. + * + * The regular implementation assumes that clocksource reads are globally + * monotonic. The TSC can be slightly off across sockets which can cause + * the regular delta calculation (@cycles - @last) to return a huge time + * jump. + * + * Therefore it needs to be verified that @cycles are greater than + * @last. If not then use @last, which is the base time of the current + * conversion period. + * + * This variant also removes the masking of the subtraction because the + * clocksource mask of all VDSO capable clocksources on x86 is U64_MAX + * which would result in a pointless operation. The compiler cannot + * optimize it away as the mask comes from the vdso data and is not compile + * time constant. + */ +static __always_inline +u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult) +{ + if (cycles > last) + return (cycles - last) * mult; + return 0; +} +#define vdso_calc_delta vdso_calc_delta + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/x86/include/asm/vdso/vsyscall.h b/arch/x86/include/asm/vdso/vsyscall.h new file mode 100644 index 000000000000..0026ab2123ce --- /dev/null +++ b/arch/x86/include/asm/vdso/vsyscall.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_VDSO_VSYSCALL_H +#define __ASM_VDSO_VSYSCALL_H + +#ifndef __ASSEMBLY__ + +#include <linux/hrtimer.h> +#include <linux/timekeeper_internal.h> +#include <vdso/datapage.h> +#include <asm/vgtod.h> +#include <asm/vvar.h> + +int vclocks_used __read_mostly; + +DEFINE_VVAR(struct vdso_data, _vdso_data); +/* + * Update the vDSO data page to keep in sync with kernel timekeeping. + */ +static __always_inline +struct vdso_data *__x86_get_k_vdso_data(void) +{ + return _vdso_data; +} +#define __arch_get_k_vdso_data __x86_get_k_vdso_data + +static __always_inline +int __x86_get_clock_mode(struct timekeeper *tk) +{ + int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + + /* Mark the new vclock used. */ + BUILD_BUG_ON(VCLOCK_MAX >= 32); + WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode)); + + return vclock_mode; +} +#define __arch_get_clock_mode __x86_get_clock_mode + +/* The asm-generic header needs to be included after the definitions above */ +#include <asm-generic/vdso/vsyscall.h> + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_VSYSCALL_H */ diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 913a133f8e6f..a2638c6124ed 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -3,7 +3,9 @@ #define _ASM_X86_VGTOD_H #include <linux/compiler.h> -#include <linux/clocksource.h> +#include <asm/clocksource.h> +#include <vdso/datapage.h> +#include <vdso/helpers.h> #include <uapi/linux/time.h> @@ -13,81 +15,10 @@ typedef u64 gtod_long_t; typedef unsigned long gtod_long_t; #endif -/* - * There is one of these objects in the vvar page for each - * vDSO-accelerated clockid. For high-resolution clocks, this encodes - * the time corresponding to vsyscall_gtod_data.cycle_last. For coarse - * clocks, this encodes the actual time. - * - * To confuse the reader, for high-resolution clocks, nsec is left-shifted - * by vsyscall_gtod_data.shift. - */ -struct vgtod_ts { - u64 sec; - u64 nsec; -}; - -#define VGTOD_BASES (CLOCK_TAI + 1) -#define VGTOD_HRES (BIT(CLOCK_REALTIME) | BIT(CLOCK_MONOTONIC) | BIT(CLOCK_TAI)) -#define VGTOD_COARSE (BIT(CLOCK_REALTIME_COARSE) | BIT(CLOCK_MONOTONIC_COARSE)) - -/* - * vsyscall_gtod_data will be accessed by 32 and 64 bit code at the same time - * so be carefull by modifying this structure. - */ -struct vsyscall_gtod_data { - unsigned int seq; - - int vclock_mode; - u64 cycle_last; - u64 mask; - u32 mult; - u32 shift; - - struct vgtod_ts basetime[VGTOD_BASES]; - - int tz_minuteswest; - int tz_dsttime; -}; -extern struct vsyscall_gtod_data vsyscall_gtod_data; - extern int vclocks_used; static inline bool vclock_was_used(int vclock) { return READ_ONCE(vclocks_used) & (1 << vclock); } -static inline unsigned int gtod_read_begin(const struct vsyscall_gtod_data *s) -{ - unsigned int ret; - -repeat: - ret = READ_ONCE(s->seq); - if (unlikely(ret & 1)) { - cpu_relax(); - goto repeat; - } - smp_rmb(); - return ret; -} - -static inline int gtod_read_retry(const struct vsyscall_gtod_data *s, - unsigned int start) -{ - smp_rmb(); - return unlikely(s->seq != start); -} - -static inline void gtod_write_begin(struct vsyscall_gtod_data *s) -{ - ++s->seq; - smp_wmb(); -} - -static inline void gtod_write_end(struct vsyscall_gtod_data *s) -{ - smp_wmb(); - ++s->seq; -} - #endif /* _ASM_X86_VGTOD_H */ diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index 1fc7a0d1e877..9aad0e0876fb 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* CPU virtualization extensions handling * * This should carry the code for handling CPU virtualization extensions @@ -8,9 +9,6 @@ * Copyright (C) 2008, Red Hat Inc. * * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. */ #ifndef _ASM_X86_VIRTEX_H #define _ASM_X86_VIRTEX_H diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index b986b2ca688a..ab60a71a8dcb 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -13,10 +13,12 @@ extern void set_vsyscall_pgtable_user_bits(pgd_t *root); * Called on instruction fetch fault in vsyscall page. * Returns true if handled. */ -extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); +extern bool emulate_vsyscall(unsigned long error_code, + struct pt_regs *regs, unsigned long address); #else static inline void map_vsyscall(void) {} -static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) +static inline bool emulate_vsyscall(unsigned long error_code, + struct pt_regs *regs, unsigned long address) { return false; } diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index e474f5c6e387..32f5d9a0b90e 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -32,19 +32,20 @@ extern char __vvar_page; #define DECLARE_VVAR(offset, type, name) \ - extern type vvar_ ## name __attribute__((visibility("hidden"))); + extern type vvar_ ## name[CS_BASES] \ + __attribute__((visibility("hidden"))); #define VVAR(name) (vvar_ ## name) #define DEFINE_VVAR(type, name) \ - type name \ + type name[CS_BASES] \ __attribute__((section(".vvar_" #name), aligned(16))) __visible #endif /* DECLARE_VVAR(offset, type, name) */ -DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data) +DECLARE_VVAR(128, struct vdso_data, _vdso_data) #undef DECLARE_VVAR diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 60733f137e9a..c895df5482c5 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -29,6 +29,8 @@ #define XLF_EFI_HANDOVER_32 (1<<2) #define XLF_EFI_HANDOVER_64 (1<<3) #define XLF_EFI_KEXEC (1<<4) +#define XLF_5LEVEL (1<<5) +#define XLF_5LEVEL_ENABLED (1<<6) #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 7a0e64ccd6ff..d6ab5b4d15e5 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -383,6 +383,9 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) #define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) +#define KVM_STATE_NESTED_FORMAT_VMX 0 +#define KVM_STATE_NESTED_FORMAT_SVM 1 /* unused */ + #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 #define KVM_STATE_NESTED_EVMCS 0x00000004 @@ -390,9 +393,16 @@ struct kvm_sync_regs { #define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_SMM_VMXON 0x00000002 -struct kvm_vmx_nested_state { +#define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000 + +struct kvm_vmx_nested_state_data { + __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; + __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; +}; + +struct kvm_vmx_nested_state_hdr { __u64 vmxon_pa; - __u64 vmcs_pa; + __u64 vmcs12_pa; struct { __u16 flags; @@ -401,24 +411,25 @@ struct kvm_vmx_nested_state { /* for KVM_CAP_NESTED_STATE */ struct kvm_nested_state { - /* KVM_STATE_* flags */ __u16 flags; - - /* 0 for VMX, 1 for SVM. */ __u16 format; - - /* 128 for SVM, 128 + VMCS size for VMX. */ __u32 size; union { - /* VMXON, VMCS */ - struct kvm_vmx_nested_state vmx; + struct kvm_vmx_nested_state_hdr vmx; /* Pad the header to 128 bytes. */ __u8 pad[120]; - }; + } hdr; - __u8 data[0]; + /* + * Define data region as 0 bytes to preserve backwards-compatability + * to old definition of kvm_nested_state in order to avoid changing + * KVM_{GET,PUT}_NESTED_STATE ioctl values. + */ + union { + struct kvm_vmx_nested_state_data vmx[0]; + } data; }; #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/uapi/asm/perf_regs.h b/arch/x86/include/uapi/asm/perf_regs.h index ac67bbea10ca..7c9d2bb3833b 100644 --- a/arch/x86/include/uapi/asm/perf_regs.h +++ b/arch/x86/include/uapi/asm/perf_regs.h @@ -52,4 +52,7 @@ enum perf_event_x86_regs { /* These include both GPRs and XMMX registers */ PERF_REG_X86_XMM_MAX = PERF_REG_X86_XMM15 + 2, }; + +#define PERF_REG_EXTENDED_MASK (~((1ULL << PERF_REG_X86_XMM0) - 1)) + #endif /* _ASM_X86_PERF_REGS_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index ce1b5cc360a2..3578ad248bc9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -30,7 +30,7 @@ KASAN_SANITIZE_paravirt.o := n OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y -OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_paravirt_patch.o := y ifdef CONFIG_FRAME_POINTER OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y @@ -112,7 +112,7 @@ obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o -obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o +obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index a5e5484988fd..caf2edccbad2 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -64,6 +64,21 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, c->x86_stepping >= 0x0e)) flags->bm_check = 1; } + + if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { + /* + * All Zhaoxin CPUs that support C3 share cache. + * And caches should not be flushed by software while + * entering C3 type state. + */ + flags->bm_check = 1; + /* + * On all recent Zhaoxin platforms, ARB_DISABLE is a nop. + * So, set bm_control to zero to indicate that ARB_DISABLE + * is not required while entering C3 type state. + */ + flags->bm_control = 0; + } } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 390596b761e3..99ef8b6f9a1a 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -14,6 +14,7 @@ #include <linux/kdebug.h> #include <linux/kprobes.h> #include <linux/mmu_context.h> +#include <linux/bsearch.h> #include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/sections.h> @@ -277,7 +278,7 @@ static inline bool is_jmp(const u8 opcode) } static void __init_or_module -recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) +recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff) { u8 *next_rip, *tgt_rip; s32 n_dspl, o_dspl; @@ -286,7 +287,7 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) if (a->replacementlen != 5) return; - o_dspl = *(s32 *)(insnbuf + 1); + o_dspl = *(s32 *)(insn_buff + 1); /* next_rip of the replacement JMP */ next_rip = repl_insn + a->replacementlen; @@ -312,9 +313,9 @@ recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) two_byte_jmp: n_dspl -= 2; - insnbuf[0] = 0xeb; - insnbuf[1] = (s8)n_dspl; - add_nops(insnbuf + 2, 3); + insn_buff[0] = 0xeb; + insn_buff[1] = (s8)n_dspl; + add_nops(insn_buff + 2, 3); repl_len = 2; goto done; @@ -322,8 +323,8 @@ two_byte_jmp: five_byte_jmp: n_dspl -= 5; - insnbuf[0] = 0xe9; - *(s32 *)&insnbuf[1] = n_dspl; + insn_buff[0] = 0xe9; + *(s32 *)&insn_buff[1] = n_dspl; repl_len = 5; @@ -370,7 +371,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, { struct alt_instr *a; u8 *instr, *replacement; - u8 insnbuf[MAX_PATCH_LEN]; + u8 insn_buff[MAX_PATCH_LEN]; DPRINTK("alt table %px, -> %px", start, end); /* @@ -383,11 +384,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * order. */ for (a = start; a < end; a++) { - int insnbuf_sz = 0; + int insn_buff_sz = 0; instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; - BUG_ON(a->instrlen > sizeof(insnbuf)); + BUG_ON(a->instrlen > sizeof(insn_buff)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); if (!boot_cpu_has(a->cpuid)) { if (a->padlen > 1) @@ -405,8 +406,8 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); - memcpy(insnbuf, replacement, a->replacementlen); - insnbuf_sz = a->replacementlen; + memcpy(insn_buff, replacement, a->replacementlen); + insn_buff_sz = a->replacementlen; /* * 0xe8 is a relative jump; fix the offset. @@ -414,24 +415,24 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * Instruction length is checked before the opcode to avoid * accessing uninitialized bytes for zero-length replacements. */ - if (a->replacementlen == 5 && *insnbuf == 0xe8) { - *(s32 *)(insnbuf + 1) += replacement - instr; + if (a->replacementlen == 5 && *insn_buff == 0xe8) { + *(s32 *)(insn_buff + 1) += replacement - instr; DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", - *(s32 *)(insnbuf + 1), - (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5); + *(s32 *)(insn_buff + 1), + (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5); } if (a->replacementlen && is_jmp(replacement[0])) - recompute_jump(a, instr, replacement, insnbuf); + recompute_jump(a, instr, replacement, insn_buff); if (a->instrlen > a->replacementlen) { - add_nops(insnbuf + a->replacementlen, + add_nops(insn_buff + a->replacementlen, a->instrlen - a->replacementlen); - insnbuf_sz += a->instrlen - a->replacementlen; + insn_buff_sz += a->instrlen - a->replacementlen; } - DUMP_BYTES(insnbuf, insnbuf_sz, "%px: final_insn: ", instr); + DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr); - text_poke_early(instr, insnbuf, insnbuf_sz); + text_poke_early(instr, insn_buff, insn_buff_sz); } } @@ -593,33 +594,104 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start, struct paravirt_patch_site *end) { struct paravirt_patch_site *p; - char insnbuf[MAX_PATCH_LEN]; + char insn_buff[MAX_PATCH_LEN]; for (p = start; p < end; p++) { unsigned int used; BUG_ON(p->len > MAX_PATCH_LEN); /* prep the buffer with the original instructions */ - memcpy(insnbuf, p->instr, p->len); - used = pv_ops.init.patch(p->instrtype, insnbuf, - (unsigned long)p->instr, p->len); + memcpy(insn_buff, p->instr, p->len); + used = pv_ops.init.patch(p->type, insn_buff, (unsigned long)p->instr, p->len); BUG_ON(used > p->len); /* Pad the rest with nops */ - add_nops(insnbuf + used, p->len - used); - text_poke_early(p->instr, insnbuf, p->len); + add_nops(insn_buff + used, p->len - used); + text_poke_early(p->instr, insn_buff, p->len); } } extern struct paravirt_patch_site __start_parainstructions[], __stop_parainstructions[]; #endif /* CONFIG_PARAVIRT */ +/* + * Self-test for the INT3 based CALL emulation code. + * + * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up + * properly and that there is a stack gap between the INT3 frame and the + * previous context. Without this gap doing a virtual PUSH on the interrupted + * stack would corrupt the INT3 IRET frame. + * + * See entry_{32,64}.S for more details. + */ +static void __init int3_magic(unsigned int *ptr) +{ + *ptr = 1; +} + +extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */ + +static int __init +int3_exception_notify(struct notifier_block *self, unsigned long val, void *data) +{ + struct die_args *args = data; + struct pt_regs *regs = args->regs; + + if (!regs || user_mode(regs)) + return NOTIFY_DONE; + + if (val != DIE_INT3) + return NOTIFY_DONE; + + if (regs->ip - INT3_INSN_SIZE != int3_selftest_ip) + return NOTIFY_DONE; + + int3_emulate_call(regs, (unsigned long)&int3_magic); + return NOTIFY_STOP; +} + +static void __init int3_selftest(void) +{ + static __initdata struct notifier_block int3_exception_nb = { + .notifier_call = int3_exception_notify, + .priority = INT_MAX-1, /* last */ + }; + unsigned int val = 0; + + BUG_ON(register_die_notifier(&int3_exception_nb)); + + /* + * Basically: int3_magic(&val); but really complicated :-) + * + * Stick the address of the INT3 instruction into int3_selftest_ip, + * then trigger the INT3, padded with NOPs to match a CALL instruction + * length. + */ + asm volatile ("1: int3; nop; nop; nop; nop\n\t" + ".pushsection .init.data,\"aw\"\n\t" + ".align " __ASM_SEL(4, 8) "\n\t" + ".type int3_selftest_ip, @object\n\t" + ".size int3_selftest_ip, " __ASM_SEL(4, 8) "\n\t" + "int3_selftest_ip:\n\t" + __ASM_SEL(.long, .quad) " 1b\n\t" + ".popsection\n\t" + : : __ASM_SEL_RAW(a, D) (&val) : "memory"); + + BUG_ON(val != 1); + + unregister_die_notifier(&int3_exception_nb); +} + void __init alternative_instructions(void) { - /* The patching is not fully atomic, so try to avoid local interruptions - that might execute the to be patched code. - Other CPUs are not running. */ + int3_selftest(); + + /* + * The patching is not fully atomic, so try to avoid local + * interruptions that might execute the to be patched code. + * Other CPUs are not running. + */ stop_nmi(); /* @@ -644,10 +716,11 @@ void __init alternative_instructions(void) _text, _etext); } - if (!uniproc_patched || num_possible_cpus() == 1) + if (!uniproc_patched || num_possible_cpus() == 1) { free_init_pages("SMP alternatives", (unsigned long)__smp_locks, (unsigned long)__smp_locks_end); + } #endif apply_paravirt(__parainstructions, __parainstructions_end); @@ -848,81 +921,133 @@ static void do_sync_core(void *info) sync_core(); } -static bool bp_patching_in_progress; -static void *bp_int3_handler, *bp_int3_addr; +static struct bp_patching_desc { + struct text_poke_loc *vec; + int nr_entries; +} bp_patching; + +static int patch_cmp(const void *key, const void *elt) +{ + struct text_poke_loc *tp = (struct text_poke_loc *) elt; + + if (key < tp->addr) + return -1; + if (key > tp->addr) + return 1; + return 0; +} +NOKPROBE_SYMBOL(patch_cmp); int poke_int3_handler(struct pt_regs *regs) { + struct text_poke_loc *tp; + unsigned char int3 = 0xcc; + void *ip; + /* * Having observed our INT3 instruction, we now must observe - * bp_patching_in_progress. + * bp_patching.nr_entries. * - * in_progress = TRUE INT3 + * nr_entries != 0 INT3 * WMB RMB - * write INT3 if (in_progress) + * write INT3 if (nr_entries) * - * Idem for bp_int3_handler. + * Idem for other elements in bp_patching. */ smp_rmb(); - if (likely(!bp_patching_in_progress)) + if (likely(!bp_patching.nr_entries)) return 0; - if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr) + if (user_mode(regs)) return 0; - /* set up the specified breakpoint handler */ - regs->ip = (unsigned long) bp_int3_handler; + /* + * Discount the sizeof(int3). See text_poke_bp_batch(). + */ + ip = (void *) regs->ip - sizeof(int3); + + /* + * Skip the binary search if there is a single member in the vector. + */ + if (unlikely(bp_patching.nr_entries > 1)) { + tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries, + sizeof(struct text_poke_loc), + patch_cmp); + if (!tp) + return 0; + } else { + tp = bp_patching.vec; + if (tp->addr != ip) + return 0; + } + + /* set up the specified breakpoint detour */ + regs->ip = (unsigned long) tp->detour; return 1; } NOKPROBE_SYMBOL(poke_int3_handler); /** - * text_poke_bp() -- update instructions on live kernel on SMP - * @addr: address to patch - * @opcode: opcode of new instruction - * @len: length to copy - * @handler: address to jump to when the temporary breakpoint is hit + * text_poke_bp_batch() -- update instructions on live kernel on SMP + * @tp: vector of instructions to patch + * @nr_entries: number of entries in the vector * * Modify multi-byte instruction by using int3 breakpoint on SMP. * We completely avoid stop_machine() here, and achieve the * synchronization using int3 breakpoint. * * The way it is done: - * - add a int3 trap to the address that will be patched + * - For each entry in the vector: + * - add a int3 trap to the address that will be patched * - sync cores - * - update all but the first byte of the patched range + * - For each entry in the vector: + * - update all but the first byte of the patched range * - sync cores - * - replace the first byte (int3) by the first byte of - * replacing opcode + * - For each entry in the vector: + * - replace the first byte (int3) by the first byte of + * replacing opcode * - sync cores */ -void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) +void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { + int patched_all_but_first = 0; unsigned char int3 = 0xcc; - - bp_int3_handler = handler; - bp_int3_addr = (u8 *)addr + sizeof(int3); - bp_patching_in_progress = true; + unsigned int i; lockdep_assert_held(&text_mutex); + bp_patching.vec = tp; + bp_patching.nr_entries = nr_entries; + /* * Corresponding read barrier in int3 notifier for making sure the - * in_progress and handler are correctly ordered wrt. patching. + * nr_entries and handler are correctly ordered wrt. patching. */ smp_wmb(); - text_poke(addr, &int3, sizeof(int3)); + /* + * First step: add a int3 trap to the address that will be patched. + */ + for (i = 0; i < nr_entries; i++) + text_poke(tp[i].addr, &int3, sizeof(int3)); on_each_cpu(do_sync_core, NULL, 1); - if (len - sizeof(int3) > 0) { - /* patch all but the first byte */ - text_poke((char *)addr + sizeof(int3), - (const char *) opcode + sizeof(int3), - len - sizeof(int3)); + /* + * Second step: update all but the first byte of the patched range. + */ + for (i = 0; i < nr_entries; i++) { + if (tp[i].len - sizeof(int3) > 0) { + text_poke((char *)tp[i].addr + sizeof(int3), + (const char *)tp[i].opcode + sizeof(int3), + tp[i].len - sizeof(int3)); + patched_all_but_first++; + } + } + + if (patched_all_but_first) { /* * According to Intel, this core syncing is very likely * not necessary and we'd be safe even without it. But @@ -931,14 +1056,47 @@ void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) on_each_cpu(do_sync_core, NULL, 1); } - /* patch the first byte */ - text_poke(addr, opcode, sizeof(int3)); + /* + * Third step: replace the first byte (int3) by the first byte of + * replacing opcode. + */ + for (i = 0; i < nr_entries; i++) + text_poke(tp[i].addr, tp[i].opcode, sizeof(int3)); on_each_cpu(do_sync_core, NULL, 1); /* * sync_core() implies an smp_mb() and orders this store against * the writing of the new instruction. */ - bp_patching_in_progress = false; + bp_patching.vec = NULL; + bp_patching.nr_entries = 0; } +/** + * text_poke_bp() -- update instructions on live kernel on SMP + * @addr: address to patch + * @opcode: opcode of new instruction + * @len: length to copy + * @handler: address to jump to when the temporary breakpoint is hit + * + * Update a single instruction with the vector in the stack, avoiding + * dynamically allocated memory. This function should be used when it is + * not possible to allocate memory. + */ +void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) +{ + struct text_poke_loc tp = { + .detour = handler, + .addr = addr, + .len = len, + }; + + if (len > POKE_MAX_OPCODE_SIZE) { + WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE); + return; + } + + memcpy((void *)tp.opcode, opcode, len); + + text_poke_bp_batch(&tp, 1); +} diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index cc51275c8759..d63e63b7d1d9 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -1,6 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Shared support code for AMD K8 northbridges and derivates. - * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. + * Copyright 2006 Andi Kleen, SUSE Labs. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -71,7 +72,7 @@ static const struct pci_device_id hygon_root_ids[] = { {} }; -const struct pci_device_id hygon_nb_misc_ids[] = { +static const struct pci_device_id hygon_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) }, {} }; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 177aa8ef2afa..1bd91cb7b320 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -195,7 +195,7 @@ static struct resource lapic_resource = { .flags = IORESOURCE_MEM | IORESOURCE_BUSY, }; -unsigned int lapic_timer_frequency = 0; +unsigned int lapic_timer_period = 0; static void apic_pm_activate(void); @@ -501,7 +501,7 @@ lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot) if (evt->features & CLOCK_EVT_FEAT_DUMMY) return 0; - __setup_APIC_LVTT(lapic_timer_frequency, oneshot, 1); + __setup_APIC_LVTT(lapic_timer_period, oneshot, 1); return 0; } @@ -805,11 +805,11 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) static int __init lapic_init_clockevent(void) { - if (!lapic_timer_frequency) + if (!lapic_timer_period) return -1; /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, + lapic_clockevent.mult = div_sc(lapic_timer_period/APIC_DIVISOR, TICK_NSEC, lapic_clockevent.shift); lapic_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); @@ -821,6 +821,33 @@ static int __init lapic_init_clockevent(void) return 0; } +bool __init apic_needs_pit(void) +{ + /* + * If the frequencies are not known, PIT is required for both TSC + * and apic timer calibration. + */ + if (!tsc_khz || !cpu_khz) + return true; + + /* Is there an APIC at all? */ + if (!boot_cpu_has(X86_FEATURE_APIC)) + return true; + + /* Deadline timer is based on TSC so no further PIT action required */ + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return false; + + /* APIC timer disabled? */ + if (disable_apic_timer) + return true; + /* + * The APIC timer frequency is known already, no PIT calibration + * required. If unknown, let the PIT be initialized. + */ + return lapic_timer_period == 0; +} + static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = this_cpu_ptr(&lapic_events); @@ -839,7 +866,7 @@ static int __init calibrate_APIC_clock(void) */ if (!lapic_init_clockevent()) { apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", - lapic_timer_frequency); + lapic_timer_period); /* * Direct calibration methods must have an always running * local APIC timer, no need for broadcast timer. @@ -884,13 +911,13 @@ static int __init calibrate_APIC_clock(void) pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, &delta, &deltatsc); - lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + lapic_timer_period = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; lapic_init_clockevent(); apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", - lapic_timer_frequency); + lapic_timer_period); if (boot_cpu_has(X86_FEATURE_TSC)) { apic_printk(APIC_VERBOSE, "..... CPU clock speed is " @@ -901,13 +928,13 @@ static int __init calibrate_APIC_clock(void) apic_printk(APIC_VERBOSE, "..... host bus clock speed is " "%u.%04u MHz.\n", - lapic_timer_frequency / (1000000 / HZ), - lapic_timer_frequency % (1000000 / HZ)); + lapic_timer_period / (1000000 / HZ), + lapic_timer_period % (1000000 / HZ)); /* * Do a sanity check on the APIC calibration result */ - if (lapic_timer_frequency < (1000000 / HZ)) { + if (lapic_timer_period < (1000000 / HZ)) { local_irq_enable(); pr_warning("APIC frequency too slow, disabling apic timer\n"); return -1; @@ -1351,6 +1378,8 @@ void __init init_bsp_APIC(void) apic_write(APIC_LVT1, value); } +static void __init apic_bsp_setup(bool upmode); + /* Init the interrupt delivery mode for the BSP */ void __init apic_intr_mode_init(void) { @@ -1464,7 +1493,8 @@ static void apic_pending_intr_clear(void) if (queued) { if (boot_cpu_has(X86_FEATURE_TSC) && cpu_khz) { ntsc = rdtsc(); - max_loops = (cpu_khz << 10) - (ntsc - tsc); + max_loops = (long long)cpu_khz << 10; + max_loops -= ntsc - tsc; } else { max_loops--; } @@ -2040,21 +2070,32 @@ __visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs) entering_irq(); trace_spurious_apic_entry(vector); + inc_irq_stat(irq_spurious_count); + + /* + * If this is a spurious interrupt then do not acknowledge + */ + if (vector == SPURIOUS_APIC_VECTOR) { + /* See SDM vol 3 */ + pr_info("Spurious APIC interrupt (vector 0xFF) on CPU#%d, should never happen.\n", + smp_processor_id()); + goto out; + } + /* - * Check if this really is a spurious interrupt and ACK it - * if it is a vectored one. Just in case... - * Spurious interrupts should not be ACKed. + * If it is a vectored one, verify it's set in the ISR. If set, + * acknowledge it. */ v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1)); - if (v & (1 << (vector & 0x1f))) + if (v & (1 << (vector & 0x1f))) { + pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Acked\n", + vector, smp_processor_id()); ack_APIC_irq(); - - inc_irq_stat(irq_spurious_count); - - /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - pr_info("spurious APIC interrupt through vector %02x on CPU#%d, " - "should never happen.\n", vector, smp_processor_id()); - + } else { + pr_info("Spurious interrupt (vector 0x%02x) on CPU#%d. Not pending!\n", + vector, smp_processor_id()); + } +out: trace_spurious_apic_exit(vector); exiting_irq(); } @@ -2415,11 +2456,8 @@ static void __init apic_bsp_up_setup(void) /** * apic_bsp_setup - Setup function for local apic and io-apic * @upmode: Force UP mode (for APIC_init_uniprocessor) - * - * Returns: - * apic_id of BSP APIC */ -void __init apic_bsp_setup(bool upmode) +static void __init apic_bsp_setup(bool upmode) { connect_bsp_APIC(); if (upmode) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index bf083c3f1d73..bbdca603f94a 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -78,7 +78,7 @@ flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) int cpu = smp_processor_id(); if (cpu < BITS_PER_LONG) - clear_bit(cpu, &mask); + __clear_bit(cpu, &mask); _flat_send_IPI_mask(mask, vector); } @@ -92,7 +92,7 @@ static void flat_send_IPI_allbutself(int vector) unsigned long mask = cpumask_bits(cpu_online_mask)[0]; if (cpu < BITS_PER_LONG) - clear_bit(cpu, &mask); + __clear_bit(cpu, &mask); _flat_send_IPI_mask(mask, vector); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 53aa234a6803..c7bb6c69f21c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -58,6 +58,7 @@ #include <asm/acpi.h> #include <asm/dma.h> #include <asm/timer.h> +#include <asm/time.h> #include <asm/i8259.h> #include <asm/setup.h> #include <asm/irq_remapping.h> @@ -1893,6 +1894,50 @@ static int ioapic_set_affinity(struct irq_data *irq_data, return ret; } +/* + * Interrupt shutdown masks the ioapic pin, but the interrupt might already + * be in flight, but not yet serviced by the target CPU. That means + * __synchronize_hardirq() would return and claim that everything is calmed + * down. So free_irq() would proceed and deactivate the interrupt and free + * resources. + * + * Once the target CPU comes around to service it it will find a cleared + * vector and complain. While the spurious interrupt is harmless, the full + * release of resources might prevent the interrupt from being acknowledged + * which keeps the hardware in a weird state. + * + * Verify that the corresponding Remote-IRR bits are clear. + */ +static int ioapic_irq_get_chip_state(struct irq_data *irqd, + enum irqchip_irq_state which, + bool *state) +{ + struct mp_chip_data *mcd = irqd->chip_data; + struct IO_APIC_route_entry rentry; + struct irq_pin_list *p; + + if (which != IRQCHIP_STATE_ACTIVE) + return -EINVAL; + + *state = false; + raw_spin_lock(&ioapic_lock); + for_each_irq_pin(p, mcd->irq_2_pin) { + rentry = __ioapic_read_entry(p->apic, p->pin); + /* + * The remote IRR is only valid in level trigger mode. It's + * meaning is undefined for edge triggered interrupts and + * irrelevant because the IO-APIC treats them as fire and + * forget. + */ + if (rentry.irr && rentry.trigger) { + *state = true; + break; + } + } + raw_spin_unlock(&ioapic_lock); + return 0; +} + static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", .irq_startup = startup_ioapic_irq, @@ -1902,6 +1947,7 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_eoi = ioapic_ack_level, .irq_set_affinity = ioapic_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_get_irqchip_state = ioapic_irq_get_chip_state, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -1914,6 +1960,7 @@ static struct irq_chip ioapic_ir_chip __read_mostly = { .irq_eoi = ioapic_ir_ack_level, .irq_set_affinity = ioapic_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_get_irqchip_state = ioapic_irq_get_chip_state, .flags = IRQCHIP_SKIP_SET_WAKE, }; @@ -2083,6 +2130,9 @@ static inline void __init check_timer(void) unsigned long flags; int no_pin1 = 0; + if (!global_clock_event) + return; + local_irq_save(flags); /* diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 72a94401f9e0..7f7533462474 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Support of MSI, HPET and DMAR interrupts. * @@ -5,10 +6,6 @@ * Moved from arch/x86/kernel/apic/io_apic.c. * Jiang Liu <jiang.liu@linux.intel.com> * Convert to hierarchical irqdomain - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/mm.h> #include <linux/interrupt.h> @@ -373,14 +370,14 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id) return d; } -int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev, +int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, int dev_num) { struct irq_alloc_info info; init_irq_alloc_info(&info, NULL); info.type = X86_IRQ_ALLOC_TYPE_HPET; - info.hpet_data = dev; + info.hpet_data = hc; info.hpet_id = hpet_dev_id(domain); info.hpet_index = dev_num; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 3173e07d3791..fdacb864c3dd 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Local APIC related interfaces to support IOAPIC, MSI, etc. * @@ -5,10 +6,6 @@ * Moved from arch/x86/kernel/apic/io_apic.c. * Jiang Liu <jiang.liu@linux.intel.com> * Enable support of hierarchical irqdomains - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/interrupt.h> #include <linux/irq.h> @@ -343,7 +340,7 @@ static void clear_irq_vector(struct irq_data *irqd) trace_vector_clear(irqd->irq, vector, apicd->cpu, apicd->prev_vector, apicd->prev_cpu); - per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_UNUSED; + per_cpu(vector_irq, apicd->cpu)[vector] = VECTOR_SHUTDOWN; irq_matrix_free(vector_matrix, apicd->cpu, vector, managed); apicd->vector = 0; @@ -352,7 +349,7 @@ static void clear_irq_vector(struct irq_data *irqd) if (!vector) return; - per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_UNUSED; + per_cpu(vector_irq, apicd->prev_cpu)[vector] = VECTOR_SHUTDOWN; irq_matrix_free(vector_matrix, apicd->prev_cpu, vector, managed); apicd->prev_vector = 0; apicd->move_in_progress = 0; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 7685444a106b..609e499387a1 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -50,7 +50,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) cpumask_copy(tmpmsk, mask); /* If IPI should not be sent to self, clear current CPU */ if (apic_dest != APIC_DEST_ALLINC) - cpumask_clear_cpu(smp_processor_id(), tmpmsk); + __cpumask_clear_cpu(smp_processor_id(), tmpmsk); /* Collapse cpus in a cluster so a single IPI per cluster is sent */ for_each_cpu(cpu, tmpmsk) { diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 168543d077d7..da64452584b0 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -38,7 +38,6 @@ static void __used common(void) #endif BLANK(); - OFFSET(TASK_TI_flags, task_struct, thread_info.flags); OFFSET(TASK_addr_limit, task_struct, thread.addr_limit); BLANK(); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 5102bf7c8192..d7a1e5a9331c 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -24,6 +24,7 @@ obj-y += match.o obj-y += bugs.o obj-y += aperfmperf.o obj-y += cpuid-deps.o +obj-y += umwait.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o @@ -38,6 +39,7 @@ obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o +obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin.o obj-$(CONFIG_X86_MCE) += mce/ obj-$(CONFIG_MTRR) += mtrr/ @@ -47,6 +49,7 @@ obj-$(CONFIG_X86_CPU_RESCTRL) += resctrl/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o +obj-$(CONFIG_ACRN_GUEST) += acrn.o ifdef CONFIG_X86_FEATURE_NAMES quiet_cmd_mkcapflags = MKCAP $@ @@ -54,8 +57,7 @@ quiet_cmd_mkcapflags = MKCAP $@ cpufeature = $(src)/../../include/asm/cpufeatures.h -targets += capflags.c $(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE $(call if_changed,mkcapflags) endif -clean-files += capflags.c +targets += capflags.c diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c new file mode 100644 index 000000000000..676022e71791 --- /dev/null +++ b/arch/x86/kernel/cpu/acrn.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * ACRN detection support + * + * Copyright (C) 2019 Intel Corporation. All rights reserved. + * + * Jason Chen CJ <jason.cj.chen@intel.com> + * Zhao Yakui <yakui.zhao@intel.com> + * + */ + +#include <linux/interrupt.h> +#include <asm/acrn.h> +#include <asm/apic.h> +#include <asm/desc.h> +#include <asm/hypervisor.h> +#include <asm/irq_regs.h> + +static uint32_t __init acrn_detect(void) +{ + return hypervisor_cpuid_base("ACRNACRNACRN\0\0", 0); +} + +static void __init acrn_init_platform(void) +{ + /* Setup the IDT for ACRN hypervisor callback */ + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, acrn_hv_callback_vector); +} + +static bool acrn_x2apic_available(void) +{ + /* + * x2apic is not supported for now. Future enablement will have to check + * X86_FEATURE_X2APIC to determine whether x2apic is supported in the + * guest. + */ + return false; +} + +static void (*acrn_intr_handler)(void); + +__visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* + * The hypervisor requires that the APIC EOI should be acked. + * If the APIC EOI is not acked, the APIC ISR bit for the + * HYPERVISOR_CALLBACK_VECTOR will not be cleared and then it + * will block the interrupt whose vector is lower than + * HYPERVISOR_CALLBACK_VECTOR. + */ + entering_ack_irq(); + inc_irq_stat(irq_hv_callback_count); + + if (acrn_intr_handler) + acrn_intr_handler(); + + exiting_irq(); + set_irq_regs(old_regs); +} + +const __initconst struct hypervisor_x86 x86_hyper_acrn = { + .name = "ACRN", + .detect = acrn_detect, + .type = X86_HYPER_ACRN, + .init.init_platform = acrn_init_platform, + .init.x2apic_available = acrn_x2apic_available, +}; diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index e71a6ff8a67e..e2f319dc992d 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -13,6 +13,7 @@ #include <linux/percpu.h> #include <linux/cpufreq.h> #include <linux/smp.h> +#include <linux/sched/isolation.h> #include "cpu.h" @@ -85,6 +86,9 @@ unsigned int aperfmperf_get_khz(int cpu) if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return 0; + if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) + return 0; + aperfmperf_snapshot_cpu(cpu, ktime_get(), true); return per_cpu(samples.khz, cpu); } @@ -101,9 +105,12 @@ void arch_freq_prepare_all(void) if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return; - for_each_online_cpu(cpu) + for_each_online_cpu(cpu) { + if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) + continue; if (!aperfmperf_snapshot_cpu(cpu, now, false)) wait = true; + } if (wait) msleep(APERFMPERF_REFRESH_DELAY_MS); @@ -117,6 +124,9 @@ unsigned int arch_freq_get_on_cpu(int cpu) if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return 0; + if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) + return 0; + if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) return per_cpu(samples.khz, cpu); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 03b4cc0ec3a7..66ca906aa790 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -836,6 +836,16 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) } /* + * If SSBD is controlled by the SPEC_CTRL MSR, then set the proper + * bit in the mask to allow guests to use the mitigation even in the + * case where the host does not enable it. + */ + if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || + static_cpu_has(X86_FEATURE_AMD_SSBD)) { + x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; + } + + /* * We have three CPU feature flags that are in play here: * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass @@ -852,7 +862,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) x86_amd_ssb_disable(); } else { x86_spec_ctrl_base |= SPEC_CTRL_SSBD; - x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); } } diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 395d46f78582..c7503be92f35 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -658,8 +658,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) if (c->x86 < 0x17) { /* LLC is at the node level. */ per_cpu(cpu_llc_id, cpu) = node_id; - } else if (c->x86 == 0x17 && - c->x86_model >= 0 && c->x86_model <= 0x1F) { + } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { /* * LLC is at the core complex level. * Core complex ID is ApicId[3] for these processors. diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2c57fffebf9b..309b6b9b49d4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -366,6 +366,25 @@ out: cr4_clear_bits(X86_CR4_UMIP); } +DEFINE_STATIC_KEY_FALSE_RO(cr_pinning); +EXPORT_SYMBOL(cr_pinning); +unsigned long cr4_pinned_bits __ro_after_init; +EXPORT_SYMBOL(cr4_pinned_bits); + +/* + * Once CPU feature detection is finished (and boot params have been + * parsed), record any of the sensitive CR bits that are set, and + * enable CR pinning. + */ +static void __init setup_cr_pinning(void) +{ + unsigned long mask; + + mask = (X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP); + cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & mask; + static_key_enable(&cr_pinning.key); +} + /* * Protection Keys are not available in 32-bit mode. */ @@ -801,6 +820,30 @@ static void init_speculation_control(struct cpuinfo_x86 *c) } } +static void init_cqm(struct cpuinfo_x86 *c) +{ + if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + return; + } + + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = cpuid_ebx(0xf); + + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || + cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + u32 eax, ebx, ecx, edx; + + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); + + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; + } +} + void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -823,6 +866,12 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_7_0_EBX] = ebx; c->x86_capability[CPUID_7_ECX] = ecx; c->x86_capability[CPUID_7_EDX] = edx; + + /* Check valid sub-leaf index before accessing it */ + if (eax >= 1) { + cpuid_count(0x00000007, 1, &eax, &ebx, &ecx, &edx); + c->x86_capability[CPUID_7_1_EAX] = eax; + } } /* Extended state features: level 0x0000000d */ @@ -832,33 +881,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_D_1_EAX] = eax; } - /* Additional Intel-defined flags: level 0x0000000F */ - if (c->cpuid_level >= 0x0000000F) { - - /* QoS sub-leaf, EAX=0Fh, ECX=0 */ - cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_F_0_EDX] = edx; - - if (cpu_has(c, X86_FEATURE_CQM_LLC)) { - /* will be overridden if occupancy monitoring exists */ - c->x86_cache_max_rmid = ebx; - - /* QoS sub-leaf, EAX=0Fh, ECX=1 */ - cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_F_1_EDX] = edx; - - if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) || - ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) || - (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) { - c->x86_cache_max_rmid = ecx; - c->x86_cache_occ_scale = ebx; - } - } else { - c->x86_cache_max_rmid = -1; - c->x86_cache_occ_scale = -1; - } - } - /* AMD-defined flags: level 0x80000001 */ eax = cpuid_eax(0x80000000); c->extended_cpuid_level = eax; @@ -889,6 +911,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) init_scattered_cpuid_features(c); init_speculation_control(c); + init_cqm(c); /* * Clear/Set all flags overridden by options, after probe. @@ -1299,6 +1322,7 @@ static void validate_apic_and_package_id(struct cpuinfo_x86 *c) cpu, apicid, c->initial_apicid); } BUG_ON(topology_update_package_map(c->phys_proc_id, cpu)); + BUG_ON(topology_update_die_map(c->cpu_die_id, cpu)); #else c->logical_proc_id = 0; #endif @@ -1464,6 +1488,7 @@ void __init identify_boot_cpu(void) enable_sep_cpu(); #endif cpu_detect_tlb(&boot_cpu_data); + setup_cr_pinning(); } void identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 2c0bd38a44ab..b5353244749b 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -20,6 +20,7 @@ struct cpuid_dep { * but it's difficult to tell that to the init reference checker. */ static const struct cpuid_dep cpuid_deps[] = { + { X86_FEATURE_FXSR, X86_FEATURE_FPU }, { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, @@ -27,7 +28,11 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_CMOV, X86_FEATURE_FXSR }, + { X86_FEATURE_MMX, X86_FEATURE_FXSR }, + { X86_FEATURE_MMXEXT, X86_FEATURE_MMX }, { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, + { X86_FEATURE_XSAVE, X86_FEATURE_FXSR }, { X86_FEATURE_XMM, X86_FEATURE_FXSR }, { X86_FEATURE_XMM2, X86_FEATURE_XMM }, { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, @@ -59,6 +64,10 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_CQM_OCCUP_LLC, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC }, + { X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL }, {} }; diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 479ca4728de0..87e39ad8d873 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -32,6 +32,7 @@ extern const struct hypervisor_x86 x86_hyper_xen_pv; extern const struct hypervisor_x86 x86_hyper_xen_hvm; extern const struct hypervisor_x86 x86_hyper_kvm; extern const struct hypervisor_x86 x86_hyper_jailhouse; +extern const struct hypervisor_x86 x86_hyper_acrn; static const __initconst struct hypervisor_x86 * const hypervisors[] = { @@ -49,6 +50,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = #ifdef CONFIG_JAILHOUSE_GUEST &x86_hyper_jailhouse, #endif +#ifdef CONFIG_ACRN_GUEST + &x86_hyper_acrn, +#endif }; enum x86_hypervisor_type x86_hyper_type; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index f17c1a714779..8d6d92ebeb54 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -66,6 +66,32 @@ void check_mpx_erratum(struct cpuinfo_x86 *c) } } +/* + * Processors which have self-snooping capability can handle conflicting + * memory type across CPUs by snooping its own cache. However, there exists + * CPU models in which having conflicting memory types still leads to + * unpredictable behavior, machine check errors, or hangs. Clear this + * feature to prevent its use on machines with known erratas. + */ +static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c) +{ + switch (c->x86_model) { + case INTEL_FAM6_CORE_YONAH: + case INTEL_FAM6_CORE2_MEROM: + case INTEL_FAM6_CORE2_MEROM_L: + case INTEL_FAM6_CORE2_PENRYN: + case INTEL_FAM6_CORE2_DUNNINGTON: + case INTEL_FAM6_NEHALEM: + case INTEL_FAM6_NEHALEM_G: + case INTEL_FAM6_NEHALEM_EP: + case INTEL_FAM6_NEHALEM_EX: + case INTEL_FAM6_WESTMERE: + case INTEL_FAM6_WESTMERE_EP: + case INTEL_FAM6_SANDYBRIDGE: + setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP); + } +} + static bool ring3mwait_disabled __read_mostly; static int __init ring3mwait_disable(char *__unused) @@ -304,6 +330,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) } check_mpx_erratum(c); + check_memory_type_self_snoop_errata(c); /* * Get the number of SMT siblings early from the extended topology diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 785050af85e5..6ea7fdc82f3c 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -99,11 +99,6 @@ static struct smca_bank_name smca_names[] = { [SMCA_PCIE] = { "pcie", "PCI Express Unit" }, }; -static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init = -{ - [0 ... MAX_NR_BANKS - 1] = { [0 ... NR_BLOCKS - 1] = -1 } -}; - static const char *smca_get_name(enum smca_bank_types t) { if (t >= N_SMCA_BANK_TYPES) @@ -197,6 +192,9 @@ static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ +/* Map of banks that have more than MCA_MISC0 available. */ +static DEFINE_PER_CPU(u32, smca_misc_banks_map); + static void amd_threshold_interrupt(void); static void amd_deferred_error_interrupt(void); @@ -206,6 +204,28 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; +static void smca_set_misc_banks_map(unsigned int bank, unsigned int cpu) +{ + u32 low, high; + + /* + * For SMCA enabled processors, BLKPTR field of the first MISC register + * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). + */ + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + return; + + if (!(low & MCI_CONFIG_MCAX)) + return; + + if (rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high)) + return; + + if (low & MASK_BLKPTR_LO) + per_cpu(smca_misc_banks_map, cpu) |= BIT(bank); + +} + static void smca_configure(unsigned int bank, unsigned int cpu) { unsigned int i, hwid_mcatype; @@ -243,6 +263,8 @@ static void smca_configure(unsigned int bank, unsigned int cpu) wrmsr(smca_config, low, high); } + smca_set_misc_banks_map(bank, cpu); + /* Return early if this bank was already initialized. */ if (smca_banks[bank].hwid) return; @@ -453,50 +475,29 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } -static u32 smca_get_block_address(unsigned int bank, unsigned int block) +static u32 smca_get_block_address(unsigned int bank, unsigned int block, + unsigned int cpu) { - u32 low, high; - u32 addr = 0; - - if (smca_get_bank_type(bank) == SMCA_RESERVED) - return addr; - if (!block) return MSR_AMD64_SMCA_MCx_MISC(bank); - /* Check our cache first: */ - if (smca_bank_addrs[bank][block] != -1) - return smca_bank_addrs[bank][block]; - - /* - * For SMCA enabled processors, BLKPTR field of the first MISC register - * (MCx_MISC0) indicates presence of additional MISC regs set (MISC1-4). - */ - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) - goto out; - - if (!(low & MCI_CONFIG_MCAX)) - goto out; - - if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && - (low & MASK_BLKPTR_LO)) - addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); + if (!(per_cpu(smca_misc_banks_map, cpu) & BIT(bank))) + return 0; -out: - smca_bank_addrs[bank][block] = addr; - return addr; + return MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); } static u32 get_block_address(u32 current_addr, u32 low, u32 high, - unsigned int bank, unsigned int block) + unsigned int bank, unsigned int block, + unsigned int cpu) { u32 addr = 0, offset = 0; - if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS)) + if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS)) return addr; if (mce_flags.smca) - return smca_get_block_address(bank, block); + return smca_get_block_address(bank, block, cpu); /* Fall back to method we used for older processors: */ switch (block) { @@ -624,18 +625,19 @@ void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { - u32 low = 0, high = 0, address = 0; unsigned int bank, block, cpu = smp_processor_id(); + u32 low = 0, high = 0, address = 0; int offset = -1; - for (bank = 0; bank < mca_cfg.banks; ++bank) { + + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { if (mce_flags.smca) smca_configure(bank, cpu); disable_err_thresholding(c, bank); for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(address, low, high, bank, block); + address = get_block_address(address, low, high, bank, block, cpu); if (!address) break; @@ -973,7 +975,7 @@ static void amd_deferred_error_interrupt(void) { unsigned int bank; - for (bank = 0; bank < mca_cfg.banks; ++bank) + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) log_error_deferred(bank); } @@ -1014,7 +1016,7 @@ static void amd_threshold_interrupt(void) struct threshold_block *first_block = NULL, *block = NULL, *tmp = NULL; unsigned int bank, cpu = smp_processor_id(); - for (bank = 0; bank < mca_cfg.banks; ++bank) { + for (bank = 0; bank < this_cpu_read(mce_num_banks); ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; @@ -1201,7 +1203,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, u32 low, high; int err; - if ((bank >= mca_cfg.banks) || (block >= NR_BLOCKS)) + if ((bank >= per_cpu(mce_num_banks, cpu)) || (block >= NR_BLOCKS)) return 0; if (rdmsr_safe_on_cpu(cpu, address, &low, &high)) @@ -1252,7 +1254,7 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, if (err) goto out_free; recurse: - address = get_block_address(address, low, high, bank, ++block); + address = get_block_address(address, low, high, bank, ++block, cpu); if (!address) return 0; @@ -1435,7 +1437,7 @@ int mce_threshold_remove_device(unsigned int cpu) { unsigned int bank; - for (bank = 0; bank < mca_cfg.banks; ++bank) { + for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; threshold_remove_bank(cpu, bank); @@ -1456,14 +1458,14 @@ int mce_threshold_create_device(unsigned int cpu) if (bp) return 0; - bp = kcalloc(mca_cfg.banks, sizeof(struct threshold_bank *), + bp = kcalloc(per_cpu(mce_num_banks, cpu), sizeof(struct threshold_bank *), GFP_KERNEL); if (!bp) return -ENOMEM; per_cpu(threshold_banks, cpu) = bp; - for (bank = 0; bank < mca_cfg.banks; ++bank) { + for (bank = 0; bank < per_cpu(mce_num_banks, cpu); ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; err = threshold_create_bank(cpu, bank); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 282916f3b8d8..743370ee4983 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -65,7 +65,23 @@ static DEFINE_MUTEX(mce_sysfs_mutex); DEFINE_PER_CPU(unsigned, mce_exception_count); -struct mce_bank *mce_banks __read_mostly; +DEFINE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks); + +struct mce_bank { + u64 ctl; /* subevents to enable */ + bool init; /* initialise bank? */ +}; +static DEFINE_PER_CPU_READ_MOSTLY(struct mce_bank[MAX_NR_BANKS], mce_banks_array); + +#define ATTR_LEN 16 +/* One object for each MCE bank, shared by all CPUs */ +struct mce_bank_dev { + struct device_attribute attr; /* device attribute */ + char attrname[ATTR_LEN]; /* attribute name */ + u8 bank; /* bank number */ +}; +static struct mce_bank_dev mce_bank_devs[MAX_NR_BANKS]; + struct mce_vendor_flags mce_flags __read_mostly; struct mca_config mca_cfg __read_mostly = { @@ -675,6 +691,7 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); */ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); bool error_seen = false; struct mce m; int i; @@ -686,7 +703,7 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (flags & MCP_TIMESTAMP) m.tsc = rdtsc(); - for (i = 0; i < mca_cfg.banks; i++) { + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; @@ -788,7 +805,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, char *tmp; int i; - for (i = 0; i < mca_cfg.banks; i++) { + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { m->status = mce_rdmsrl(msr_ops.status(i)); if (!(m->status & MCI_STATUS_VAL)) continue; @@ -1068,7 +1085,7 @@ static void mce_clear_state(unsigned long *toclear) { int i; - for (i = 0; i < mca_cfg.banks; i++) { + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (test_bit(i, toclear)) mce_wrmsrl(msr_ops.status(i), 0); } @@ -1122,10 +1139,11 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, unsigned long *toclear, unsigned long *valid_banks, int no_way_out, int *worst) { + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; int severity, i; - for (i = 0; i < cfg->banks; i++) { + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { __clear_bit(i, toclear); if (!test_bit(i, valid_banks)) continue; @@ -1330,7 +1348,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) local_irq_enable(); if (kill_it || do_memory_failure(&m)) - force_sig(SIGBUS, current); + force_sig(SIGBUS); local_irq_disable(); ist_end_non_atomic(); } else { @@ -1463,27 +1481,29 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); -static int __mcheck_cpu_mce_banks_init(void) +static void __mcheck_cpu_mce_banks_init(void) { + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + u8 n_banks = this_cpu_read(mce_num_banks); int i; - mce_banks = kcalloc(MAX_NR_BANKS, sizeof(struct mce_bank), GFP_KERNEL); - if (!mce_banks) - return -ENOMEM; - - for (i = 0; i < MAX_NR_BANKS; i++) { + for (i = 0; i < n_banks; i++) { struct mce_bank *b = &mce_banks[i]; + /* + * Init them all, __mcheck_cpu_apply_quirks() is going to apply + * the required vendor quirks before + * __mcheck_cpu_init_clear_banks() does the final bank setup. + */ b->ctl = -1ULL; b->init = 1; } - return 0; } /* * Initialize Machine Checks for a CPU. */ -static int __mcheck_cpu_cap_init(void) +static void __mcheck_cpu_cap_init(void) { u64 cap; u8 b; @@ -1491,16 +1511,16 @@ static int __mcheck_cpu_cap_init(void) rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; - if (WARN_ON_ONCE(b > MAX_NR_BANKS)) + + if (b > MAX_NR_BANKS) { + pr_warn("CPU%d: Using only %u machine check banks out of %u\n", + smp_processor_id(), MAX_NR_BANKS, b); b = MAX_NR_BANKS; + } - mca_cfg.banks = max(mca_cfg.banks, b); + this_cpu_write(mce_num_banks, b); - if (!mce_banks) { - int err = __mcheck_cpu_mce_banks_init(); - if (err) - return err; - } + __mcheck_cpu_mce_banks_init(); /* Use accurate RIP reporting if available. */ if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) @@ -1508,8 +1528,6 @@ static int __mcheck_cpu_cap_init(void) if (cap & MCG_SER_P) mca_cfg.ser = 1; - - return 0; } static void __mcheck_cpu_init_generic(void) @@ -1536,9 +1554,10 @@ static void __mcheck_cpu_init_generic(void) static void __mcheck_cpu_init_clear_banks(void) { + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); int i; - for (i = 0; i < mca_cfg.banks; i++) { + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { struct mce_bank *b = &mce_banks[i]; if (!b->init) @@ -1549,6 +1568,33 @@ static void __mcheck_cpu_init_clear_banks(void) } /* + * Do a final check to see if there are any unused/RAZ banks. + * + * This must be done after the banks have been initialized and any quirks have + * been applied. + * + * Do not call this from any user-initiated flows, e.g. CPU hotplug or sysfs. + * Otherwise, a user who disables a bank will not be able to re-enable it + * without a system reboot. + */ +static void __mcheck_cpu_check_banks(void) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + u64 msrval; + int i; + + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { + struct mce_bank *b = &mce_banks[i]; + + if (!b->init) + continue; + + rdmsrl(msr_ops.ctl(i), msrval); + b->init = !!msrval; + } +} + +/* * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM * Vol 3B Table 15-20). But this confuses both the code that determines @@ -1579,6 +1625,7 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) /* Add per CPU specific workarounds here */ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); struct mca_config *cfg = &mca_cfg; if (c->x86_vendor == X86_VENDOR_UNKNOWN) { @@ -1588,7 +1635,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && cfg->banks > 4) { + if (c->x86 == 15 && this_cpu_read(mce_num_banks) > 4) { /* * disable GART TBL walk error reporting, which * trips off incorrectly with the IOMMU & 3ware @@ -1607,7 +1654,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) * Various K7s with broken bank 0 around. Always disable * by default. */ - if (c->x86 == 6 && cfg->banks > 0) + if (c->x86 == 6 && this_cpu_read(mce_num_banks) > 0) mce_banks[0].ctl = 0; /* @@ -1629,7 +1676,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) * valid event later, merely don't write CTL0. */ - if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0) + if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0) mce_banks[0].init = 0; /* @@ -1815,7 +1862,9 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) if (!mce_available(c)) return; - if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { + __mcheck_cpu_cap_init(); + + if (__mcheck_cpu_apply_quirks(c) < 0) { mca_cfg.disabled = 1; return; } @@ -1832,6 +1881,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_clear_banks(); + __mcheck_cpu_check_banks(); __mcheck_cpu_setup_timer(); } @@ -1863,7 +1913,7 @@ static void __mce_disable_bank(void *arg) void mce_disable_bank(int bank) { - if (bank >= mca_cfg.banks) { + if (bank >= this_cpu_read(mce_num_banks)) { pr_warn(FW_BUG "Ignoring request to disable invalid MCA bank %d.\n", bank); @@ -1949,9 +1999,10 @@ int __init mcheck_init(void) */ static void mce_disable_error_reporting(void) { + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); int i; - for (i = 0; i < mca_cfg.banks; i++) { + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -2051,26 +2102,47 @@ static struct bus_type mce_subsys = { DEFINE_PER_CPU(struct device *, mce_device); -static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) +static inline struct mce_bank_dev *attr_to_bank(struct device_attribute *attr) { - return container_of(attr, struct mce_bank, attr); + return container_of(attr, struct mce_bank_dev, attr); } static ssize_t show_bank(struct device *s, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); + u8 bank = attr_to_bank(attr)->bank; + struct mce_bank *b; + + if (bank >= per_cpu(mce_num_banks, s->id)) + return -EINVAL; + + b = &per_cpu(mce_banks_array, s->id)[bank]; + + if (!b->init) + return -ENODEV; + + return sprintf(buf, "%llx\n", b->ctl); } static ssize_t set_bank(struct device *s, struct device_attribute *attr, const char *buf, size_t size) { + u8 bank = attr_to_bank(attr)->bank; + struct mce_bank *b; u64 new; if (kstrtou64(buf, 0, &new) < 0) return -EINVAL; - attr_to_bank(attr)->ctl = new; + if (bank >= per_cpu(mce_num_banks, s->id)) + return -EINVAL; + + b = &per_cpu(mce_banks_array, s->id)[bank]; + + if (!b->init) + return -ENODEV; + + b->ctl = new; mce_restart(); return size; @@ -2185,7 +2257,7 @@ static void mce_device_release(struct device *dev) kfree(dev); } -/* Per cpu device init. All of the cpus still share the same ctrl bank: */ +/* Per CPU device init. All of the CPUs still share the same bank device: */ static int mce_device_create(unsigned int cpu) { struct device *dev; @@ -2217,8 +2289,8 @@ static int mce_device_create(unsigned int cpu) if (err) goto error; } - for (j = 0; j < mca_cfg.banks; j++) { - err = device_create_file(dev, &mce_banks[j].attr); + for (j = 0; j < per_cpu(mce_num_banks, cpu); j++) { + err = device_create_file(dev, &mce_bank_devs[j].attr); if (err) goto error2; } @@ -2228,7 +2300,7 @@ static int mce_device_create(unsigned int cpu) return 0; error2: while (--j >= 0) - device_remove_file(dev, &mce_banks[j].attr); + device_remove_file(dev, &mce_bank_devs[j].attr); error: while (--i >= 0) device_remove_file(dev, mce_device_attrs[i]); @@ -2249,8 +2321,8 @@ static void mce_device_remove(unsigned int cpu) for (i = 0; mce_device_attrs[i]; i++) device_remove_file(dev, mce_device_attrs[i]); - for (i = 0; i < mca_cfg.banks; i++) - device_remove_file(dev, &mce_banks[i].attr); + for (i = 0; i < per_cpu(mce_num_banks, cpu); i++) + device_remove_file(dev, &mce_bank_devs[i].attr); device_unregister(dev); cpumask_clear_cpu(cpu, mce_device_initialized); @@ -2271,6 +2343,7 @@ static void mce_disable_cpu(void) static void mce_reenable_cpu(void) { + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); int i; if (!mce_available(raw_cpu_ptr(&cpu_info))) @@ -2278,7 +2351,7 @@ static void mce_reenable_cpu(void) if (!cpuhp_tasks_frozen) cmci_reenable(); - for (i = 0; i < mca_cfg.banks; i++) { + for (i = 0; i < this_cpu_read(mce_num_banks); i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -2328,10 +2401,12 @@ static __init void mce_init_banks(void) { int i; - for (i = 0; i < mca_cfg.banks; i++) { - struct mce_bank *b = &mce_banks[i]; + for (i = 0; i < MAX_NR_BANKS; i++) { + struct mce_bank_dev *b = &mce_bank_devs[i]; struct device_attribute *a = &b->attr; + b->bank = i; + sysfs_attr_init(&a->attr); a->attr.name = b->attrname; snprintf(b->attrname, ATTR_LEN, "bank%d", i); @@ -2441,22 +2516,16 @@ static int fake_panic_set(void *data, u64 val) DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set, "%llu\n"); -static int __init mcheck_debugfs_init(void) +static void __init mcheck_debugfs_init(void) { - struct dentry *dmce, *ffake_panic; + struct dentry *dmce; dmce = mce_get_debugfs_dir(); - if (!dmce) - return -ENOMEM; - ffake_panic = debugfs_create_file_unsafe("fake_panic", 0444, dmce, - NULL, &fake_panic_fops); - if (!ffake_panic) - return -ENOMEM; - - return 0; + debugfs_create_file_unsafe("fake_panic", 0444, dmce, NULL, + &fake_panic_fops); } #else -static int __init mcheck_debugfs_init(void) { return -EINVAL; } +static void __init mcheck_debugfs_init(void) { } #endif DEFINE_STATIC_KEY_FALSE(mcsafe_key); @@ -2464,8 +2533,6 @@ EXPORT_SYMBOL_GPL(mcsafe_key); static int __init mcheck_late_init(void) { - pr_info("Using %d MCE banks\n", mca_cfg.banks); - if (mca_cfg.recovery) static_branch_inc(&mcsafe_key); diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 5d108f70f315..1f30117b24ba 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -645,7 +645,6 @@ static const struct file_operations readme_fops = { static struct dfs_node { char *name; - struct dentry *d; const struct file_operations *fops; umode_t perm; } dfs_fls[] = { @@ -659,49 +658,23 @@ static struct dfs_node { { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH }, }; -static int __init debugfs_init(void) +static void __init debugfs_init(void) { unsigned int i; dfs_inj = debugfs_create_dir("mce-inject", NULL); - if (!dfs_inj) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) { - dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name, - dfs_fls[i].perm, - dfs_inj, - &i_mce, - dfs_fls[i].fops); - - if (!dfs_fls[i].d) - goto err_dfs_add; - } - - return 0; - -err_dfs_add: - while (i-- > 0) - debugfs_remove(dfs_fls[i].d); - debugfs_remove(dfs_inj); - dfs_inj = NULL; - - return -ENODEV; + for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) + debugfs_create_file(dfs_fls[i].name, dfs_fls[i].perm, dfs_inj, + &i_mce, dfs_fls[i].fops); } static int __init inject_init(void) { - int err; - if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) return -ENOMEM; - err = debugfs_init(); - if (err) { - free_cpumask_var(mce_inject_cpumask); - return err; - } + debugfs_init(); register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify"); mce_register_injector_chain(&inject_nb); diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index a34b55baa7aa..43031db429d2 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -22,17 +22,8 @@ enum severity_level { extern struct blocking_notifier_head x86_mce_decoder_chain; -#define ATTR_LEN 16 #define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ -/* One object for each MCE bank, shared by all CPUs */ -struct mce_bank { - u64 ctl; /* subevents to enable */ - unsigned char init; /* initialise bank? */ - struct device_attribute attr; /* device attribute */ - char attrname[ATTR_LEN]; /* attribute name */ -}; - struct mce_evt_llist { struct llist_node llnode; struct mce mce; @@ -47,7 +38,6 @@ struct llist_node *mce_gen_pool_prepare_records(void); extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); -extern struct mce_bank *mce_banks; extern mce_banks_t mce_banks_ce_disabled; #ifdef CONFIG_X86_MCE_INTEL @@ -128,7 +118,6 @@ struct mca_config { bios_cmci_threshold : 1, __reserved : 59; - u8 banks; s8 bootlog; int tolerant; int monarch_timeout; @@ -137,6 +126,7 @@ struct mca_config { }; extern struct mca_config mca_cfg; +DECLARE_PER_CPU_READ_MOSTLY(unsigned int, mce_num_banks); struct mce_vendor_flags { /* diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index 2d33a26d257e..210f1f5db5f7 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -400,21 +400,13 @@ static const struct file_operations severities_coverage_fops = { static int __init severities_debugfs_init(void) { - struct dentry *dmce, *fsev; + struct dentry *dmce; dmce = mce_get_debugfs_dir(); - if (!dmce) - goto err_out; - - fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL, - &severities_coverage_fops); - if (!fsev) - goto err_out; + debugfs_create_file("severities-coverage", 0444, dmce, NULL, + &severities_coverage_fops); return 0; - -err_out: - return -ENOMEM; } late_initcall(severities_debugfs_init); #endif /* CONFIG_DEBUG_FS */ diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 70a04436380e..cb0fdcaf1415 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -789,13 +789,16 @@ static struct syscore_ops mc_syscore_ops = { .resume = mc_bp_resume, }; -static int mc_cpu_online(unsigned int cpu) +static int mc_cpu_starting(unsigned int cpu) { - struct device *dev; - - dev = get_cpu_device(cpu); microcode_update_cpu(cpu); pr_debug("CPU%d added\n", cpu); + return 0; +} + +static int mc_cpu_online(unsigned int cpu) +{ + struct device *dev = get_cpu_device(cpu); if (sysfs_create_group(&dev->kobj, &mc_attr_group)) pr_err("Failed to create group for CPU%d\n", cpu); @@ -872,6 +875,8 @@ int __init microcode_init(void) goto out_ucode_group; register_syscore_ops(&mc_syscore_ops); + cpuhp_setup_state_nocalls(CPUHP_AP_MICROCODE_LOADER, "x86/microcode:starting", + mc_cpu_starting, NULL); cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", mc_cpu_online, mc_cpu_down_prep); diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index d0dfb892c72f..aed45b8895d5 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -4,6 +4,8 @@ # Generate the x86_cap/bug_flags[] arrays from include/asm/cpufeatures.h # +set -e + IN=$1 OUT=$2 diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 7df29f08871b..062f77279ce3 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -17,6 +17,7 @@ #include <linux/irq.h> #include <linux/kexec.h> #include <linux/i8253.h> +#include <linux/random.h> #include <asm/processor.h> #include <asm/hypervisor.h> #include <asm/hyperv-tlfs.h> @@ -80,6 +81,7 @@ __visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs) inc_irq_stat(hyperv_stimer0_count); if (hv_stimer0_handler) hv_stimer0_handler(); + add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0); ack_APIC_irq(); exiting_irq(); @@ -89,7 +91,7 @@ __visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs) int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void)) { *vector = HYPERV_STIMER0_VECTOR; - *irq = 0; /* Unused on x86/x64 */ + *irq = -1; /* Unused on x86/x64 */ hv_stimer0_handler = handler; return 0; } @@ -266,9 +268,9 @@ static void __init ms_hyperv_init_platform(void) rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); - lapic_timer_frequency = hv_lapic_frequency; + lapic_timer_period = hv_lapic_frequency; pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n", - lapic_timer_frequency); + lapic_timer_period); } register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 9356c1c9024d..aa5c064a6a22 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -743,7 +743,15 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; write_cr0(cr0); - wbinvd(); + + /* + * Cache flushing is the most time-consuming step when programming + * the MTRRs. Fortunately, as per the Intel Software Development + * Manual, we can skip it if the processor supports cache self- + * snooping. + */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (boot_cpu_has(X86_FEATURE_PGE)) { @@ -760,7 +768,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); - wbinvd(); + + /* Again, only flush caches if we have to. */ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); } static void post_set(void) __releases(set_atomicity_lock) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 7ee93125a211..397206f23d14 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -360,6 +360,9 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) struct list_head *head; struct rdtgroup *entry; + if (!is_mbm_local_enabled()) + return; + r_mba = &rdt_resources_all[RDT_RESOURCE_MBA]; closid = rgrp->closid; rmid = rgrp->mon.rmid; diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 604c0e3bcc83..d7623e1b927d 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -431,11 +431,7 @@ static int pseudo_lock_fn(void *_rdtgrp) #else register unsigned int line_size asm("esi"); register unsigned int size asm("edi"); -#ifdef CONFIG_X86_64 - register void *mem_r asm("rbx"); -#else - register void *mem_r asm("ebx"); -#endif /* CONFIG_X86_64 */ + register void *mem_r asm(_ASM_BX); #endif /* CONFIG_KASAN */ /* @@ -1503,7 +1499,7 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) * may be scheduled elsewhere and invalidate entries in the * pseudo-locked region. */ - if (!cpumask_subset(¤t->cpus_allowed, &plr->d->cpu_mask)) { + if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) { mutex_unlock(&rdtgroup_mutex); return -EINVAL; } diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 2f48f208f7e2..bf3034994754 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -796,8 +796,12 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { struct rdt_resource *r = of->kn->parent->priv; - u32 sw_shareable = 0, hw_shareable = 0; - u32 exclusive = 0, pseudo_locked = 0; + /* + * Use unsigned long even though only 32 bits are used to ensure + * test_bit() is used safely. + */ + unsigned long sw_shareable = 0, hw_shareable = 0; + unsigned long exclusive = 0, pseudo_locked = 0; struct rdt_domain *dom; int i, hwb, swb, excl, psl; enum rdtgrp_mode mode; @@ -842,10 +846,10 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of, } for (i = r->cache.cbm_len - 1; i >= 0; i--) { pseudo_locked = dom->plr ? dom->plr->cbm : 0; - hwb = test_bit(i, (unsigned long *)&hw_shareable); - swb = test_bit(i, (unsigned long *)&sw_shareable); - excl = test_bit(i, (unsigned long *)&exclusive); - psl = test_bit(i, (unsigned long *)&pseudo_locked); + hwb = test_bit(i, &hw_shareable); + swb = test_bit(i, &sw_shareable); + excl = test_bit(i, &exclusive); + psl = test_bit(i, &pseudo_locked); if (hwb && swb) seq_putc(seq, 'X'); else if (hwb && !swb) @@ -2484,28 +2488,21 @@ out_destroy: * modification to the CBM if the default does not satisfy the * requirements. */ -static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r) +static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) { - /* - * Convert the u32 _val to an unsigned long required by all the bit - * operations within this function. No more than 32 bits of this - * converted value can be accessed because all bit operations are - * additionally provided with cbm_len that is initialized during - * hardware enumeration using five bits from the EAX register and - * thus never can exceed 32 bits. - */ - unsigned long *val = (unsigned long *)_val; unsigned int cbm_len = r->cache.cbm_len; unsigned long first_bit, zero_bit; + unsigned long val = _val; - if (*val == 0) - return; + if (!val) + return 0; - first_bit = find_first_bit(val, cbm_len); - zero_bit = find_next_zero_bit(val, cbm_len, first_bit); + first_bit = find_first_bit(&val, cbm_len); + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); /* Clear any remaining bits to ensure contiguous region */ - bitmap_clear(val, zero_bit, cbm_len - zero_bit); + bitmap_clear(&val, zero_bit, cbm_len - zero_bit); + return (u32)val; } /* @@ -2534,7 +2531,12 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, if (closid_allocated(i) && i != closid) { mode = rdtgroup_mode_by_closid(i); if (mode == RDT_MODE_PSEUDO_LOCKSETUP) - break; + /* + * ctrl values for locksetup aren't relevant + * until the schemata is written, and the mode + * becomes RDT_MODE_PSEUDO_LOCKED. + */ + continue; /* * If CDP is active include peer domain's * usage to ensure there is no overlap @@ -2558,7 +2560,7 @@ static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, * Force the initial CBM to be valid, user can * modify the CBM based on system availability. */ - cbm_ensure_valid(&d->new_ctrl, r); + d->new_ctrl = cbm_ensure_valid(d->new_ctrl, r); /* * Assign the u32 CBM to an unsigned long to ensure that * bitmap_weight() does not access out-of-bound memory. diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 94aa1c72ca98..adf9b71386ef 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -26,6 +26,10 @@ struct cpuid_bit { static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, + { X86_FEATURE_CQM_OCCUP_LLC, CPUID_EDX, 0, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_TOTAL, CPUID_EDX, 1, 0x0000000f, 1 }, + { X86_FEATURE_CQM_MBM_LOCAL, CPUID_EDX, 2, 0x0000000f, 1 }, { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 8f6c784141d1..ee48c3fc8a65 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -15,33 +15,66 @@ /* leaf 0xb SMT level */ #define SMT_LEVEL 0 -/* leaf 0xb sub-leaf types */ +/* extended topology sub-leaf types */ #define INVALID_TYPE 0 #define SMT_TYPE 1 #define CORE_TYPE 2 +#define DIE_TYPE 5 #define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) #define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) #define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) -int detect_extended_topology_early(struct cpuinfo_x86 *c) -{ #ifdef CONFIG_SMP +unsigned int __max_die_per_package __read_mostly = 1; +EXPORT_SYMBOL(__max_die_per_package); + +/* + * Check if given CPUID extended toplogy "leaf" is implemented + */ +static int check_extended_topology_leaf(int leaf) +{ unsigned int eax, ebx, ecx, edx; - if (c->cpuid_level < 0xb) + cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + + if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) return -1; - cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + return 0; +} +/* + * Return best CPUID Extended Toplogy Leaf supported + */ +static int detect_extended_topology_leaf(struct cpuinfo_x86 *c) +{ + if (c->cpuid_level >= 0x1f) { + if (check_extended_topology_leaf(0x1f) == 0) + return 0x1f; + } - /* - * check if the cpuid leaf 0xb is actually implemented. - */ - if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) + if (c->cpuid_level >= 0xb) { + if (check_extended_topology_leaf(0xb) == 0) + return 0xb; + } + + return -1; +} +#endif + +int detect_extended_topology_early(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned int eax, ebx, ecx, edx; + int leaf; + + leaf = detect_extended_topology_leaf(c); + if (leaf < 0) return -1; set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); + cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); /* * initial apic id, which also represents 32-bit extended x2apic id. */ @@ -52,7 +85,7 @@ int detect_extended_topology_early(struct cpuinfo_x86 *c) } /* - * Check for extended topology enumeration cpuid leaf 0xb and if it + * Check for extended topology enumeration cpuid leaf, and if it * exists, use it for populating initial_apicid and cpu topology * detection. */ @@ -60,22 +93,28 @@ int detect_extended_topology(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP unsigned int eax, ebx, ecx, edx, sub_index; - unsigned int ht_mask_width, core_plus_mask_width; + unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width; unsigned int core_select_mask, core_level_siblings; + unsigned int die_select_mask, die_level_siblings; + int leaf; - if (detect_extended_topology_early(c) < 0) + leaf = detect_extended_topology_leaf(c); + if (leaf < 0) return -1; /* * Populate HT related information from sub-leaf level 0. */ - cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + cpuid_count(leaf, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + c->initial_apicid = edx; core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); sub_index = 1; do { - cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); + cpuid_count(leaf, sub_index, &eax, &ebx, &ecx, &edx); /* * Check for the Core type in the implemented sub leaves. @@ -83,23 +122,34 @@ int detect_extended_topology(struct cpuinfo_x86 *c) if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); - break; + die_level_siblings = core_level_siblings; + die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + } + if (LEAFB_SUBTYPE(ecx) == DIE_TYPE) { + die_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); } sub_index++; } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; - - c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, ht_mask_width) - & core_select_mask; - c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, core_plus_mask_width); + die_select_mask = (~(-1 << die_plus_mask_width)) >> + core_plus_mask_width; + + c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid, + ht_mask_width) & core_select_mask; + c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid, + core_plus_mask_width) & die_select_mask; + c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, + die_plus_mask_width); /* * Reinit the apicid, now that we have extended initial_apicid. */ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); c->x86_max_cores = (core_level_siblings / smp_num_siblings); + __max_die_per_package = (die_level_siblings / core_level_siblings); #endif return 0; } diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c new file mode 100644 index 000000000000..6a204e7336c1 --- /dev/null +++ b/arch/x86/kernel/cpu/umwait.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/syscore_ops.h> +#include <linux/suspend.h> +#include <linux/cpu.h> + +#include <asm/msr.h> + +#define UMWAIT_C02_ENABLE 0 + +#define UMWAIT_CTRL_VAL(max_time, c02_disable) \ + (((max_time) & MSR_IA32_UMWAIT_CONTROL_TIME_MASK) | \ + ((c02_disable) & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE)) + +/* + * Cache IA32_UMWAIT_CONTROL MSR. This is a systemwide control. By default, + * umwait max time is 100000 in TSC-quanta and C0.2 is enabled + */ +static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE); + +/* + * Serialize access to umwait_control_cached and IA32_UMWAIT_CONTROL MSR in + * the sysfs write functions. + */ +static DEFINE_MUTEX(umwait_lock); + +static void umwait_update_control_msr(void * unused) +{ + lockdep_assert_irqs_disabled(); + wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0); +} + +/* + * The CPU hotplug callback sets the control MSR to the global control + * value. + * + * Disable interrupts so the read of umwait_control_cached and the WRMSR + * are protected against a concurrent sysfs write. Otherwise the sysfs + * write could update the cached value after it had been read on this CPU + * and issue the IPI before the old value had been written. The IPI would + * interrupt, write the new value and after return from IPI the previous + * value would be written by this CPU. + * + * With interrupts disabled the upcoming CPU either sees the new control + * value or the IPI is updating this CPU to the new control value after + * interrupts have been reenabled. + */ +static int umwait_cpu_online(unsigned int cpu) +{ + local_irq_disable(); + umwait_update_control_msr(NULL); + local_irq_enable(); + return 0; +} + +/* + * On resume, restore IA32_UMWAIT_CONTROL MSR on the boot processor which + * is the only active CPU at this time. The MSR is set up on the APs via the + * CPU hotplug callback. + * + * This function is invoked on resume from suspend and hibernation. On + * resume from suspend the restore should be not required, but we neither + * trust the firmware nor does it matter if the same value is written + * again. + */ +static void umwait_syscore_resume(void) +{ + umwait_update_control_msr(NULL); +} + +static struct syscore_ops umwait_syscore_ops = { + .resume = umwait_syscore_resume, +}; + +/* sysfs interface */ + +/* + * When bit 0 in IA32_UMWAIT_CONTROL MSR is 1, C0.2 is disabled. + * Otherwise, C0.2 is enabled. + */ +static inline bool umwait_ctrl_c02_enabled(u32 ctrl) +{ + return !(ctrl & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE); +} + +static inline u32 umwait_ctrl_max_time(u32 ctrl) +{ + return ctrl & MSR_IA32_UMWAIT_CONTROL_TIME_MASK; +} + +static inline void umwait_update_control(u32 maxtime, bool c02_enable) +{ + u32 ctrl = maxtime & MSR_IA32_UMWAIT_CONTROL_TIME_MASK; + + if (!c02_enable) + ctrl |= MSR_IA32_UMWAIT_CONTROL_C02_DISABLE; + + WRITE_ONCE(umwait_control_cached, ctrl); + /* Propagate to all CPUs */ + on_each_cpu(umwait_update_control_msr, NULL, 1); +} + +static ssize_t +enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + u32 ctrl = READ_ONCE(umwait_control_cached); + + return sprintf(buf, "%d\n", umwait_ctrl_c02_enabled(ctrl)); +} + +static ssize_t enable_c02_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + bool c02_enable; + u32 ctrl; + int ret; + + ret = kstrtobool(buf, &c02_enable); + if (ret) + return ret; + + mutex_lock(&umwait_lock); + + ctrl = READ_ONCE(umwait_control_cached); + if (c02_enable != umwait_ctrl_c02_enabled(ctrl)) + umwait_update_control(ctrl, c02_enable); + + mutex_unlock(&umwait_lock); + + return count; +} +static DEVICE_ATTR_RW(enable_c02); + +static ssize_t +max_time_show(struct device *kobj, struct device_attribute *attr, char *buf) +{ + u32 ctrl = READ_ONCE(umwait_control_cached); + + return sprintf(buf, "%u\n", umwait_ctrl_max_time(ctrl)); +} + +static ssize_t max_time_store(struct device *kobj, + struct device_attribute *attr, + const char *buf, size_t count) +{ + u32 max_time, ctrl; + int ret; + + ret = kstrtou32(buf, 0, &max_time); + if (ret) + return ret; + + /* bits[1:0] must be zero */ + if (max_time & ~MSR_IA32_UMWAIT_CONTROL_TIME_MASK) + return -EINVAL; + + mutex_lock(&umwait_lock); + + ctrl = READ_ONCE(umwait_control_cached); + if (max_time != umwait_ctrl_max_time(ctrl)) + umwait_update_control(max_time, umwait_ctrl_c02_enabled(ctrl)); + + mutex_unlock(&umwait_lock); + + return count; +} +static DEVICE_ATTR_RW(max_time); + +static struct attribute *umwait_attrs[] = { + &dev_attr_enable_c02.attr, + &dev_attr_max_time.attr, + NULL +}; + +static struct attribute_group umwait_attr_group = { + .attrs = umwait_attrs, + .name = "umwait_control", +}; + +static int __init umwait_init(void) +{ + struct device *dev; + int ret; + + if (!boot_cpu_has(X86_FEATURE_WAITPKG)) + return -ENODEV; + + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online", + umwait_cpu_online, NULL); + + register_syscore_ops(&umwait_syscore_ops); + + /* + * Add umwait control interface. Ignore failure, so at least the + * default values are set up in case the machine manages to boot. + */ + dev = cpu_subsys.dev_root; + return sysfs_create_group(&dev->kobj, &umwait_attr_group); +} +device_initcall(umwait_init); diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 0eda91f8eeac..3c648476d4fb 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -157,7 +157,7 @@ static void __init vmware_platform_setup(void) #ifdef CONFIG_X86_LOCAL_APIC /* Skip lapic calibration since we know the bus frequency. */ - lapic_timer_frequency = ecx / HZ; + lapic_timer_period = ecx / HZ; pr_info("Host bus clock speed read from hypervisor : %u Hz\n", ecx); #endif diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c new file mode 100644 index 000000000000..8e6f2f4b4afe --- /dev/null +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/sched.h> +#include <linux/sched/clock.h> + +#include <asm/cpufeature.h> + +#include "cpu.h" + +#define MSR_ZHAOXIN_FCR57 0x00001257 + +#define ACE_PRESENT (1 << 6) +#define ACE_ENABLED (1 << 7) +#define ACE_FCR (1 << 7) /* MSR_ZHAOXIN_FCR */ + +#define RNG_PRESENT (1 << 2) +#define RNG_ENABLED (1 << 3) +#define RNG_ENABLE (1 << 8) /* MSR_ZHAOXIN_RNG */ + +#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000 +#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000 +#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000 +#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001 +#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002 +#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020 + +static void init_zhaoxin_cap(struct cpuinfo_x86 *c) +{ + u32 lo, hi; + + /* Test for Extended Feature Flags presence */ + if (cpuid_eax(0xC0000000) >= 0xC0000001) { + u32 tmp = cpuid_edx(0xC0000001); + + /* Enable ACE unit, if present and disabled */ + if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) { + rdmsr(MSR_ZHAOXIN_FCR57, lo, hi); + /* Enable ACE unit */ + lo |= ACE_FCR; + wrmsr(MSR_ZHAOXIN_FCR57, lo, hi); + pr_info("CPU: Enabled ACE h/w crypto\n"); + } + + /* Enable RNG unit, if present and disabled */ + if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) { + rdmsr(MSR_ZHAOXIN_FCR57, lo, hi); + /* Enable RNG unit */ + lo |= RNG_ENABLE; + wrmsr(MSR_ZHAOXIN_FCR57, lo, hi); + pr_info("CPU: Enabled h/w RNG\n"); + } + + /* + * Store Extended Feature Flags as word 5 of the CPU + * capability bit array + */ + c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001); + } + + if (c->x86 >= 0x6) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + + cpu_detect_cache_sizes(c); +} + +static void early_init_zhaoxin(struct cpuinfo_x86 *c) +{ + if (c->x86 >= 0x6) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSENTER32); +#endif + if (c->x86_power & (1 << 8)) { + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); + } + + if (c->cpuid_level >= 0x00000001) { + u32 eax, ebx, ecx, edx; + + cpuid(0x00000001, &eax, &ebx, &ecx, &edx); + /* + * If HTT (EDX[28]) is set EBX[16:23] contain the number of + * apicids which are reserved per package. Store the resulting + * shift value for the package management code. + */ + if (edx & (1U << 28)) + c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); + } + +} + +static void zhaoxin_detect_vmx_virtcap(struct cpuinfo_x86 *c) +{ + u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2; + + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high); + msr_ctl = vmx_msr_high | vmx_msr_low; + + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW) + set_cpu_cap(c, X86_FEATURE_TPR_SHADOW); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI) + set_cpu_cap(c, X86_FEATURE_VNMI); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) { + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, + vmx_msr_low, vmx_msr_high); + msr_ctl2 = vmx_msr_high | vmx_msr_low; + if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) && + (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)) + set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) + set_cpu_cap(c, X86_FEATURE_EPT); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID) + set_cpu_cap(c, X86_FEATURE_VPID); + } +} + +static void init_zhaoxin(struct cpuinfo_x86 *c) +{ + early_init_zhaoxin(c); + init_intel_cacheinfo(c); + detect_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + + if (c->cpuid_level > 9) { + unsigned int eax = cpuid_eax(10); + + /* + * Check for version and the number of counters + * Version(eax[7:0]) can't be 0; + * Counters(eax[15:8]) should be greater than 1; + */ + if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1)) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + } + + if (c->x86 >= 0x6) + init_zhaoxin_cap(c); +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); +#endif + + if (cpu_has(c, X86_FEATURE_VMX)) + zhaoxin_detect_vmx_virtcap(c); +} + +#ifdef CONFIG_X86_32 +static unsigned int +zhaoxin_size_cache(struct cpuinfo_x86 *c, unsigned int size) +{ + return size; +} +#endif + +static const struct cpu_dev zhaoxin_cpu_dev = { + .c_vendor = "zhaoxin", + .c_ident = { " Shanghai " }, + .c_early_init = early_init_zhaoxin, + .c_init = init_zhaoxin, +#ifdef CONFIG_X86_32 + .legacy_cache_size = zhaoxin_size_cache, +#endif + .c_x86_vendor = X86_VENDOR_ZHAOXIN, +}; + +cpu_dev_register(zhaoxin_cpu_dev); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 576b2e1bfc12..2bf70a2fed90 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -56,7 +56,6 @@ struct crash_memmap_data { */ crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); -unsigned long crash_zero_bytes; static inline void cpu_crash_vmclear_loaded_vmcss(void) { @@ -73,14 +72,6 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) { -#ifdef CONFIG_X86_32 - struct pt_regs fixed_regs; - - if (!user_mode(regs)) { - crash_fixup_ss_esp(&fixed_regs, regs); - regs = &fixed_regs; - } -#endif crash_save_cpu(regs, cpu); /* @@ -181,6 +172,9 @@ void native_machine_crash_shutdown(struct pt_regs *regs) } #ifdef CONFIG_KEXEC_FILE + +static unsigned long crash_zero_bytes; + static int get_nr_ram_ranges_callback(struct resource *res, void *arg) { unsigned int *nr_ranges = arg; @@ -381,6 +375,12 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd, memmap_entry_callback); + /* Add e820 reserved ranges */ + cmd.type = E820_TYPE_RESERVED; + flags = IORESOURCE_MEM; + walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, &cmd, + memmap_entry_callback); + /* Add crashk_low_res region */ if (crashk_low_res.end) { ei.addr = crashk_low_res.start; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 8f32e705a980..e69408bf664b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1063,10 +1063,10 @@ static unsigned long __init e820_type_to_iores_desc(struct e820_entry *entry) case E820_TYPE_NVS: return IORES_DESC_ACPI_NV_STORAGE; case E820_TYPE_PMEM: return IORES_DESC_PERSISTENT_MEMORY; case E820_TYPE_PRAM: return IORES_DESC_PERSISTENT_MEMORY_LEGACY; + case E820_TYPE_RESERVED: return IORES_DESC_RESERVED; case E820_TYPE_RESERVED_KERN: /* Fall-through: */ case E820_TYPE_RAM: /* Fall-through: */ case E820_TYPE_UNUSABLE: /* Fall-through: */ - case E820_TYPE_RESERVED: /* Fall-through: */ default: return IORES_DESC_NONE; } } diff --git a/arch/x86/kernel/eisa.c b/arch/x86/kernel/eisa.c index e8c8c5d78dbd..e963344b0449 100644 --- a/arch/x86/kernel/eisa.c +++ b/arch/x86/kernel/eisa.c @@ -1,7 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * EISA specific code - * - * This file is licensed under the GPL V2 */ #include <linux/ioport.h> #include <linux/eisa.h> diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 466fca686fb9..12c70840980e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -43,18 +43,6 @@ static DEFINE_PER_CPU(bool, in_kernel_fpu); */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); -static void kernel_fpu_disable(void) -{ - WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); - this_cpu_write(in_kernel_fpu, true); -} - -static void kernel_fpu_enable(void) -{ - WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); - this_cpu_write(in_kernel_fpu, false); -} - static bool kernel_fpu_disabled(void) { return this_cpu_read(in_kernel_fpu); @@ -94,42 +82,33 @@ bool irq_fpu_usable(void) } EXPORT_SYMBOL(irq_fpu_usable); -static void __kernel_fpu_begin(void) +void kernel_fpu_begin(void) { - struct fpu *fpu = ¤t->thread.fpu; + preempt_disable(); WARN_ON_FPU(!irq_fpu_usable()); + WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); - kernel_fpu_disable(); + this_cpu_write(in_kernel_fpu, true); - if (current->mm) { - if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { - set_thread_flag(TIF_NEED_FPU_LOAD); - /* - * Ignore return value -- we don't care if reg state - * is clobbered. - */ - copy_fpregs_to_fpstate(fpu); - } + if (!(current->flags & PF_KTHREAD) && + !test_thread_flag(TIF_NEED_FPU_LOAD)) { + set_thread_flag(TIF_NEED_FPU_LOAD); + /* + * Ignore return value -- we don't care if reg state + * is clobbered. + */ + copy_fpregs_to_fpstate(¤t->thread.fpu); } __cpu_invalidate_fpregs_state(); } - -static void __kernel_fpu_end(void) -{ - kernel_fpu_enable(); -} - -void kernel_fpu_begin(void) -{ - preempt_disable(); - __kernel_fpu_begin(); -} EXPORT_SYMBOL_GPL(kernel_fpu_begin); void kernel_fpu_end(void) { - __kernel_fpu_end(); + WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); + + this_cpu_write(in_kernel_fpu, false); preempt_enable(); } EXPORT_SYMBOL_GPL(kernel_fpu_end); @@ -155,7 +134,6 @@ void fpu__save(struct fpu *fpu) trace_x86_fpu_after_save(fpu); fpregs_unlock(); } -EXPORT_SYMBOL_GPL(fpu__save); /* * Legacy x87 fpstate state init: diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index ef0030e3fe6b..6ce7e0a23268 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -204,12 +204,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) */ if (!boot_cpu_has(X86_FEATURE_FPU)) { - /* - * Disable xsave as we do not support it if i387 - * emulation is enabled. - */ - setup_clear_cpu_cap(X86_FEATURE_XSAVE); - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); fpu_kernel_xstate_size = sizeof(struct swregs_state); } else { if (boot_cpu_has(X86_FEATURE_FXSR)) @@ -252,17 +246,20 @@ static void __init fpu__init_parse_early_param(void) char *argptr = arg; int bit; +#ifdef CONFIG_X86_32 if (cmdline_find_option_bool(boot_command_line, "no387")) +#ifdef CONFIG_MATH_EMULATION setup_clear_cpu_cap(X86_FEATURE_FPU); +#else + pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); +#endif - if (cmdline_find_option_bool(boot_command_line, "nofxsr")) { + if (cmdline_find_option_bool(boot_command_line, "nofxsr")) setup_clear_cpu_cap(X86_FEATURE_FXSR); - setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT); - setup_clear_cpu_cap(X86_FEATURE_XMM); - } +#endif if (cmdline_find_option_bool(boot_command_line, "noxsave")) - fpu__xstate_clear_all_cpu_caps(); + setup_clear_cpu_cap(X86_FEATURE_XSAVE); if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 5a8d118bc423..0071b794ed19 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -5,6 +5,7 @@ #include <linux/compat.h> #include <linux/cpu.h> +#include <linux/pagemap.h> #include <asm/fpu/internal.h> #include <asm/fpu/signal.h> @@ -61,6 +62,11 @@ static inline int save_fsave_header(struct task_struct *tsk, void __user *buf) struct user_i387_ia32_struct env; struct _fpstate_32 __user *fp = buf; + fpregs_lock(); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + copy_fxregs_to_kernel(&tsk->thread.fpu); + fpregs_unlock(); + convert_from_fxsr(&env, tsk); if (__copy_to_user(buf, &env, sizeof(env)) || @@ -189,15 +195,7 @@ retry: fpregs_unlock(); if (ret) { - int aligned_size; - int nr_pages; - - aligned_size = offset_in_page(buf_fx) + fpu_user_xstate_size; - nr_pages = DIV_ROUND_UP(aligned_size, PAGE_SIZE); - - ret = get_user_pages_unlocked((unsigned long)buf_fx, nr_pages, - NULL, FOLL_WRITE); - if (ret == nr_pages) + if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size)) goto retry; return -EFAULT; } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 3c36dd1784db..e5cb67d67c03 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -8,6 +8,8 @@ #include <linux/cpu.h> #include <linux/mman.h> #include <linux/pkeys.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> #include <asm/fpu/api.h> #include <asm/fpu/internal.h> @@ -68,15 +70,6 @@ static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; unsigned int fpu_user_xstate_size; /* - * Clear all of the X86_FEATURE_* bits that are unavailable - * when the CPU has no XSAVE support. - */ -void fpu__xstate_clear_all_cpu_caps(void) -{ - setup_clear_cpu_cap(X86_FEATURE_XSAVE); -} - -/* * Return whether the system supports a given xfeature. * * Also return the name of the (most advanced) feature that the caller requested: @@ -709,7 +702,7 @@ static void fpu__init_disable_system_xstate(void) { xfeatures_mask = 0; cr4_clear_bits(X86_CR4_OSXSAVE); - fpu__xstate_clear_all_cpu_caps(); + setup_clear_cpu_cap(X86_FEATURE_XSAVE); } /* @@ -1240,3 +1233,48 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) return 0; } + +#ifdef CONFIG_PROC_PID_ARCH_STATUS +/* + * Report the amount of time elapsed in millisecond since last AVX512 + * use in the task. + */ +static void avx512_status(struct seq_file *m, struct task_struct *task) +{ + unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp); + long delta; + + if (!timestamp) { + /* + * Report -1 if no AVX512 usage + */ + delta = -1; + } else { + delta = (long)(jiffies - timestamp); + /* + * Cap to LONG_MAX if time difference > LONG_MAX + */ + if (delta < 0) + delta = LONG_MAX; + delta = jiffies_to_msecs(delta); + } + + seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta); + seq_putc(m, '\n'); +} + +/* + * Report architecture specific information + */ +int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + /* + * Report AVX512 state if the processor and build option supported. + */ + if (cpu_feature_enabled(X86_FEATURE_AVX512F)) + avx512_status(m, task); + + return 0; +} +#endif /* CONFIG_PROC_PID_ARCH_STATUS */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 0927bb158ffc..4b73f5937f41 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -22,6 +22,7 @@ #include <linux/init.h> #include <linux/list.h> #include <linux/module.h> +#include <linux/memory.h> #include <trace/syscall.h> @@ -34,16 +35,25 @@ #ifdef CONFIG_DYNAMIC_FTRACE int ftrace_arch_code_modify_prepare(void) + __acquires(&text_mutex) { + /* + * Need to grab text_mutex to prevent a race from module loading + * and live kernel patching from changing the text permissions while + * ftrace has it set to "read/write". + */ + mutex_lock(&text_mutex); set_kernel_text_rw(); set_all_modules_text_rw(); return 0; } int ftrace_arch_code_modify_post_process(void) + __releases(&text_mutex) { set_all_modules_text_ro(); set_kernel_text_ro(); + mutex_unlock(&text_mutex); return 0; } @@ -300,7 +310,6 @@ int ftrace_int3_handler(struct pt_regs *regs) ip = regs->ip - INT3_INSN_SIZE; -#ifdef CONFIG_X86_64 if (ftrace_location(ip)) { int3_emulate_call(regs, (unsigned long)ftrace_regs_caller); return 1; @@ -312,12 +321,6 @@ int ftrace_int3_handler(struct pt_regs *regs) int3_emulate_call(regs, ftrace_update_func_call); return 1; } -#else - if (ftrace_location(ip) || is_ftrace_caller(ip)) { - int3_emulate_jmp(regs, ip + CALL_INSN_SIZE); - return 1; - } -#endif return 0; } diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index 2ba914a34b06..073aab525d80 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -9,6 +9,8 @@ #include <asm/export.h> #include <asm/ftrace.h> #include <asm/nospec-branch.h> +#include <asm/frame.h> +#include <asm/asm-offsets.h> # define function_hook __fentry__ EXPORT_SYMBOL(__fentry__) @@ -89,26 +91,38 @@ END(ftrace_caller) ENTRY(ftrace_regs_caller) /* - * i386 does not save SS and ESP when coming from kernel. - * Instead, to get sp, ®s->sp is used (see ptrace.h). - * Unfortunately, that means eflags must be at the same location - * as the current return ip is. We move the return ip into the - * regs->ip location, and move flags into the return ip location. + * We're here from an mcount/fentry CALL, and the stack frame looks like: + * + * <previous context> + * RET-IP + * + * The purpose of this function is to call out in an emulated INT3 + * environment with a stack frame like: + * + * <previous context> + * gap / RET-IP + * gap + * gap + * gap + * pt_regs + * + * We do _NOT_ restore: ss, flags, cs, gs, fs, es, ds */ - pushl $__KERNEL_CS - pushl 4(%esp) /* Save the return ip */ - pushl $0 /* Load 0 into orig_ax */ + subl $3*4, %esp # RET-IP + 3 gaps + pushl %ss # ss + pushl %esp # points at ss + addl $5*4, (%esp) # make it point at <previous context> + pushfl # flags + pushl $__KERNEL_CS # cs + pushl 7*4(%esp) # ip <- RET-IP + pushl $0 # orig_eax + pushl %gs pushl %fs pushl %es pushl %ds - pushl %eax - - /* Get flags and place them into the return ip slot */ - pushf - popl %eax - movl %eax, 8*4(%esp) + pushl %eax pushl %ebp pushl %edi pushl %esi @@ -116,24 +130,27 @@ ENTRY(ftrace_regs_caller) pushl %ecx pushl %ebx - movl 12*4(%esp), %eax /* Load ip (1st parameter) */ - subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ - movl 15*4(%esp), %edx /* Load parent ip (2nd parameter) */ - movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ - pushl %esp /* Save pt_regs as 4th parameter */ + ENCODE_FRAME_POINTER + + movl PT_EIP(%esp), %eax # 1st argument: IP + subl $MCOUNT_INSN_SIZE, %eax + movl 21*4(%esp), %edx # 2nd argument: parent ip + movl function_trace_op, %ecx # 3rd argument: ftrace_pos + pushl %esp # 4th argument: pt_regs GLOBAL(ftrace_regs_call) call ftrace_stub - addl $4, %esp /* Skip pt_regs */ + addl $4, %esp # skip 4th argument - /* restore flags */ - push 14*4(%esp) - popf + /* place IP below the new SP */ + movl PT_OLDESP(%esp), %eax + movl PT_EIP(%esp), %ecx + movl %ecx, -4(%eax) - /* Move return ip back to its original location */ - movl 12*4(%esp), %eax - movl %eax, 14*4(%esp) + /* place EAX below that */ + movl PT_EAX(%esp), %ecx + movl %ecx, -8(%eax) popl %ebx popl %ecx @@ -141,14 +158,9 @@ GLOBAL(ftrace_regs_call) popl %esi popl %edi popl %ebp - popl %eax - popl %ds - popl %es - popl %fs - popl %gs - /* use lea to not affect flags */ - lea 3*4(%esp), %esp /* Skip orig_ax, ip and cs */ + lea -8(%eax), %esp + popl %eax jmp .Lftrace_ret diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 10eb2760ef2c..809d54397dba 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -9,6 +9,7 @@ #include <asm/export.h> #include <asm/nospec-branch.h> #include <asm/unwind_hints.h> +#include <asm/frame.h> .code64 .section .entry.text, "ax" @@ -203,6 +204,8 @@ GLOBAL(ftrace_regs_caller_op_ptr) leaq MCOUNT_REG_SIZE+8*2(%rsp), %rcx movq %rcx, RSP(%rsp) + ENCODE_FRAME_POINTER + /* regs go into 4th parameter */ leaq (%rsp), %rcx diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 16b1cbd3a61e..29ffa495bd1c 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -184,24 +184,25 @@ unsigned long __head __startup_64(unsigned long physaddr, pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); if (la57) { - p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); + p4d = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], + physaddr); i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; - i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D; - p4d[i + 0] = (pgdval_t)pud + pgtable_flags; - p4d[i + 1] = (pgdval_t)pud + pgtable_flags; + i = physaddr >> P4D_SHIFT; + p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; + p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; } else { i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; pgd[i + 0] = (pgdval_t)pud + pgtable_flags; pgd[i + 1] = (pgdval_t)pud + pgtable_flags; } - i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD; - pud[i + 0] = (pudval_t)pmd + pgtable_flags; - pud[i + 1] = (pudval_t)pmd + pgtable_flags; + i = physaddr >> PUD_SHIFT; + pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; + pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; /* Filter out unsupported __PAGE_KERNEL_* bits: */ @@ -211,8 +212,9 @@ unsigned long __head __startup_64(unsigned long physaddr, pmd_entry += physaddr; for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { - int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD; - pmd[idx] = pmd_entry + i * PMD_SIZE; + int idx = i + (physaddr >> PMD_SHIFT); + + pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE; } /* diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index a0573f2e7763..c43e96a938d0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1,32 +1,44 @@ // SPDX-License-Identifier: GPL-2.0-only -#include <linux/clocksource.h> #include <linux/clockchips.h> #include <linux/interrupt.h> -#include <linux/irq.h> #include <linux/export.h> #include <linux/delay.h> -#include <linux/errno.h> -#include <linux/i8253.h> -#include <linux/slab.h> #include <linux/hpet.h> -#include <linux/init.h> #include <linux/cpu.h> -#include <linux/pm.h> -#include <linux/io.h> +#include <linux/irq.h> -#include <asm/cpufeature.h> -#include <asm/irqdomain.h> -#include <asm/fixmap.h> #include <asm/hpet.h> #include <asm/time.h> -#define HPET_MASK CLOCKSOURCE_MASK(32) +#undef pr_fmt +#define pr_fmt(fmt) "hpet: " fmt -#define HPET_DEV_USED_BIT 2 -#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT) -#define HPET_DEV_VALID 0x8 -#define HPET_DEV_FSB_CAP 0x1000 -#define HPET_DEV_PERI_CAP 0x2000 +enum hpet_mode { + HPET_MODE_UNUSED, + HPET_MODE_LEGACY, + HPET_MODE_CLOCKEVT, + HPET_MODE_DEVICE, +}; + +struct hpet_channel { + struct clock_event_device evt; + unsigned int num; + unsigned int cpu; + unsigned int irq; + unsigned int in_use; + enum hpet_mode mode; + unsigned int boot_cfg; + char name[10]; +}; + +struct hpet_base { + unsigned int nr_channels; + unsigned int nr_clockevents; + unsigned int boot_cfg; + struct hpet_channel *channels; +}; + +#define HPET_MASK CLOCKSOURCE_MASK(32) #define HPET_MIN_CYCLES 128 #define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) @@ -39,22 +51,25 @@ u8 hpet_blockid; /* OS timer block num */ bool hpet_msi_disable; #ifdef CONFIG_PCI_MSI -static unsigned int hpet_num_timers; +static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel); +static struct irq_domain *hpet_domain; #endif + static void __iomem *hpet_virt_address; -struct hpet_dev { - struct clock_event_device evt; - unsigned int num; - int cpu; - unsigned int irq; - unsigned int flags; - char name[10]; -}; +static struct hpet_base hpet_base; + +static bool hpet_legacy_int_enabled; +static unsigned long hpet_freq; -static inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev) +bool boot_hpet_disable; +bool hpet_force_user; +static bool hpet_verbose; + +static inline +struct hpet_channel *clockevent_to_channel(struct clock_event_device *evt) { - return container_of(evtdev, struct hpet_dev, evt); + return container_of(evt, struct hpet_channel, evt); } inline unsigned int hpet_readl(unsigned int a) @@ -67,10 +82,6 @@ static inline void hpet_writel(unsigned int d, unsigned int a) writel(d, hpet_virt_address + a); } -#ifdef CONFIG_X86_64 -#include <asm/pgtable.h> -#endif - static inline void hpet_set_mapping(void) { hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); @@ -85,10 +96,6 @@ static inline void hpet_clear_mapping(void) /* * HPET command line enable / disable */ -bool boot_hpet_disable; -bool hpet_force_user; -static bool hpet_verbose; - static int __init hpet_setup(char *str) { while (str) { @@ -120,13 +127,8 @@ static inline int is_hpet_capable(void) return !boot_hpet_disable && hpet_address; } -/* - * HPET timer interrupt enable / disable - */ -static bool hpet_legacy_int_enabled; - /** - * is_hpet_enabled - check whether the hpet timer interrupt is enabled + * is_hpet_enabled - Check whether the legacy HPET timer interrupt is enabled */ int is_hpet_enabled(void) { @@ -136,32 +138,36 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled); static void _hpet_print_config(const char *function, int line) { - u32 i, timers, l, h; - printk(KERN_INFO "hpet: %s(%d):\n", function, line); - l = hpet_readl(HPET_ID); - h = hpet_readl(HPET_PERIOD); - timers = ((l & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; - printk(KERN_INFO "hpet: ID: 0x%x, PERIOD: 0x%x\n", l, h); - l = hpet_readl(HPET_CFG); - h = hpet_readl(HPET_STATUS); - printk(KERN_INFO "hpet: CFG: 0x%x, STATUS: 0x%x\n", l, h); + u32 i, id, period, cfg, status, channels, l, h; + + pr_info("%s(%d):\n", function, line); + + id = hpet_readl(HPET_ID); + period = hpet_readl(HPET_PERIOD); + pr_info("ID: 0x%x, PERIOD: 0x%x\n", id, period); + + cfg = hpet_readl(HPET_CFG); + status = hpet_readl(HPET_STATUS); + pr_info("CFG: 0x%x, STATUS: 0x%x\n", cfg, status); + l = hpet_readl(HPET_COUNTER); h = hpet_readl(HPET_COUNTER+4); - printk(KERN_INFO "hpet: COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h); + pr_info("COUNTER_l: 0x%x, COUNTER_h: 0x%x\n", l, h); + + channels = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; - for (i = 0; i < timers; i++) { + for (i = 0; i < channels; i++) { l = hpet_readl(HPET_Tn_CFG(i)); h = hpet_readl(HPET_Tn_CFG(i)+4); - printk(KERN_INFO "hpet: T%d: CFG_l: 0x%x, CFG_h: 0x%x\n", - i, l, h); + pr_info("T%d: CFG_l: 0x%x, CFG_h: 0x%x\n", i, l, h); + l = hpet_readl(HPET_Tn_CMP(i)); h = hpet_readl(HPET_Tn_CMP(i)+4); - printk(KERN_INFO "hpet: T%d: CMP_l: 0x%x, CMP_h: 0x%x\n", - i, l, h); + pr_info("T%d: CMP_l: 0x%x, CMP_h: 0x%x\n", i, l, h); + l = hpet_readl(HPET_Tn_ROUTE(i)); h = hpet_readl(HPET_Tn_ROUTE(i)+4); - printk(KERN_INFO "hpet: T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n", - i, l, h); + pr_info("T%d ROUTE_l: 0x%x, ROUTE_h: 0x%x\n", i, l, h); } } @@ -172,31 +178,20 @@ do { \ } while (0) /* - * When the hpet driver (/dev/hpet) is enabled, we need to reserve + * When the HPET driver (/dev/hpet) is enabled, we need to reserve * timer 0 and timer 1 in case of RTC emulation. */ #ifdef CONFIG_HPET -static void hpet_reserve_msi_timers(struct hpet_data *hd); - -static void hpet_reserve_platform_timers(unsigned int id) +static void __init hpet_reserve_platform_timers(void) { - struct hpet __iomem *hpet = hpet_virt_address; - struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; - unsigned int nrtimers, i; struct hpet_data hd; - - nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; + unsigned int i; memset(&hd, 0, sizeof(hd)); hd.hd_phys_address = hpet_address; - hd.hd_address = hpet; - hd.hd_nirqs = nrtimers; - hpet_reserve_timer(&hd, 0); - -#ifdef CONFIG_HPET_EMULATE_RTC - hpet_reserve_timer(&hd, 1); -#endif + hd.hd_address = hpet_virt_address; + hd.hd_nirqs = hpet_base.nr_channels; /* * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254 @@ -206,30 +201,52 @@ static void hpet_reserve_platform_timers(unsigned int id) hd.hd_irq[0] = HPET_LEGACY_8254; hd.hd_irq[1] = HPET_LEGACY_RTC; - for (i = 2; i < nrtimers; timer++, i++) { - hd.hd_irq[i] = (readl(&timer->hpet_config) & - Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT; - } + for (i = 0; i < hpet_base.nr_channels; i++) { + struct hpet_channel *hc = hpet_base.channels + i; + + if (i >= 2) + hd.hd_irq[i] = hc->irq; - hpet_reserve_msi_timers(&hd); + switch (hc->mode) { + case HPET_MODE_UNUSED: + case HPET_MODE_DEVICE: + hc->mode = HPET_MODE_DEVICE; + break; + case HPET_MODE_CLOCKEVT: + case HPET_MODE_LEGACY: + hpet_reserve_timer(&hd, hc->num); + break; + } + } hpet_alloc(&hd); +} +static void __init hpet_select_device_channel(void) +{ + int i; + + for (i = 0; i < hpet_base.nr_channels; i++) { + struct hpet_channel *hc = hpet_base.channels + i; + + /* Associate the first unused channel to /dev/hpet */ + if (hc->mode == HPET_MODE_UNUSED) { + hc->mode = HPET_MODE_DEVICE; + return; + } + } } + #else -static void hpet_reserve_platform_timers(unsigned int id) { } +static inline void hpet_reserve_platform_timers(void) { } +static inline void hpet_select_device_channel(void) {} #endif -/* - * Common hpet info - */ -static unsigned long hpet_freq; - -static struct clock_event_device hpet_clockevent; - +/* Common HPET functions */ static void hpet_stop_counter(void) { u32 cfg = hpet_readl(HPET_CFG); + cfg &= ~HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); } @@ -243,6 +260,7 @@ static void hpet_reset_counter(void) static void hpet_start_counter(void) { unsigned int cfg = hpet_readl(HPET_CFG); + cfg |= HPET_CFG_ENABLE; hpet_writel(cfg, HPET_CFG); } @@ -274,24 +292,9 @@ static void hpet_enable_legacy_int(void) hpet_legacy_int_enabled = true; } -static void hpet_legacy_clockevent_register(void) -{ - /* Start HPET legacy interrupts */ - hpet_enable_legacy_int(); - - /* - * Start hpet with the boot cpu mask and make it - * global after the IO_APIC has been initialized. - */ - hpet_clockevent.cpumask = cpumask_of(boot_cpu_data.cpu_index); - clockevents_config_and_register(&hpet_clockevent, hpet_freq, - HPET_MIN_PROG_DELTA, 0x7FFFFFFF); - global_clock_event = &hpet_clockevent; - printk(KERN_DEBUG "hpet clockevent registered\n"); -} - -static int hpet_set_periodic(struct clock_event_device *evt, int timer) +static int hpet_clkevt_set_state_periodic(struct clock_event_device *evt) { + unsigned int channel = clockevent_to_channel(evt)->num; unsigned int cfg, cmp, now; uint64_t delta; @@ -300,11 +303,11 @@ static int hpet_set_periodic(struct clock_event_device *evt, int timer) delta >>= evt->shift; now = hpet_readl(HPET_COUNTER); cmp = now + (unsigned int)delta; - cfg = hpet_readl(HPET_Tn_CFG(timer)); + cfg = hpet_readl(HPET_Tn_CFG(channel)); cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT; - hpet_writel(cfg, HPET_Tn_CFG(timer)); - hpet_writel(cmp, HPET_Tn_CMP(timer)); + hpet_writel(cfg, HPET_Tn_CFG(channel)); + hpet_writel(cmp, HPET_Tn_CMP(channel)); udelay(1); /* * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL @@ -313,52 +316,55 @@ static int hpet_set_periodic(struct clock_event_device *evt, int timer) * (See AMD-8111 HyperTransport I/O Hub Data Sheet, * Publication # 24674) */ - hpet_writel((unsigned int)delta, HPET_Tn_CMP(timer)); + hpet_writel((unsigned int)delta, HPET_Tn_CMP(channel)); hpet_start_counter(); hpet_print_config(); return 0; } -static int hpet_set_oneshot(struct clock_event_device *evt, int timer) +static int hpet_clkevt_set_state_oneshot(struct clock_event_device *evt) { + unsigned int channel = clockevent_to_channel(evt)->num; unsigned int cfg; - cfg = hpet_readl(HPET_Tn_CFG(timer)); + cfg = hpet_readl(HPET_Tn_CFG(channel)); cfg &= ~HPET_TN_PERIODIC; cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_Tn_CFG(timer)); + hpet_writel(cfg, HPET_Tn_CFG(channel)); return 0; } -static int hpet_shutdown(struct clock_event_device *evt, int timer) +static int hpet_clkevt_set_state_shutdown(struct clock_event_device *evt) { + unsigned int channel = clockevent_to_channel(evt)->num; unsigned int cfg; - cfg = hpet_readl(HPET_Tn_CFG(timer)); + cfg = hpet_readl(HPET_Tn_CFG(channel)); cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_Tn_CFG(timer)); + hpet_writel(cfg, HPET_Tn_CFG(channel)); return 0; } -static int hpet_resume(struct clock_event_device *evt) +static int hpet_clkevt_legacy_resume(struct clock_event_device *evt) { hpet_enable_legacy_int(); hpet_print_config(); return 0; } -static int hpet_next_event(unsigned long delta, - struct clock_event_device *evt, int timer) +static int +hpet_clkevt_set_next_event(unsigned long delta, struct clock_event_device *evt) { + unsigned int channel = clockevent_to_channel(evt)->num; u32 cnt; s32 res; cnt = hpet_readl(HPET_COUNTER); cnt += (u32) delta; - hpet_writel(cnt, HPET_Tn_CMP(timer)); + hpet_writel(cnt, HPET_Tn_CMP(channel)); /* * HPETs are a complete disaster. The compare register is @@ -387,360 +393,250 @@ static int hpet_next_event(unsigned long delta, return res < HPET_MIN_CYCLES ? -ETIME : 0; } -static int hpet_legacy_shutdown(struct clock_event_device *evt) +static void hpet_init_clockevent(struct hpet_channel *hc, unsigned int rating) { - return hpet_shutdown(evt, 0); -} + struct clock_event_device *evt = &hc->evt; -static int hpet_legacy_set_oneshot(struct clock_event_device *evt) -{ - return hpet_set_oneshot(evt, 0); -} + evt->rating = rating; + evt->irq = hc->irq; + evt->name = hc->name; + evt->cpumask = cpumask_of(hc->cpu); + evt->set_state_oneshot = hpet_clkevt_set_state_oneshot; + evt->set_next_event = hpet_clkevt_set_next_event; + evt->set_state_shutdown = hpet_clkevt_set_state_shutdown; -static int hpet_legacy_set_periodic(struct clock_event_device *evt) -{ - return hpet_set_periodic(evt, 0); + evt->features = CLOCK_EVT_FEAT_ONESHOT; + if (hc->boot_cfg & HPET_TN_PERIODIC) { + evt->features |= CLOCK_EVT_FEAT_PERIODIC; + evt->set_state_periodic = hpet_clkevt_set_state_periodic; + } } -static int hpet_legacy_resume(struct clock_event_device *evt) +static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc) { - return hpet_resume(evt); -} + /* + * Start HPET with the boot CPU's cpumask and make it global after + * the IO_APIC has been initialized. + */ + hc->cpu = boot_cpu_data.cpu_index; + strncpy(hc->name, "hpet", sizeof(hc->name)); + hpet_init_clockevent(hc, 50); -static int hpet_legacy_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - return hpet_next_event(delta, evt, 0); -} + hc->evt.tick_resume = hpet_clkevt_legacy_resume; -/* - * The hpet clock event device - */ -static struct clock_event_device hpet_clockevent = { - .name = "hpet", - .features = CLOCK_EVT_FEAT_PERIODIC | - CLOCK_EVT_FEAT_ONESHOT, - .set_state_periodic = hpet_legacy_set_periodic, - .set_state_oneshot = hpet_legacy_set_oneshot, - .set_state_shutdown = hpet_legacy_shutdown, - .tick_resume = hpet_legacy_resume, - .set_next_event = hpet_legacy_next_event, - .irq = 0, - .rating = 50, -}; + /* + * Legacy horrors and sins from the past. HPET used periodic mode + * unconditionally forever on the legacy channel 0. Removing the + * below hack and using the conditional in hpet_init_clockevent() + * makes at least Qemu and one hardware machine fail to boot. + * There are two issues which cause the boot failure: + * + * #1 After the timer delivery test in IOAPIC and the IOAPIC setup + * the next interrupt is not delivered despite the HPET channel + * being programmed correctly. Reprogramming the HPET after + * switching to IOAPIC makes it work again. After fixing this, + * the next issue surfaces: + * + * #2 Due to the unconditional periodic mode availability the Local + * APIC timer calibration can hijack the global clockevents + * event handler without causing damage. Using oneshot at this + * stage makes if hang because the HPET does not get + * reprogrammed due to the handler hijacking. Duh, stupid me! + * + * Both issues require major surgery and especially the kick HPET + * again after enabling IOAPIC results in really nasty hackery. + * This 'assume periodic works' magic has survived since HPET + * support got added, so it's questionable whether this should be + * fixed. Both Qemu and the failing hardware machine support + * periodic mode despite the fact that both don't advertise it in + * the configuration register and both need that extra kick after + * switching to IOAPIC. Seems to be a feature... + */ + hc->evt.features |= CLOCK_EVT_FEAT_PERIODIC; + hc->evt.set_state_periodic = hpet_clkevt_set_state_periodic; + + /* Start HPET legacy interrupts */ + hpet_enable_legacy_int(); + + clockevents_config_and_register(&hc->evt, hpet_freq, + HPET_MIN_PROG_DELTA, 0x7FFFFFFF); + global_clock_event = &hc->evt; + pr_debug("Clockevent registered\n"); +} /* * HPET MSI Support */ #ifdef CONFIG_PCI_MSI -static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); -static struct hpet_dev *hpet_devs; -static struct irq_domain *hpet_domain; - void hpet_msi_unmask(struct irq_data *data) { - struct hpet_dev *hdev = irq_data_get_irq_handler_data(data); + struct hpet_channel *hc = irq_data_get_irq_handler_data(data); unsigned int cfg; - /* unmask it */ - cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); + cfg = hpet_readl(HPET_Tn_CFG(hc->num)); cfg |= HPET_TN_ENABLE | HPET_TN_FSB; - hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); + hpet_writel(cfg, HPET_Tn_CFG(hc->num)); } void hpet_msi_mask(struct irq_data *data) { - struct hpet_dev *hdev = irq_data_get_irq_handler_data(data); + struct hpet_channel *hc = irq_data_get_irq_handler_data(data); unsigned int cfg; - /* mask it */ - cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); + cfg = hpet_readl(HPET_Tn_CFG(hc->num)); cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB); - hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); -} - -void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg) -{ - hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num)); - hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4); + hpet_writel(cfg, HPET_Tn_CFG(hc->num)); } -void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg) +void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg) { - msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num)); - msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4); - msg->address_hi = 0; + hpet_writel(msg->data, HPET_Tn_ROUTE(hc->num)); + hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hc->num) + 4); } -static int hpet_msi_shutdown(struct clock_event_device *evt) +static int hpet_clkevt_msi_resume(struct clock_event_device *evt) { - struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - - return hpet_shutdown(evt, hdev->num); -} - -static int hpet_msi_set_oneshot(struct clock_event_device *evt) -{ - struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - - return hpet_set_oneshot(evt, hdev->num); -} - -static int hpet_msi_set_periodic(struct clock_event_device *evt) -{ - struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - - return hpet_set_periodic(evt, hdev->num); -} - -static int hpet_msi_resume(struct clock_event_device *evt) -{ - struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - struct irq_data *data = irq_get_irq_data(hdev->irq); + struct hpet_channel *hc = clockevent_to_channel(evt); + struct irq_data *data = irq_get_irq_data(hc->irq); struct msi_msg msg; /* Restore the MSI msg and unmask the interrupt */ irq_chip_compose_msi_msg(data, &msg); - hpet_msi_write(hdev, &msg); + hpet_msi_write(hc, &msg); hpet_msi_unmask(data); return 0; } -static int hpet_msi_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); - return hpet_next_event(delta, evt, hdev->num); -} - -static irqreturn_t hpet_interrupt_handler(int irq, void *data) +static irqreturn_t hpet_msi_interrupt_handler(int irq, void *data) { - struct hpet_dev *dev = (struct hpet_dev *)data; - struct clock_event_device *hevt = &dev->evt; + struct hpet_channel *hc = data; + struct clock_event_device *evt = &hc->evt; - if (!hevt->event_handler) { - printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n", - dev->num); + if (!evt->event_handler) { + pr_info("Spurious interrupt HPET channel %d\n", hc->num); return IRQ_HANDLED; } - hevt->event_handler(hevt); + evt->event_handler(evt); return IRQ_HANDLED; } -static int hpet_setup_irq(struct hpet_dev *dev) +static int hpet_setup_msi_irq(struct hpet_channel *hc) { - - if (request_irq(dev->irq, hpet_interrupt_handler, + if (request_irq(hc->irq, hpet_msi_interrupt_handler, IRQF_TIMER | IRQF_NOBALANCING, - dev->name, dev)) + hc->name, hc)) return -1; - disable_irq(dev->irq); - irq_set_affinity(dev->irq, cpumask_of(dev->cpu)); - enable_irq(dev->irq); + disable_irq(hc->irq); + irq_set_affinity(hc->irq, cpumask_of(hc->cpu)); + enable_irq(hc->irq); - printk(KERN_DEBUG "hpet: %s irq %d for MSI\n", - dev->name, dev->irq); + pr_debug("%s irq %u for MSI\n", hc->name, hc->irq); return 0; } -/* This should be called in specific @cpu */ -static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) +/* Invoked from the hotplug callback on @cpu */ +static void init_one_hpet_msi_clockevent(struct hpet_channel *hc, int cpu) { - struct clock_event_device *evt = &hdev->evt; - - WARN_ON(cpu != smp_processor_id()); - if (!(hdev->flags & HPET_DEV_VALID)) - return; - - hdev->cpu = cpu; - per_cpu(cpu_hpet_dev, cpu) = hdev; - evt->name = hdev->name; - hpet_setup_irq(hdev); - evt->irq = hdev->irq; + struct clock_event_device *evt = &hc->evt; - evt->rating = 110; - evt->features = CLOCK_EVT_FEAT_ONESHOT; - if (hdev->flags & HPET_DEV_PERI_CAP) { - evt->features |= CLOCK_EVT_FEAT_PERIODIC; - evt->set_state_periodic = hpet_msi_set_periodic; - } + hc->cpu = cpu; + per_cpu(cpu_hpet_channel, cpu) = hc; + hpet_setup_msi_irq(hc); - evt->set_state_shutdown = hpet_msi_shutdown; - evt->set_state_oneshot = hpet_msi_set_oneshot; - evt->tick_resume = hpet_msi_resume; - evt->set_next_event = hpet_msi_next_event; - evt->cpumask = cpumask_of(hdev->cpu); + hpet_init_clockevent(hc, 110); + evt->tick_resume = hpet_clkevt_msi_resume; clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA, 0x7FFFFFFF); } -#ifdef CONFIG_HPET -/* Reserve at least one timer for userspace (/dev/hpet) */ -#define RESERVE_TIMERS 1 -#else -#define RESERVE_TIMERS 0 -#endif - -static void hpet_msi_capability_lookup(unsigned int start_timer) +static struct hpet_channel *hpet_get_unused_clockevent(void) { - unsigned int id; - unsigned int num_timers; - unsigned int num_timers_used = 0; - int i, irq; - - if (hpet_msi_disable) - return; - - if (boot_cpu_has(X86_FEATURE_ARAT)) - return; - id = hpet_readl(HPET_ID); - - num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); - num_timers++; /* Value read out starts from 0 */ - hpet_print_config(); - - hpet_domain = hpet_create_irq_domain(hpet_blockid); - if (!hpet_domain) - return; - - hpet_devs = kcalloc(num_timers, sizeof(struct hpet_dev), GFP_KERNEL); - if (!hpet_devs) - return; - - hpet_num_timers = num_timers; - - for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { - struct hpet_dev *hdev = &hpet_devs[num_timers_used]; - unsigned int cfg = hpet_readl(HPET_Tn_CFG(i)); - - /* Only consider HPET timer with MSI support */ - if (!(cfg & HPET_TN_FSB_CAP)) - continue; + int i; - hdev->flags = 0; - if (cfg & HPET_TN_PERIODIC_CAP) - hdev->flags |= HPET_DEV_PERI_CAP; - sprintf(hdev->name, "hpet%d", i); - hdev->num = i; + for (i = 0; i < hpet_base.nr_channels; i++) { + struct hpet_channel *hc = hpet_base.channels + i; - irq = hpet_assign_irq(hpet_domain, hdev, hdev->num); - if (irq <= 0) + if (hc->mode != HPET_MODE_CLOCKEVT || hc->in_use) continue; - - hdev->irq = irq; - hdev->flags |= HPET_DEV_FSB_CAP; - hdev->flags |= HPET_DEV_VALID; - num_timers_used++; - if (num_timers_used == num_possible_cpus()) - break; + hc->in_use = 1; + return hc; } - - printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n", - num_timers, num_timers_used); + return NULL; } -#ifdef CONFIG_HPET -static void hpet_reserve_msi_timers(struct hpet_data *hd) +static int hpet_cpuhp_online(unsigned int cpu) { - int i; - - if (!hpet_devs) - return; + struct hpet_channel *hc = hpet_get_unused_clockevent(); - for (i = 0; i < hpet_num_timers; i++) { - struct hpet_dev *hdev = &hpet_devs[i]; + if (hc) + init_one_hpet_msi_clockevent(hc, cpu); + return 0; +} - if (!(hdev->flags & HPET_DEV_VALID)) - continue; +static int hpet_cpuhp_dead(unsigned int cpu) +{ + struct hpet_channel *hc = per_cpu(cpu_hpet_channel, cpu); - hd->hd_irq[hdev->num] = hdev->irq; - hpet_reserve_timer(hd, hdev->num); - } + if (!hc) + return 0; + free_irq(hc->irq, hc); + hc->in_use = 0; + per_cpu(cpu_hpet_channel, cpu) = NULL; + return 0; } -#endif -static struct hpet_dev *hpet_get_unused_timer(void) +static void __init hpet_select_clockevents(void) { - int i; + unsigned int i; - if (!hpet_devs) - return NULL; + hpet_base.nr_clockevents = 0; - for (i = 0; i < hpet_num_timers; i++) { - struct hpet_dev *hdev = &hpet_devs[i]; + /* No point if MSI is disabled or CPU has an Always Runing APIC Timer */ + if (hpet_msi_disable || boot_cpu_has(X86_FEATURE_ARAT)) + return; - if (!(hdev->flags & HPET_DEV_VALID)) - continue; - if (test_and_set_bit(HPET_DEV_USED_BIT, - (unsigned long *)&hdev->flags)) - continue; - return hdev; - } - return NULL; -} + hpet_print_config(); -struct hpet_work_struct { - struct delayed_work work; - struct completion complete; -}; + hpet_domain = hpet_create_irq_domain(hpet_blockid); + if (!hpet_domain) + return; -static void hpet_work(struct work_struct *w) -{ - struct hpet_dev *hdev; - int cpu = smp_processor_id(); - struct hpet_work_struct *hpet_work; + for (i = 0; i < hpet_base.nr_channels; i++) { + struct hpet_channel *hc = hpet_base.channels + i; + int irq; - hpet_work = container_of(w, struct hpet_work_struct, work.work); + if (hc->mode != HPET_MODE_UNUSED) + continue; - hdev = hpet_get_unused_timer(); - if (hdev) - init_one_hpet_msi_clockevent(hdev, cpu); + /* Only consider HPET channel with MSI support */ + if (!(hc->boot_cfg & HPET_TN_FSB_CAP)) + continue; - complete(&hpet_work->complete); -} + sprintf(hc->name, "hpet%d", i); -static int hpet_cpuhp_online(unsigned int cpu) -{ - struct hpet_work_struct work; - - INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); - init_completion(&work.complete); - /* FIXME: add schedule_work_on() */ - schedule_delayed_work_on(cpu, &work.work, 0); - wait_for_completion(&work.complete); - destroy_delayed_work_on_stack(&work.work); - return 0; -} + irq = hpet_assign_irq(hpet_domain, hc, hc->num); + if (irq <= 0) + continue; -static int hpet_cpuhp_dead(unsigned int cpu) -{ - struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu); + hc->irq = irq; + hc->mode = HPET_MODE_CLOCKEVT; - if (!hdev) - return 0; - free_irq(hdev->irq, hdev); - hdev->flags &= ~HPET_DEV_USED; - per_cpu(cpu_hpet_dev, cpu) = NULL; - return 0; -} -#else + if (++hpet_base.nr_clockevents == num_possible_cpus()) + break; + } -static void hpet_msi_capability_lookup(unsigned int start_timer) -{ - return; + pr_info("%d channels of %d reserved for per-cpu timers\n", + hpet_base.nr_channels, hpet_base.nr_clockevents); } -#ifdef CONFIG_HPET -static void hpet_reserve_msi_timers(struct hpet_data *hd) -{ - return; -} -#endif +#else + +static inline void hpet_select_clockevents(void) { } #define hpet_cpuhp_online NULL #define hpet_cpuhp_dead NULL @@ -754,10 +650,10 @@ static void hpet_reserve_msi_timers(struct hpet_data *hd) /* * Reading the HPET counter is a very slow operation. If a large number of * CPUs are trying to access the HPET counter simultaneously, it can cause - * massive delay and slow down system performance dramatically. This may + * massive delays and slow down system performance dramatically. This may * happen when HPET is the default clock source instead of TSC. For a * really large system with hundreds of CPUs, the slowdown may be so - * severe that it may actually crash the system because of a NMI watchdog + * severe, that it can actually crash the system because of a NMI watchdog * soft lockup, for example. * * If multiple CPUs are trying to access the HPET counter at the same time, @@ -766,10 +662,9 @@ static void hpet_reserve_msi_timers(struct hpet_data *hd) * * This special feature is only enabled on x86-64 systems. It is unlikely * that 32-bit x86 systems will have enough CPUs to require this feature - * with its associated locking overhead. And we also need 64-bit atomic - * read. + * with its associated locking overhead. We also need 64-bit atomic read. * - * The lock and the hpet value are stored together and can be read in a + * The lock and the HPET value are stored together and can be read in a * single atomic 64-bit read. It is explicitly assumed that arch_spinlock_t * is 32 bits in size. */ @@ -858,15 +753,40 @@ static struct clocksource clocksource_hpet = { .resume = hpet_resume_counter, }; -static int hpet_clocksource_register(void) +/* + * AMD SB700 based systems with spread spectrum enabled use a SMM based + * HPET emulation to provide proper frequency setting. + * + * On such systems the SMM code is initialized with the first HPET register + * access and takes some time to complete. During this time the config + * register reads 0xffffffff. We check for max 1000 loops whether the + * config register reads a non-0xffffffff value to make sure that the + * HPET is up and running before we proceed any further. + * + * A counting loop is safe, as the HPET access takes thousands of CPU cycles. + * + * On non-SB700 based machines this check is only done once and has no + * side effects. + */ +static bool __init hpet_cfg_working(void) { - u64 start, now; - u64 t1; + int i; + + for (i = 0; i < 1000; i++) { + if (hpet_readl(HPET_CFG) != 0xFFFFFFFF) + return true; + } + + pr_warn("Config register invalid. Disabling HPET\n"); + return false; +} + +static bool __init hpet_counting(void) +{ + u64 start, now, t1; - /* Start the counter */ hpet_restart_counter(); - /* Verify whether hpet counter works */ t1 = hpet_readl(HPET_COUNTER); start = rdtsc(); @@ -877,30 +797,24 @@ static int hpet_clocksource_register(void) * 1 GHz == 200us */ do { - rep_nop(); + if (t1 != hpet_readl(HPET_COUNTER)) + return true; now = rdtsc(); } while ((now - start) < 200000UL); - if (t1 == hpet_readl(HPET_COUNTER)) { - printk(KERN_WARNING - "HPET counter not counting. HPET disabled\n"); - return -ENODEV; - } - - clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); - return 0; + pr_warn("Counter not counting. HPET disabled\n"); + return false; } -static u32 *hpet_boot_cfg; - /** * hpet_enable - Try to setup the HPET timer. Returns 1 on success. */ int __init hpet_enable(void) { - u32 hpet_period, cfg, id; + u32 hpet_period, cfg, id, irq; + unsigned int i, channels; + struct hpet_channel *hc; u64 freq; - unsigned int i, last; if (!is_hpet_capable()) return 0; @@ -909,40 +823,22 @@ int __init hpet_enable(void) if (!hpet_virt_address) return 0; + /* Validate that the config register is working */ + if (!hpet_cfg_working()) + goto out_nohpet; + + /* Validate that the counter is counting */ + if (!hpet_counting()) + goto out_nohpet; + /* * Read the period and check for a sane value: */ hpet_period = hpet_readl(HPET_PERIOD); - - /* - * AMD SB700 based systems with spread spectrum enabled use a - * SMM based HPET emulation to provide proper frequency - * setting. The SMM code is initialized with the first HPET - * register access and takes some time to complete. During - * this time the config register reads 0xffffffff. We check - * for max. 1000 loops whether the config register reads a non - * 0xffffffff value to make sure that HPET is up and running - * before we go further. A counting loop is safe, as the HPET - * access takes thousands of CPU cycles. On non SB700 based - * machines this check is only done once and has no side - * effects. - */ - for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) { - if (i == 1000) { - printk(KERN_WARNING - "HPET config register value = 0xFFFFFFFF. " - "Disabling HPET\n"); - goto out_nohpet; - } - } - if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) goto out_nohpet; - /* - * The period is a femto seconds value. Convert it to a - * frequency. - */ + /* The period is a femtoseconds value. Convert it to a frequency. */ freq = FSEC_PER_SEC; do_div(freq, hpet_period); hpet_freq = freq; @@ -954,72 +850,90 @@ int __init hpet_enable(void) id = hpet_readl(HPET_ID); hpet_print_config(); - last = (id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; + /* This is the HPET channel number which is zero based */ + channels = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; -#ifdef CONFIG_HPET_EMULATE_RTC /* * The legacy routing mode needs at least two channels, tick timer * and the rtc emulation channel. */ - if (!last) + if (IS_ENABLED(CONFIG_HPET_EMULATE_RTC) && channels < 2) goto out_nohpet; -#endif + hc = kcalloc(channels, sizeof(*hc), GFP_KERNEL); + if (!hc) { + pr_warn("Disabling HPET.\n"); + goto out_nohpet; + } + hpet_base.channels = hc; + hpet_base.nr_channels = channels; + + /* Read, store and sanitize the global configuration */ cfg = hpet_readl(HPET_CFG); - hpet_boot_cfg = kmalloc_array(last + 2, sizeof(*hpet_boot_cfg), - GFP_KERNEL); - if (hpet_boot_cfg) - *hpet_boot_cfg = cfg; - else - pr_warn("HPET initial state will not be saved\n"); + hpet_base.boot_cfg = cfg; cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); hpet_writel(cfg, HPET_CFG); if (cfg) - pr_warn("Unrecognized bits %#x set in global cfg\n", cfg); + pr_warn("Global config: Unknown bits %#x\n", cfg); + + /* Read, store and sanitize the per channel configuration */ + for (i = 0; i < channels; i++, hc++) { + hc->num = i; - for (i = 0; i <= last; ++i) { cfg = hpet_readl(HPET_Tn_CFG(i)); - if (hpet_boot_cfg) - hpet_boot_cfg[i + 1] = cfg; + hc->boot_cfg = cfg; + irq = (cfg & Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT; + hc->irq = irq; + cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB); hpet_writel(cfg, HPET_Tn_CFG(i)); + cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE | HPET_TN_FSB | HPET_TN_FSB_CAP); if (cfg) - pr_warn("Unrecognized bits %#x set in cfg#%u\n", - cfg, i); + pr_warn("Channel #%u config: Unknown bits %#x\n", i, cfg); } hpet_print_config(); - if (hpet_clocksource_register()) - goto out_nohpet; + clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); if (id & HPET_ID_LEGSUP) { - hpet_legacy_clockevent_register(); + hpet_legacy_clockevent_register(&hpet_base.channels[0]); + hpet_base.channels[0].mode = HPET_MODE_LEGACY; + if (IS_ENABLED(CONFIG_HPET_EMULATE_RTC)) + hpet_base.channels[1].mode = HPET_MODE_LEGACY; return 1; } return 0; out_nohpet: + kfree(hpet_base.channels); + hpet_base.channels = NULL; + hpet_base.nr_channels = 0; hpet_clear_mapping(); hpet_address = 0; return 0; } /* - * Needs to be late, as the reserve_timer code calls kalloc ! + * The late initialization runs after the PCI quirks have been invoked + * which might have detected a system on which the HPET can be enforced. + * + * Also, the MSI machinery is not working yet when the HPET is initialized + * early. * - * Not a problem on i386 as hpet_enable is called from late_time_init, - * but on x86_64 it is necessary ! + * If the HPET is enabled, then: + * + * 1) Reserve one channel for /dev/hpet if CONFIG_HPET=y + * 2) Reserve up to num_possible_cpus() channels as per CPU clockevents + * 3) Setup /dev/hpet if CONFIG_HPET=y + * 4) Register hotplug callbacks when clockevents are available */ static __init int hpet_late_init(void) { int ret; - if (boot_hpet_disable) - return -ENODEV; - if (!hpet_address) { if (!force_hpet_address) return -ENODEV; @@ -1031,21 +945,14 @@ static __init int hpet_late_init(void) if (!hpet_virt_address) return -ENODEV; - if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP) - hpet_msi_capability_lookup(2); - else - hpet_msi_capability_lookup(0); - - hpet_reserve_platform_timers(hpet_readl(HPET_ID)); + hpet_select_device_channel(); + hpet_select_clockevents(); + hpet_reserve_platform_timers(); hpet_print_config(); - if (hpet_msi_disable) + if (!hpet_base.nr_clockevents) return 0; - if (boot_cpu_has(X86_FEATURE_ARAT)) - return 0; - - /* This notifier should be called after workqueue is ready */ ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "x86/hpet:online", hpet_cpuhp_online, NULL); if (ret) @@ -1064,47 +971,47 @@ fs_initcall(hpet_late_init); void hpet_disable(void) { - if (is_hpet_capable() && hpet_virt_address) { - unsigned int cfg = hpet_readl(HPET_CFG), id, last; - - if (hpet_boot_cfg) - cfg = *hpet_boot_cfg; - else if (hpet_legacy_int_enabled) { - cfg &= ~HPET_CFG_LEGACY; - hpet_legacy_int_enabled = false; - } - cfg &= ~HPET_CFG_ENABLE; - hpet_writel(cfg, HPET_CFG); + unsigned int i; + u32 cfg; - if (!hpet_boot_cfg) - return; + if (!is_hpet_capable() || !hpet_virt_address) + return; - id = hpet_readl(HPET_ID); - last = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); + /* Restore boot configuration with the enable bit cleared */ + cfg = hpet_base.boot_cfg; + cfg &= ~HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); - for (id = 0; id <= last; ++id) - hpet_writel(hpet_boot_cfg[id + 1], HPET_Tn_CFG(id)); + /* Restore the channel boot configuration */ + for (i = 0; i < hpet_base.nr_channels; i++) + hpet_writel(hpet_base.channels[i].boot_cfg, HPET_Tn_CFG(i)); - if (*hpet_boot_cfg & HPET_CFG_ENABLE) - hpet_writel(*hpet_boot_cfg, HPET_CFG); - } + /* If the HPET was enabled at boot time, reenable it */ + if (hpet_base.boot_cfg & HPET_CFG_ENABLE) + hpet_writel(hpet_base.boot_cfg, HPET_CFG); } #ifdef CONFIG_HPET_EMULATE_RTC -/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET +/* + * HPET in LegacyReplacement mode eats up the RTC interrupt line. When HPET * is enabled, we support RTC interrupt functionality in software. + * * RTC has 3 kinds of interrupts: - * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock - * is updated - * 2) Alarm Interrupt - generate an interrupt at a specific time of day - * 3) Periodic Interrupt - generate periodic interrupt, with frequencies - * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) - * (1) and (2) above are implemented using polling at a frequency of - * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt - * overhead. (DEFAULT_RTC_INT_FREQ) - * For (3), we use interrupts at 64Hz or user specified periodic - * frequency, whichever is higher. + * + * 1) Update Interrupt - generate an interrupt, every second, when the + * RTC clock is updated + * 2) Alarm Interrupt - generate an interrupt at a specific time of day + * 3) Periodic Interrupt - generate periodic interrupt, with frequencies + * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all frequencies in powers of 2) + * + * (1) and (2) above are implemented using polling at a frequency of 64 Hz: + * DEFAULT_RTC_INT_FREQ. + * + * The exact frequency is a tradeoff between accuracy and interrupt overhead. + * + * For (3), we use interrupts at 64 Hz, or the user specified periodic frequency, + * if it's higher. */ #include <linux/mc146818rtc.h> #include <linux/rtc.h> @@ -1125,7 +1032,7 @@ static unsigned long hpet_pie_limit; static rtc_irq_handler irq_handler; /* - * Check that the hpet counter c1 is ahead of the c2 + * Check that the HPET counter c1 is ahead of c2 */ static inline int hpet_cnt_ahead(u32 c1, u32 c2) { @@ -1163,8 +1070,8 @@ void hpet_unregister_irq_handler(rtc_irq_handler handler) EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler); /* - * Timer 1 for RTC emulation. We use one shot mode, as periodic mode - * is not supported by all HPET implementations for timer 1. + * Channel 1 for RTC emulation. We use one shot mode, as periodic mode + * is not supported by all HPET implementations for channel 1. * * hpet_rtc_timer_init() is called when the rtc is initialized. */ @@ -1177,10 +1084,11 @@ int hpet_rtc_timer_init(void) return 0; if (!hpet_default_delta) { + struct clock_event_device *evt = &hpet_base.channels[0].evt; uint64_t clc; - clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; - clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; + clc = (uint64_t) evt->mult * NSEC_PER_SEC; + clc >>= evt->shift + DEFAULT_RTC_SHIFT; hpet_default_delta = clc; } @@ -1209,6 +1117,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_timer_init); static void hpet_disable_rtc_channel(void) { u32 cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_ENABLE; hpet_writel(cfg, HPET_T1_CFG); } @@ -1250,8 +1159,7 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask) } EXPORT_SYMBOL_GPL(hpet_set_rtc_irq_bit); -int hpet_set_alarm_time(unsigned char hrs, unsigned char min, - unsigned char sec) +int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) { if (!is_hpet_enabled()) return 0; @@ -1271,15 +1179,18 @@ int hpet_set_periodic_freq(unsigned long freq) if (!is_hpet_enabled()) return 0; - if (freq <= DEFAULT_RTC_INT_FREQ) + if (freq <= DEFAULT_RTC_INT_FREQ) { hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq; - else { - clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; + } else { + struct clock_event_device *evt = &hpet_base.channels[0].evt; + + clc = (uint64_t) evt->mult * NSEC_PER_SEC; do_div(clc, freq); - clc >>= hpet_clockevent.shift; + clc >>= evt->shift; hpet_pie_delta = clc; hpet_pie_limit = 0; } + return 1; } EXPORT_SYMBOL_GPL(hpet_set_periodic_freq); @@ -1317,8 +1228,7 @@ static void hpet_rtc_timer_reinit(void) if (hpet_rtc_flags & RTC_PIE) hpet_pie_count += lost_ints; if (printk_ratelimit()) - printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n", - lost_ints); + pr_warn("Lost %d RTC interrupts\n", lost_ints); } } @@ -1340,8 +1250,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) hpet_prev_update_sec = curr_time.tm_sec; } - if (hpet_rtc_flags & RTC_PIE && - ++hpet_pie_count >= hpet_pie_limit) { + if (hpet_rtc_flags & RTC_PIE && ++hpet_pie_count >= hpet_pie_limit) { rtc_int_flag |= RTC_PF; hpet_pie_count = 0; } @@ -1350,7 +1259,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) (curr_time.tm_sec == hpet_alarm_time.tm_sec) && (curr_time.tm_min == hpet_alarm_time.tm_min) && (curr_time.tm_hour == hpet_alarm_time.tm_hour)) - rtc_int_flag |= RTC_AF; + rtc_int_flag |= RTC_AF; if (rtc_int_flag) { rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 0d307a657abb..2b7999a1a50a 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -8,6 +8,7 @@ #include <linux/timex.h> #include <linux/i8253.h> +#include <asm/apic.h> #include <asm/hpet.h> #include <asm/time.h> #include <asm/smp.h> @@ -18,10 +19,32 @@ */ struct clock_event_device *global_clock_event; -void __init setup_pit_timer(void) +/* + * Modern chipsets can disable the PIT clock which makes it unusable. It + * would be possible to enable the clock but the registers are chipset + * specific and not discoverable. Avoid the whack a mole game. + * + * These platforms have discoverable TSC/CPU frequencies but this also + * requires to know the local APIC timer frequency as it normally is + * calibrated against the PIT interrupt. + */ +static bool __init use_pit(void) +{ + if (!IS_ENABLED(CONFIG_X86_TSC) || !boot_cpu_has(X86_FEATURE_TSC)) + return true; + + /* This also returns true when APIC is disabled */ + return apic_needs_pit(); +} + +bool __init pit_timer_init(void) { + if (!use_pit()) + return false; + clockevent_i8253_init(true); global_clock_event = &i8253_clockevent; + return true; } #ifndef CONFIG_X86_64 diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 6d8917875f44..87ef69a72c52 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -1,7 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Interrupt descriptor table related code - * - * This file is licensed under the GPL V2 */ #include <linux/interrupt.h> @@ -320,7 +319,8 @@ void __init idt_setup_apic_and_irq_gates(void) #ifdef CONFIG_X86_LOCAL_APIC for_each_clear_bit_from(i, system_vectors, NR_VECTORS) { set_bit(i, system_vectors); - set_intr_gate(i, spurious_interrupt); + entry = spurious_entries_start + 8 * (i - FIRST_SYSTEM_VECTOR); + set_intr_gate(i, entry); } #endif } diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c index 64b973f0e985..4c407833faca 100644 --- a/arch/x86/kernel/ima_arch.c +++ b/arch/x86/kernel/ima_arch.c @@ -11,10 +11,11 @@ extern struct boot_params boot_params; static enum efi_secureboot_mode get_sb_mode(void) { efi_char16_t efi_SecureBoot_name[] = L"SecureBoot"; + efi_char16_t efi_SetupMode_name[] = L"SecureBoot"; efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID; efi_status_t status; unsigned long size; - u8 secboot; + u8 secboot, setupmode; size = sizeof(secboot); @@ -36,7 +37,14 @@ static enum efi_secureboot_mode get_sb_mode(void) return efi_secureboot_mode_unknown; } - if (secboot == 0) { + size = sizeof(setupmode); + status = efi.get_variable(efi_SetupMode_name, &efi_variable_guid, + NULL, &size, &setupmode); + + if (status != EFI_SUCCESS) /* ignore unknown SetupMode */ + setupmode = 0; + + if (secboot == 0 || setupmode == 1) { pr_info("ima: secureboot mode disabled\n"); return efi_secureboot_mode_disabled; } diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index 805b7a341aca..fdb6506ceaaa 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -13,7 +13,22 @@ #include <linux/dmi.h> #include <linux/io.h> -int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE; +#define IO_DELAY_TYPE_0X80 0 +#define IO_DELAY_TYPE_0XED 1 +#define IO_DELAY_TYPE_UDELAY 2 +#define IO_DELAY_TYPE_NONE 3 + +#if defined(CONFIG_IO_DELAY_0X80) +#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_0X80 +#elif defined(CONFIG_IO_DELAY_0XED) +#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_0XED +#elif defined(CONFIG_IO_DELAY_UDELAY) +#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_UDELAY +#elif defined(CONFIG_IO_DELAY_NONE) +#define DEFAULT_IO_DELAY_TYPE IO_DELAY_TYPE_NONE +#endif + +int io_delay_type __read_mostly = DEFAULT_IO_DELAY_TYPE; static int __initdata io_delay_override; @@ -24,13 +39,13 @@ void native_io_delay(void) { switch (io_delay_type) { default: - case CONFIG_IO_DELAY_TYPE_0X80: + case IO_DELAY_TYPE_0X80: asm volatile ("outb %al, $0x80"); break; - case CONFIG_IO_DELAY_TYPE_0XED: + case IO_DELAY_TYPE_0XED: asm volatile ("outb %al, $0xed"); break; - case CONFIG_IO_DELAY_TYPE_UDELAY: + case IO_DELAY_TYPE_UDELAY: /* * 2 usecs is an upper-bound for the outb delay but * note that udelay doesn't have the bus-level @@ -39,7 +54,8 @@ void native_io_delay(void) * are shorter until calibrated): */ udelay(2); - case CONFIG_IO_DELAY_TYPE_NONE: + break; + case IO_DELAY_TYPE_NONE: break; } } @@ -47,9 +63,9 @@ EXPORT_SYMBOL(native_io_delay); static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id) { - if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) { + if (io_delay_type == IO_DELAY_TYPE_0X80) { pr_notice("%s: using 0xed I/O delay port\n", id->ident); - io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; + io_delay_type = IO_DELAY_TYPE_0XED; } return 0; @@ -115,13 +131,13 @@ static int __init io_delay_param(char *s) return -EINVAL; if (!strcmp(s, "0x80")) - io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; + io_delay_type = IO_DELAY_TYPE_0X80; else if (!strcmp(s, "0xed")) - io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; + io_delay_type = IO_DELAY_TYPE_0XED; else if (!strcmp(s, "udelay")) - io_delay_type = CONFIG_IO_DELAY_TYPE_UDELAY; + io_delay_type = IO_DELAY_TYPE_UDELAY; else if (!strcmp(s, "none")) - io_delay_type = CONFIG_IO_DELAY_TYPE_NONE; + io_delay_type = IO_DELAY_TYPE_NONE; else return -EINVAL; diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 9b68b5b00ac9..4215653f8a8e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -135,7 +135,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", per_cpu(mce_poll_count, j)); seq_puts(p, " Machine check polls\n"); #endif -#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) +#ifdef CONFIG_X86_HV_CALLBACK_VECTOR if (test_bit(HYPERVISOR_CALLBACK_VECTOR, system_vectors)) { seq_printf(p, "%*s: ", prec, "HYP"); for_each_online_cpu(j) @@ -247,7 +247,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) if (!handle_irq(desc, regs)) { ack_APIC_irq(); - if (desc != VECTOR_RETRIGGERED) { + if (desc != VECTOR_RETRIGGERED && desc != VECTOR_SHUTDOWN) { pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n", __func__, smp_processor_id(), vector); diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 1b2ee55a2dfb..6857b4577f17 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -45,7 +45,7 @@ static void jailhouse_get_wallclock(struct timespec64 *now) static void __init jailhouse_timer_init(void) { - lapic_timer_frequency = setup_data.apic_khz * (1000 / HZ); + lapic_timer_period = setup_data.apic_khz * (1000 / HZ); } static unsigned long jailhouse_get_tsc(void) @@ -203,7 +203,7 @@ bool jailhouse_paravirt(void) return jailhouse_cpuid_base() != 0; } -static bool jailhouse_x2apic_available(void) +static bool __init jailhouse_x2apic_available(void) { /* * The x2APIC is only available if the root cell enabled it. Jailhouse diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index e631c358f7f4..044053235302 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -35,41 +35,43 @@ static void bug_at(unsigned char *ip, int line) BUG(); } -static void __ref __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - int init) +static void __jump_label_set_jump_code(struct jump_entry *entry, + enum jump_label_type type, + union jump_code_union *code, + int init) { - union jump_code_union jmp; const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; - const void *expect, *code; + const void *expect; int line; - jmp.jump = 0xe9; - jmp.offset = jump_entry_target(entry) - - (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); + code->jump = 0xe9; + code->offset = jump_entry_target(entry) - + (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); - if (type == JUMP_LABEL_JMP) { - if (init) { - expect = default_nop; line = __LINE__; - } else { - expect = ideal_nop; line = __LINE__; - } - - code = &jmp.code; + if (init) { + expect = default_nop; line = __LINE__; + } else if (type == JUMP_LABEL_JMP) { + expect = ideal_nop; line = __LINE__; } else { - if (init) { - expect = default_nop; line = __LINE__; - } else { - expect = &jmp.code; line = __LINE__; - } - - code = ideal_nop; + expect = code->code; line = __LINE__; } if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE)) bug_at((void *)jump_entry_code(entry), line); + if (type == JUMP_LABEL_NOP) + memcpy(code, ideal_nop, JUMP_LABEL_NOP_SIZE); +} + +static void __ref __jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + int init) +{ + union jump_code_union code; + + __jump_label_set_jump_code(entry, type, &code, init); + /* * As long as only a single processor is running and the code is still * not marked as RO, text_poke_early() can be used; Checking that @@ -82,12 +84,12 @@ static void __ref __jump_label_transform(struct jump_entry *entry, * always nop being the 'currently valid' instruction */ if (init || system_state == SYSTEM_BOOTING) { - text_poke_early((void *)jump_entry_code(entry), code, + text_poke_early((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE); return; } - text_poke_bp((void *)jump_entry_code(entry), code, JUMP_LABEL_NOP_SIZE, + text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, (void *)jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); } @@ -99,6 +101,75 @@ void arch_jump_label_transform(struct jump_entry *entry, mutex_unlock(&text_mutex); } +#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) +static struct text_poke_loc tp_vec[TP_VEC_MAX]; +static int tp_vec_nr; + +bool arch_jump_label_transform_queue(struct jump_entry *entry, + enum jump_label_type type) +{ + struct text_poke_loc *tp; + void *entry_code; + + if (system_state == SYSTEM_BOOTING) { + /* + * Fallback to the non-batching mode. + */ + arch_jump_label_transform(entry, type); + return true; + } + + /* + * No more space in the vector, tell upper layer to apply + * the queue before continuing. + */ + if (tp_vec_nr == TP_VEC_MAX) + return false; + + tp = &tp_vec[tp_vec_nr]; + + entry_code = (void *)jump_entry_code(entry); + + /* + * The INT3 handler will do a bsearch in the queue, so we need entries + * to be sorted. We can survive an unsorted list by rejecting the entry, + * forcing the generic jump_label code to apply the queue. Warning once, + * to raise the attention to the case of an unsorted entry that is + * better not happen, because, in the worst case we will perform in the + * same way as we do without batching - with some more overhead. + */ + if (tp_vec_nr > 0) { + int prev = tp_vec_nr - 1; + struct text_poke_loc *prev_tp = &tp_vec[prev]; + + if (WARN_ON_ONCE(prev_tp->addr > entry_code)) + return false; + } + + __jump_label_set_jump_code(entry, type, + (union jump_code_union *) &tp->opcode, 0); + + tp->addr = entry_code; + tp->detour = entry_code + JUMP_LABEL_NOP_SIZE; + tp->len = JUMP_LABEL_NOP_SIZE; + + tp_vec_nr++; + + return true; +} + +void arch_jump_label_transform_apply(void) +{ + if (!tp_vec_nr) + return; + + mutex_lock(&text_mutex); + text_poke_bp_batch(tp_vec, tp_vec_nr); + mutex_unlock(&text_mutex); + + tp_vec_nr = 0; +} + static enum { JL_STATE_START, JL_STATE_NO_UPDATE, diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index b07e7069b09e..5ebcd02cbca7 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -1,12 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Kexec bzImage loader * * Copyright (C) 2014 Red Hat Inc. * Authors: * Vivek Goyal <vgoyal@redhat.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #define pr_fmt(fmt) "kexec-bzImage64: " fmt @@ -321,6 +319,11 @@ static int bzImage64_probe(const char *buf, unsigned long len) return ret; } + if (!(header->xloadflags & XLF_5LEVEL) && pgtable_l5_enabled()) { + pr_err("bzImage cannot handle 5-level paging mode.\n"); + return ret; + } + /* I've got a bzImage */ pr_debug("It's a relocatable bzImage64\n"); ret = 0; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 9a8c1648fc9a..23297ea64f5f 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -118,14 +118,6 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) #ifdef CONFIG_X86_32 switch (regno) { - case GDB_SS: - if (!user_mode(regs)) - *(unsigned long *)mem = __KERNEL_DS; - break; - case GDB_SP: - if (!user_mode(regs)) - *(unsigned long *)mem = kernel_stack_pointer(regs); - break; case GDB_GS: case GDB_FS: *(unsigned long *)mem = 0xFFFF; @@ -758,7 +750,7 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) BREAK_INSTR_SIZE); bpt->type = BP_POKE_BREAKPOINT; - return err; + return 0; } int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h index 2b949f4fd4d8..7d3a2e2daf01 100644 --- a/arch/x86/kernel/kprobes/common.h +++ b/arch/x86/kernel/kprobes/common.h @@ -5,15 +5,10 @@ /* Kprobes and Optprobes common header */ #include <asm/asm.h> - -#ifdef CONFIG_FRAME_POINTER -# define SAVE_RBP_STRING " push %" _ASM_BP "\n" \ - " mov %" _ASM_SP ", %" _ASM_BP "\n" -#else -# define SAVE_RBP_STRING " push %" _ASM_BP "\n" -#endif +#include <asm/frame.h> #ifdef CONFIG_X86_64 + #define SAVE_REGS_STRING \ /* Skip cs, ip, orig_ax. */ \ " subq $24, %rsp\n" \ @@ -27,11 +22,13 @@ " pushq %r10\n" \ " pushq %r11\n" \ " pushq %rbx\n" \ - SAVE_RBP_STRING \ + " pushq %rbp\n" \ " pushq %r12\n" \ " pushq %r13\n" \ " pushq %r14\n" \ - " pushq %r15\n" + " pushq %r15\n" \ + ENCODE_FRAME_POINTER + #define RESTORE_REGS_STRING \ " popq %r15\n" \ " popq %r14\n" \ @@ -51,19 +48,22 @@ /* Skip orig_ax, ip, cs */ \ " addq $24, %rsp\n" #else + #define SAVE_REGS_STRING \ /* Skip cs, ip, orig_ax and gs. */ \ - " subl $16, %esp\n" \ + " subl $4*4, %esp\n" \ " pushl %fs\n" \ " pushl %es\n" \ " pushl %ds\n" \ " pushl %eax\n" \ - SAVE_RBP_STRING \ + " pushl %ebp\n" \ " pushl %edi\n" \ " pushl %esi\n" \ " pushl %edx\n" \ " pushl %ecx\n" \ - " pushl %ebx\n" + " pushl %ebx\n" \ + ENCODE_FRAME_POINTER + #define RESTORE_REGS_STRING \ " popl %ebx\n" \ " popl %ecx\n" \ @@ -72,8 +72,8 @@ " popl %edi\n" \ " popl %ebp\n" \ " popl %eax\n" \ - /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ - " addl $24, %esp\n" + /* Skip ds, es, fs, gs, orig_ax, ip, and cs. */\ + " addl $7*4, %esp\n" #endif /* Ensure if the instruction can be boostable */ diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index d3243d93daf4..0e0b08008b5a 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -56,7 +56,7 @@ DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs)) +#define stack_addr(regs) ((unsigned long *)regs->sp) #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ @@ -718,29 +718,27 @@ asm( ".global kretprobe_trampoline\n" ".type kretprobe_trampoline, @function\n" "kretprobe_trampoline:\n" -#ifdef CONFIG_X86_64 /* We don't bother saving the ss register */ +#ifdef CONFIG_X86_64 " pushq %rsp\n" " pushfq\n" SAVE_REGS_STRING " movq %rsp, %rdi\n" " call trampoline_handler\n" /* Replace saved sp with true return address. */ - " movq %rax, 152(%rsp)\n" + " movq %rax, 19*8(%rsp)\n" RESTORE_REGS_STRING " popfq\n" #else - " pushf\n" + " pushl %esp\n" + " pushfl\n" SAVE_REGS_STRING " movl %esp, %eax\n" " call trampoline_handler\n" - /* Move flags to cs */ - " movl 56(%esp), %edx\n" - " movl %edx, 52(%esp)\n" - /* Replace saved flags with true return address. */ - " movl %eax, 56(%esp)\n" + /* Replace saved sp with true return address. */ + " movl %eax, 15*4(%esp)\n" RESTORE_REGS_STRING - " popf\n" + " popfl\n" #endif " ret\n" ".size kretprobe_trampoline, .-kretprobe_trampoline\n" @@ -781,16 +779,13 @@ __used __visible void *trampoline_handler(struct pt_regs *regs) INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ -#ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; - /* On x86-64, we use pt_regs->sp for return address holder. */ - frame_pointer = ®s->sp; -#else - regs->cs = __KERNEL_CS | get_kernel_rpl(); +#ifdef CONFIG_X86_32 + regs->cs |= get_kernel_rpl(); regs->gs = 0; - /* On x86-32, we use pt_regs->flags for return address holder. */ - frame_pointer = ®s->flags; #endif + /* We use pt_regs->sp for return address holder. */ + frame_pointer = ®s->sp; regs->ip = trampoline_address; regs->orig_ax = ~0UL; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 7c361a24c6df..9d4aedece363 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -102,14 +102,15 @@ asm ( "optprobe_template_call:\n" ASM_NOP5 /* Move flags to rsp */ - " movq 144(%rsp), %rdx\n" - " movq %rdx, 152(%rsp)\n" + " movq 18*8(%rsp), %rdx\n" + " movq %rdx, 19*8(%rsp)\n" RESTORE_REGS_STRING /* Skip flags entry */ " addq $8, %rsp\n" " popfq\n" #else /* CONFIG_X86_32 */ - " pushf\n" + " pushl %esp\n" + " pushfl\n" SAVE_REGS_STRING " movl %esp, %edx\n" ".global optprobe_template_val\n" @@ -118,9 +119,13 @@ asm ( ".global optprobe_template_call\n" "optprobe_template_call:\n" ASM_NOP5 + /* Move flags into esp */ + " movl 14*4(%esp), %edx\n" + " movl %edx, 15*4(%esp)\n" RESTORE_REGS_STRING - " addl $4, %esp\n" /* skip cs */ - " popf\n" + /* Skip flags entry */ + " addl $4, %esp\n" + " popfl\n" #endif ".global optprobe_template_end\n" "optprobe_template_end:\n" @@ -152,10 +157,9 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) } else { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); /* Save skipped registers */ -#ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; -#else - regs->cs = __KERNEL_CS | get_kernel_rpl(); +#ifdef CONFIG_X86_32 + regs->cs |= get_kernel_rpl(); regs->gs = 0; #endif regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; @@ -418,7 +422,7 @@ err: void arch_optimize_kprobes(struct list_head *oplist) { struct optimized_kprobe *op, *tmp; - u8 insn_buf[RELATIVEJUMP_SIZE]; + u8 insn_buff[RELATIVEJUMP_SIZE]; list_for_each_entry_safe(op, tmp, oplist, list) { s32 rel = (s32)((long)op->optinsn.insn - @@ -430,10 +434,10 @@ void arch_optimize_kprobes(struct list_head *oplist) memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, RELATIVE_ADDR_SIZE); - insn_buf[0] = RELATIVEJUMP_OPCODE; - *(s32 *)(&insn_buf[1]) = rel; + insn_buff[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&insn_buff[1]) = rel; - text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE, + text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, op->optinsn.insn); list_del_init(&op->list); @@ -443,12 +447,12 @@ void arch_optimize_kprobes(struct list_head *oplist) /* Replace a relative jump with a breakpoint (int3). */ void arch_unoptimize_kprobe(struct optimized_kprobe *op) { - u8 insn_buf[RELATIVEJUMP_SIZE]; + u8 insn_buff[RELATIVEJUMP_SIZE]; /* Set int3 to first byte for kprobes */ - insn_buf[0] = BREAKPOINT_INSTRUCTION; - memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); - text_poke_bp(op->kp.addr, insn_buf, RELATIVEJUMP_SIZE, + insn_buff[0] = BREAKPOINT_INSTRUCTION; + memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, op->optinsn.insn); } diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 5409c2800ab5..77854b192fef 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -1,9 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * handle transition of Linux booting another kernel * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #include <linux/mm.h> diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index ceba408ea982..5dcd438ad8f2 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -1,9 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * handle transition of Linux booting another kernel * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #define pr_fmt(fmt) "kexec: " fmt @@ -18,6 +16,7 @@ #include <linux/io.h> #include <linux/suspend.h> #include <linux/vmalloc.h> +#include <linux/efi.h> #include <asm/init.h> #include <asm/pgtable.h> @@ -29,6 +28,55 @@ #include <asm/setup.h> #include <asm/set_memory.h> +#ifdef CONFIG_ACPI +/* + * Used while adding mapping for ACPI tables. + * Can be reused when other iomem regions need be mapped + */ +struct init_pgtable_data { + struct x86_mapping_info *info; + pgd_t *level4p; +}; + +static int mem_region_callback(struct resource *res, void *arg) +{ + struct init_pgtable_data *data = arg; + unsigned long mstart, mend; + + mstart = res->start; + mend = mstart + resource_size(res) - 1; + + return kernel_ident_mapping_init(data->info, data->level4p, mstart, mend); +} + +static int +map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) +{ + struct init_pgtable_data data; + unsigned long flags; + int ret; + + data.info = info; + data.level4p = level4p; + flags = IORESOURCE_MEM | IORESOURCE_BUSY; + + ret = walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, + &data, mem_region_callback); + if (ret && ret != -EINVAL) + return ret; + + /* ACPI tables could be located in ACPI Non-volatile Storage region */ + ret = walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, + &data, mem_region_callback); + if (ret && ret != -EINVAL) + return ret; + + return 0; +} +#else +static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; } +#endif + #ifdef CONFIG_KEXEC_FILE const struct kexec_file_ops * const kexec_file_loaders[] = { &kexec_bzImage64_ops, @@ -36,6 +84,31 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { }; #endif +static int +map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p) +{ +#ifdef CONFIG_EFI + unsigned long mstart, mend; + + if (!efi_enabled(EFI_BOOT)) + return 0; + + mstart = (boot_params.efi_info.efi_systab | + ((u64)boot_params.efi_info.efi_systab_hi<<32)); + + if (efi_enabled(EFI_64BIT)) + mend = mstart + sizeof(efi_system_table_64_t); + else + mend = mstart + sizeof(efi_system_table_32_t); + + if (!mstart) + return 0; + + return kernel_ident_mapping_init(info, level4p, mstart, mend); +#endif + return 0; +} + static void free_transition_pgtable(struct kimage *image) { free_page((unsigned long)image->arch.p4d); @@ -50,12 +123,13 @@ static void free_transition_pgtable(struct kimage *image) static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) { + pgprot_t prot = PAGE_KERNEL_EXEC_NOENC; + unsigned long vaddr, paddr; + int result = -ENOMEM; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; - unsigned long vaddr, paddr; - int result = -ENOMEM; vaddr = (unsigned long)relocate_kernel; paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); @@ -92,7 +166,11 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); } pte = pte_offset_kernel(pmd, vaddr); - set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC)); + + if (sev_active()) + prot = PAGE_KERNEL_EXEC; + + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); return 0; err: return result; @@ -129,6 +207,11 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) level4p = (pgd_t *)__va(start_pgtable); clear_page(level4p); + if (sev_active()) { + info.page_flag |= _PAGE_ENC; + info.kernpg_flag |= _PAGE_ENC; + } + if (direct_gbpages) info.direct_gbpages = true; @@ -159,6 +242,18 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable) return result; } + /* + * Prepare EFI systab and ACPI tables for kexec kernel since they are + * not covered by pfn_mapped. + */ + result = map_efi_systab(&info, level4p); + if (result) + return result; + + result = map_acpi_tables(&info, level4p); + if (result) + return result; + return init_transition_pgtable(image, level4p); } @@ -559,8 +654,20 @@ void arch_kexec_unprotect_crashkres(void) kexec_mark_crashkres(false); } +/* + * During a traditional boot under SME, SME will encrypt the kernel, + * so the SME kexec kernel also needs to be un-encrypted in order to + * replicate a normal SME boot. + * + * During a traditional boot under SEV, the kernel has already been + * loaded encrypted, so the SEV kexec kernel needs to be encrypted in + * order to replicate a normal SEV boot. + */ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { + if (sev_active()) + return 0; + /* * If SME is active we need to be sure that kexec pages are * not encrypted because when we boot to the new kernel the @@ -571,6 +678,9 @@ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { + if (sev_active()) + return; + /* * If SME is active we need to reset the pages back to being * an encrypted mapping before freeing them. diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 06f6bb48d018..98039d7fb998 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -58,24 +58,24 @@ struct branch { u32 delta; } __attribute__((packed)); -static unsigned paravirt_patch_call(void *insnbuf, const void *target, +static unsigned paravirt_patch_call(void *insn_buff, const void *target, unsigned long addr, unsigned len) { - struct branch *b = insnbuf; - unsigned long delta = (unsigned long)target - (addr+5); - - if (len < 5) { -#ifdef CONFIG_RETPOLINE - WARN_ONCE(1, "Failing to patch indirect CALL in %ps\n", (void *)addr); -#endif - return len; /* call too long for patch site */ + const int call_len = 5; + struct branch *b = insn_buff; + unsigned long delta = (unsigned long)target - (addr+call_len); + + if (len < call_len) { + pr_warn("paravirt: Failed to patch indirect CALL at %ps\n", (void *)addr); + /* Kernel might not be viable if patching fails, bail out: */ + BUG_ON(1); } b->opcode = 0xe8; /* call */ b->delta = delta; - BUILD_BUG_ON(sizeof(*b) != 5); + BUILD_BUG_ON(sizeof(*b) != call_len); - return 5; + return call_len; } #ifdef CONFIG_PARAVIRT_XXL @@ -85,10 +85,10 @@ u64 notrace _paravirt_ident_64(u64 x) return x; } -static unsigned paravirt_patch_jmp(void *insnbuf, const void *target, +static unsigned paravirt_patch_jmp(void *insn_buff, const void *target, unsigned long addr, unsigned len) { - struct branch *b = insnbuf; + struct branch *b = insn_buff; unsigned long delta = (unsigned long)target - (addr+5); if (len < 5) { @@ -113,7 +113,7 @@ void __init native_pv_lock_init(void) static_branch_disable(&virt_spin_lock_key); } -unsigned paravirt_patch_default(u8 type, void *insnbuf, +unsigned paravirt_patch_default(u8 type, void *insn_buff, unsigned long addr, unsigned len) { /* @@ -125,36 +125,36 @@ unsigned paravirt_patch_default(u8 type, void *insnbuf, if (opfunc == NULL) /* If there's no function, patch it with a ud2a (BUG) */ - ret = paravirt_patch_insns(insnbuf, len, ud2a, ud2a+sizeof(ud2a)); + ret = paravirt_patch_insns(insn_buff, len, ud2a, ud2a+sizeof(ud2a)); else if (opfunc == _paravirt_nop) ret = 0; #ifdef CONFIG_PARAVIRT_XXL /* identity functions just return their single argument */ else if (opfunc == _paravirt_ident_64) - ret = paravirt_patch_ident_64(insnbuf, len); + ret = paravirt_patch_ident_64(insn_buff, len); else if (type == PARAVIRT_PATCH(cpu.iret) || type == PARAVIRT_PATCH(cpu.usergs_sysret64)) /* If operation requires a jmp, then jmp */ - ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); + ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len); #endif else /* Otherwise call the function. */ - ret = paravirt_patch_call(insnbuf, opfunc, addr, len); + ret = paravirt_patch_call(insn_buff, opfunc, addr, len); return ret; } -unsigned paravirt_patch_insns(void *insnbuf, unsigned len, +unsigned paravirt_patch_insns(void *insn_buff, unsigned len, const char *start, const char *end) { unsigned insn_len = end - start; - if (insn_len > len || start == NULL) - insn_len = len; - else - memcpy(insnbuf, start, insn_len); + /* Alternative instruction is too large for the patch site and we cannot continue: */ + BUG_ON(insn_len > len || start == NULL); + + memcpy(insn_buff, start, insn_len); return insn_len; } diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c new file mode 100644 index 000000000000..3eff63c090d2 --- /dev/null +++ b/arch/x86/kernel/paravirt_patch.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/stringify.h> + +#include <asm/paravirt.h> +#include <asm/asm-offsets.h> + +#define PSTART(d, m) \ + patch_data_##d.m + +#define PEND(d, m) \ + (PSTART(d, m) + sizeof(patch_data_##d.m)) + +#define PATCH(d, m, insn_buff, len) \ + paravirt_patch_insns(insn_buff, len, PSTART(d, m), PEND(d, m)) + +#define PATCH_CASE(ops, m, data, insn_buff, len) \ + case PARAVIRT_PATCH(ops.m): \ + return PATCH(data, ops##_##m, insn_buff, len) + +#ifdef CONFIG_PARAVIRT_XXL +struct patch_xxl { + const unsigned char irq_irq_disable[1]; + const unsigned char irq_irq_enable[1]; + const unsigned char irq_save_fl[2]; + const unsigned char mmu_read_cr2[3]; + const unsigned char mmu_read_cr3[3]; + const unsigned char mmu_write_cr3[3]; + const unsigned char irq_restore_fl[2]; +# ifdef CONFIG_X86_64 + const unsigned char cpu_wbinvd[2]; + const unsigned char cpu_usergs_sysret64[6]; + const unsigned char cpu_swapgs[3]; + const unsigned char mov64[3]; +# else + const unsigned char cpu_iret[1]; +# endif +}; + +static const struct patch_xxl patch_data_xxl = { + .irq_irq_disable = { 0xfa }, // cli + .irq_irq_enable = { 0xfb }, // sti + .irq_save_fl = { 0x9c, 0x58 }, // pushf; pop %[re]ax + .mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax + .mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax +# ifdef CONFIG_X86_64 + .mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3 + .irq_restore_fl = { 0x57, 0x9d }, // push %rdi; popfq + .cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd + .cpu_usergs_sysret64 = { 0x0f, 0x01, 0xf8, + 0x48, 0x0f, 0x07 }, // swapgs; sysretq + .cpu_swapgs = { 0x0f, 0x01, 0xf8 }, // swapgs + .mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax +# else + .mmu_write_cr3 = { 0x0f, 0x22, 0xd8 }, // mov %eax, %cr3 + .irq_restore_fl = { 0x50, 0x9d }, // push %eax; popf + .cpu_iret = { 0xcf }, // iret +# endif +}; + +unsigned int paravirt_patch_ident_64(void *insn_buff, unsigned int len) +{ +#ifdef CONFIG_X86_64 + return PATCH(xxl, mov64, insn_buff, len); +#endif + return 0; +} +# endif /* CONFIG_PARAVIRT_XXL */ + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +struct patch_lock { + unsigned char queued_spin_unlock[3]; + unsigned char vcpu_is_preempted[2]; +}; + +static const struct patch_lock patch_data_lock = { + .vcpu_is_preempted = { 0x31, 0xc0 }, // xor %eax, %eax + +# ifdef CONFIG_X86_64 + .queued_spin_unlock = { 0xc6, 0x07, 0x00 }, // movb $0, (%rdi) +# else + .queued_spin_unlock = { 0xc6, 0x00, 0x00 }, // movb $0, (%eax) +# endif +}; +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr, + unsigned int len) +{ + switch (type) { + +#ifdef CONFIG_PARAVIRT_XXL + PATCH_CASE(irq, restore_fl, xxl, insn_buff, len); + PATCH_CASE(irq, save_fl, xxl, insn_buff, len); + PATCH_CASE(irq, irq_enable, xxl, insn_buff, len); + PATCH_CASE(irq, irq_disable, xxl, insn_buff, len); + + PATCH_CASE(mmu, read_cr2, xxl, insn_buff, len); + PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len); + PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len); + +# ifdef CONFIG_X86_64 + PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len); + PATCH_CASE(cpu, swapgs, xxl, insn_buff, len); + PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len); +# else + PATCH_CASE(cpu, iret, xxl, insn_buff, len); +# endif +#endif + +#ifdef CONFIG_PARAVIRT_SPINLOCKS + case PARAVIRT_PATCH(lock.queued_spin_unlock): + if (pv_is_native_spin_unlock()) + return PATCH(lock, queued_spin_unlock, insn_buff, len); + break; + + case PARAVIRT_PATCH(lock.vcpu_is_preempted): + if (pv_is_native_vcpu_is_preempted()) + return PATCH(lock, vcpu_is_preempted, insn_buff, len); + break; +#endif + default: + break; + } + + return paravirt_patch_default(type, insn_buff, addr, len); +} diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c deleted file mode 100644 index de138d3912e4..000000000000 --- a/arch/x86/kernel/paravirt_patch_32.c +++ /dev/null @@ -1,67 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <asm/paravirt.h> - -#ifdef CONFIG_PARAVIRT_XXL -DEF_NATIVE(irq, irq_disable, "cli"); -DEF_NATIVE(irq, irq_enable, "sti"); -DEF_NATIVE(irq, restore_fl, "push %eax; popf"); -DEF_NATIVE(irq, save_fl, "pushf; pop %eax"); -DEF_NATIVE(cpu, iret, "iret"); -DEF_NATIVE(mmu, read_cr2, "mov %cr2, %eax"); -DEF_NATIVE(mmu, write_cr3, "mov %eax, %cr3"); -DEF_NATIVE(mmu, read_cr3, "mov %cr3, %eax"); - -unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) -{ - /* arg in %edx:%eax, return in %edx:%eax */ - return 0; -} -#endif - -#if defined(CONFIG_PARAVIRT_SPINLOCKS) -DEF_NATIVE(lock, queued_spin_unlock, "movb $0, (%eax)"); -DEF_NATIVE(lock, vcpu_is_preempted, "xor %eax, %eax"); -#endif - -extern bool pv_is_native_spin_unlock(void); -extern bool pv_is_native_vcpu_is_preempted(void); - -unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len) -{ -#define PATCH_SITE(ops, x) \ - case PARAVIRT_PATCH(ops.x): \ - return paravirt_patch_insns(ibuf, len, start_##ops##_##x, end_##ops##_##x) - - switch (type) { -#ifdef CONFIG_PARAVIRT_XXL - PATCH_SITE(irq, irq_disable); - PATCH_SITE(irq, irq_enable); - PATCH_SITE(irq, restore_fl); - PATCH_SITE(irq, save_fl); - PATCH_SITE(cpu, iret); - PATCH_SITE(mmu, read_cr2); - PATCH_SITE(mmu, read_cr3); - PATCH_SITE(mmu, write_cr3); -#endif -#if defined(CONFIG_PARAVIRT_SPINLOCKS) - case PARAVIRT_PATCH(lock.queued_spin_unlock): - if (pv_is_native_spin_unlock()) - return paravirt_patch_insns(ibuf, len, - start_lock_queued_spin_unlock, - end_lock_queued_spin_unlock); - break; - - case PARAVIRT_PATCH(lock.vcpu_is_preempted): - if (pv_is_native_vcpu_is_preempted()) - return paravirt_patch_insns(ibuf, len, - start_lock_vcpu_is_preempted, - end_lock_vcpu_is_preempted); - break; -#endif - - default: - break; - } -#undef PATCH_SITE - return paravirt_patch_default(type, ibuf, addr, len); -} diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c deleted file mode 100644 index 9d9e04b31077..000000000000 --- a/arch/x86/kernel/paravirt_patch_64.c +++ /dev/null @@ -1,75 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <asm/paravirt.h> -#include <asm/asm-offsets.h> -#include <linux/stringify.h> - -#ifdef CONFIG_PARAVIRT_XXL -DEF_NATIVE(irq, irq_disable, "cli"); -DEF_NATIVE(irq, irq_enable, "sti"); -DEF_NATIVE(irq, restore_fl, "pushq %rdi; popfq"); -DEF_NATIVE(irq, save_fl, "pushfq; popq %rax"); -DEF_NATIVE(mmu, read_cr2, "movq %cr2, %rax"); -DEF_NATIVE(mmu, read_cr3, "movq %cr3, %rax"); -DEF_NATIVE(mmu, write_cr3, "movq %rdi, %cr3"); -DEF_NATIVE(cpu, wbinvd, "wbinvd"); - -DEF_NATIVE(cpu, usergs_sysret64, "swapgs; sysretq"); -DEF_NATIVE(cpu, swapgs, "swapgs"); -DEF_NATIVE(, mov64, "mov %rdi, %rax"); - -unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) -{ - return paravirt_patch_insns(insnbuf, len, - start__mov64, end__mov64); -} -#endif - -#if defined(CONFIG_PARAVIRT_SPINLOCKS) -DEF_NATIVE(lock, queued_spin_unlock, "movb $0, (%rdi)"); -DEF_NATIVE(lock, vcpu_is_preempted, "xor %eax, %eax"); -#endif - -extern bool pv_is_native_spin_unlock(void); -extern bool pv_is_native_vcpu_is_preempted(void); - -unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len) -{ -#define PATCH_SITE(ops, x) \ - case PARAVIRT_PATCH(ops.x): \ - return paravirt_patch_insns(ibuf, len, start_##ops##_##x, end_##ops##_##x) - - switch (type) { -#ifdef CONFIG_PARAVIRT_XXL - PATCH_SITE(irq, restore_fl); - PATCH_SITE(irq, save_fl); - PATCH_SITE(irq, irq_enable); - PATCH_SITE(irq, irq_disable); - PATCH_SITE(cpu, usergs_sysret64); - PATCH_SITE(cpu, swapgs); - PATCH_SITE(cpu, wbinvd); - PATCH_SITE(mmu, read_cr2); - PATCH_SITE(mmu, read_cr3); - PATCH_SITE(mmu, write_cr3); -#endif -#if defined(CONFIG_PARAVIRT_SPINLOCKS) - case PARAVIRT_PATCH(lock.queued_spin_unlock): - if (pv_is_native_spin_unlock()) - return paravirt_patch_insns(ibuf, len, - start_lock_queued_spin_unlock, - end_lock_queued_spin_unlock); - break; - - case PARAVIRT_PATCH(lock.vcpu_is_preempted): - if (pv_is_native_vcpu_is_preempted()) - return paravirt_patch_insns(ibuf, len, - start_lock_vcpu_is_preempted, - end_lock_vcpu_is_preempted); - break; -#endif - - default: - break; - } -#undef PATCH_SITE - return paravirt_patch_default(type, ibuf, addr, len); -} diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index 07c30ee17425..bb7e1132290b 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -74,6 +74,9 @@ u64 perf_reg_value(struct pt_regs *regs, int idx) return regs_get_register(regs, pt_regs_offset[idx]); } +#define PERF_REG_X86_RESERVED (((1ULL << PERF_REG_X86_XMM0) - 1) & \ + ~((1ULL << PERF_REG_X86_MAX) - 1)) + #ifdef CONFIG_X86_32 #define REG_NOSUPPORT ((1ULL << PERF_REG_X86_R8) | \ (1ULL << PERF_REG_X86_R9) | \ @@ -86,7 +89,7 @@ u64 perf_reg_value(struct pt_regs *regs, int idx) int perf_reg_validate(u64 mask) { - if (!mask || (mask & REG_NOSUPPORT)) + if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED))) return -EINVAL; return 0; @@ -112,7 +115,7 @@ void perf_get_regs_user(struct perf_regs *regs_user, int perf_reg_validate(u64 mask) { - if (!mask || (mask & REG_NOSUPPORT)) + if (!mask || (mask & (REG_NOSUPPORT | PERF_REG_X86_RESERVED))) return -EINVAL; return 0; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 2399e910d109..b8ceec4974fe 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -62,27 +62,21 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; - unsigned long sp; - unsigned short ss, gs; + unsigned short gs; - if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss; + if (user_mode(regs)) gs = get_user_gs(regs); - } else { - sp = kernel_stack_pointer(regs); - savesegment(ss, ss); + else savesegment(gs, gs); - } show_ip(regs, KERN_DEFAULT); printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", - regs->si, regs->di, regs->bp, sp); + regs->si, regs->di, regs->bp, regs->sp); printk(KERN_DEFAULT "DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x EFLAGS: %08lx\n", - (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss, regs->flags); + (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, regs->ss, regs->flags); if (mode != SHOW_REGS_ALL) return; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index a166c960bc9e..71691a8310e7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -25,6 +25,7 @@ #include <linux/rcupdate.h> #include <linux/export.h> #include <linux/context_tracking.h> +#include <linux/nospec.h> #include <linux/uaccess.h> #include <asm/pgtable.h> @@ -154,35 +155,6 @@ static inline bool invalid_selector(u16 value) #define FLAG_MASK FLAG_MASK_32 -/* - * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode - * when it traps. The previous stack will be directly underneath the saved - * registers, and 'sp/ss' won't even have been saved. Thus the '®s->sp'. - * - * Now, if the stack is empty, '®s->sp' is out of range. In this - * case we try to take the previous stack. To always return a non-null - * stack pointer we fall back to regs as stack if no previous stack - * exists. - * - * This is valid only for kernel mode traps. - */ -unsigned long kernel_stack_pointer(struct pt_regs *regs) -{ - unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1); - unsigned long sp = (unsigned long)®s->sp; - u32 *prev_esp; - - if (context == (sp & ~(THREAD_SIZE - 1))) - return sp; - - prev_esp = (u32 *)(context); - if (*prev_esp) - return (unsigned long)*prev_esp; - - return (unsigned long)regs; -} -EXPORT_SYMBOL_GPL(kernel_stack_pointer); - static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); @@ -397,22 +369,12 @@ static int putreg(struct task_struct *child, case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_MAX) return -EIO; - /* - * When changing the FS base, use do_arch_prctl_64() - * to set the index to zero and to set the base - * as requested. - */ - if (child->thread.fsbase != value) - return do_arch_prctl_64(child, ARCH_SET_FS, value); + x86_fsbase_write_task(child, value); return 0; case offsetof(struct user_regs_struct,gs_base): - /* - * Exactly the same here as the %fs handling above. - */ if (value >= TASK_SIZE_MAX) return -EIO; - if (child->thread.gsbase != value) - return do_arch_prctl_64(child, ARCH_SET_GS, value); + x86_gsbase_write_task(child, value); return 0; #endif } @@ -645,7 +607,8 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) unsigned long val = 0; if (n < HBP_NUM) { - struct perf_event *bp = thread->ptrace_bps[n]; + int index = array_index_nospec(n, HBP_NUM); + struct perf_event *bp = thread->ptrace_bps[index]; if (bp) val = bp->hw.info.address; @@ -747,9 +710,6 @@ static int ioperm_get(struct task_struct *target, void ptrace_disable(struct task_struct *child) { user_disable_single_step(child); -#ifdef TIF_SYSCALL_EMU - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); -#endif } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION @@ -1361,18 +1321,19 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) #endif } -void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, - int error_code, int si_code) +void send_sigtrap(struct pt_regs *regs, int error_code, int si_code) { + struct task_struct *tsk = current; + tsk->thread.trap_nr = X86_TRAP_DB; tsk->thread.error_code = error_code; /* Send us the fake SIGTRAP */ force_sig_fault(SIGTRAP, si_code, - user_mode(regs) ? (void __user *)regs->ip : NULL, tsk); + user_mode(regs) ? (void __user *)regs->ip : NULL); } void user_single_step_report(struct pt_regs *regs) { - send_sigtrap(current, regs, 0, TRAP_BRKPT); + send_sigtrap(regs, 0, TRAP_BRKPT); } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 0ff3e294d0e5..10125358b9c4 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -3,6 +3,7 @@ */ +#include <linux/clocksource.h> #include <linux/kernel.h> #include <linux/percpu.h> #include <linux/notifier.h> diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 77630d57e7bf..ee26df08002e 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -1,9 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * relocate_kernel.S - put the kernel image in place to boot * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #include <linux/linkage.h> diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 11eda21eb697..c51ccff5cd01 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -1,9 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * relocate_kernel.S - put the kernel image in place to boot * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #include <linux/linkage.h> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 08a5f4a131f5..bbe35bf879f5 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -453,15 +453,24 @@ static void __init memblock_x86_reserve_range_setup_data(void) #define CRASH_ALIGN SZ_16M /* - * Keep the crash kernel below this limit. On 32 bits earlier kernels - * would limit the kernel to the low 512 MiB due to mapping restrictions. + * Keep the crash kernel below this limit. + * + * On 32 bits earlier kernels would limit the kernel to the low 512 MiB + * due to mapping restrictions. + * + * On 64bit, kdump kernel need be restricted to be under 64TB, which is + * the upper limit of system RAM in 4-level paing mode. Since the kdump + * jumping could be from 5-level to 4-level, the jumping will fail if + * kernel is put above 64TB, and there's no way to detect the paging mode + * of the kernel which will be loaded for dumping during the 1st kernel + * bootup. */ #ifdef CONFIG_X86_32 # define CRASH_ADDR_LOW_MAX SZ_512M # define CRASH_ADDR_HIGH_MAX SZ_512M #else # define CRASH_ADDR_LOW_MAX SZ_4G -# define CRASH_ADDR_HIGH_MAX MAXMEM +# define CRASH_ADDR_HIGH_MAX SZ_64T #endif static int __init reserve_crashkernel_low(void) @@ -827,8 +836,14 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) void __init setup_arch(char **cmdline_p) { + /* + * Reserve the memory occupied by the kernel between _text and + * __end_of_kernel_reserve symbols. Any kernel sections after the + * __end_of_kernel_reserve symbol must be explicitly reserved with a + * separate memblock_reserve() or they will be discarded. + */ memblock_reserve(__pa_symbol(_text), - (unsigned long)__bss_stop - (unsigned long)_text); + (unsigned long)__end_of_kernel_reserve - (unsigned long)_text); /* * Make sure page 0 is always reserved because on systems with diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 364813cea647..7cf508f78c8c 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -857,7 +857,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) pr_cont("\n"); } - force_sig(SIGSEGV, me); + force_sig(SIGSEGV); } #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 4693e2f3a03e..96421f97e75c 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -144,7 +144,7 @@ void native_send_call_func_ipi(const struct cpumask *mask) } cpumask_copy(allbutself, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), allbutself); + __cpumask_clear_cpu(smp_processor_id(), allbutself); if (cpumask_equal(mask, allbutself) && cpumask_equal(cpu_online_mask, cpu_callout_mask)) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 362dd8953f48..f78801114ee1 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -89,6 +89,10 @@ EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); +/* representing HT, core, and die siblings of each logical CPU */ +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); +EXPORT_PER_CPU_SYMBOL(cpu_die_map); + DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); /* Per CPU bogomips and other parameters */ @@ -99,6 +103,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); static unsigned int logical_packages __read_mostly; +static unsigned int logical_die __read_mostly; /* Maximum number of SMT threads on any online core */ int __read_mostly __max_smt_threads = 1; @@ -205,13 +210,19 @@ static int enable_start_cpu0; */ static void notrace start_secondary(void *unused) { + unsigned long cr4 = __read_cr4(); + /* * Don't put *anything* except direct CPU state initialization * before cpu_init(), SMP booting is too fragile that we want to * limit the things done here to the most necessary things. */ if (boot_cpu_has(X86_FEATURE_PCID)) - __write_cr4(__read_cr4() | X86_CR4_PCIDE); + cr4 |= X86_CR4_PCIDE; + if (static_branch_likely(&cr_pinning)) + cr4 |= cr4_pinned_bits; + + __write_cr4(cr4); #ifdef CONFIG_X86_32 /* switch away from the initial page table */ @@ -300,6 +311,26 @@ int topology_phys_to_logical_pkg(unsigned int phys_pkg) return -1; } EXPORT_SYMBOL(topology_phys_to_logical_pkg); +/** + * topology_phys_to_logical_die - Map a physical die id to logical + * + * Returns logical die id or -1 if not found + */ +int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu) +{ + int cpu; + int proc_id = cpu_data(cur_cpu).phys_proc_id; + + for_each_possible_cpu(cpu) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->initialized && c->cpu_die_id == die_id && + c->phys_proc_id == proc_id) + return c->logical_die_id; + } + return -1; +} +EXPORT_SYMBOL(topology_phys_to_logical_die); /** * topology_update_package_map - Update the physical to logical package map @@ -324,6 +355,29 @@ found: cpu_data(cpu).logical_proc_id = new; return 0; } +/** + * topology_update_die_map - Update the physical to logical die map + * @die: The die id as retrieved via CPUID + * @cpu: The cpu for which this is updated + */ +int topology_update_die_map(unsigned int die, unsigned int cpu) +{ + int new; + + /* Already available somewhere? */ + new = topology_phys_to_logical_die(die, cpu); + if (new >= 0) + goto found; + + new = logical_die++; + if (new != die) { + pr_info("CPU %u Converting physical %u to logical die %u\n", + cpu, die, new); + } +found: + cpu_data(cpu).logical_die_id = new; + return 0; +} void __init smp_store_boot_cpu_info(void) { @@ -333,6 +387,7 @@ void __init smp_store_boot_cpu_info(void) *c = boot_cpu_data; c->cpu_index = id; topology_update_package_map(c->phys_proc_id, id); + topology_update_die_map(c->cpu_die_id, id); c->initialized = true; } @@ -387,6 +442,7 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) int cpu1 = c->cpu_index, cpu2 = o->cpu_index; if (c->phys_proc_id == o->phys_proc_id && + c->cpu_die_id == o->cpu_die_id && per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) { if (c->cpu_core_id == o->cpu_core_id) return topology_sane(c, o, "smt"); @@ -398,6 +454,7 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) } } else if (c->phys_proc_id == o->phys_proc_id && + c->cpu_die_id == o->cpu_die_id && c->cpu_core_id == o->cpu_core_id) { return topology_sane(c, o, "smt"); } @@ -460,6 +517,15 @@ static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } +static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +{ + if ((c->phys_proc_id == o->phys_proc_id) && + (c->cpu_die_id == o->cpu_die_id)) + return true; + return false; +} + + #if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) static inline int x86_sched_itmt_flags(void) { @@ -522,6 +588,7 @@ void set_cpu_sibling_map(int cpu) cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu)); cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); cpumask_set_cpu(cpu, topology_core_cpumask(cpu)); + cpumask_set_cpu(cpu, topology_die_cpumask(cpu)); c->booted_cores = 1; return; } @@ -570,6 +637,9 @@ void set_cpu_sibling_map(int cpu) } if (match_pkg(c, o) && !topology_same_node(c, o)) x86_has_numa_in_package = true; + + if ((i == cpu) || (has_mp && match_die(c, o))) + link_mask(topology_die_cpumask, cpu, i); } threads = cpumask_weight(topology_sibling_cpumask(cpu)); @@ -1174,6 +1244,7 @@ static __init void disable_smp(void) physid_set_mask_of_physid(0, &phys_cpu_present_map); cpumask_set_cpu(0, topology_sibling_cpumask(0)); cpumask_set_cpu(0, topology_core_cpumask(0)); + cpumask_set_cpu(0, topology_die_cpumask(0)); } /* @@ -1269,6 +1340,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) for_each_possible_cpu(i) { zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); } @@ -1489,6 +1561,8 @@ static void remove_siblinginfo(int cpu) cpu_data(sibling).booted_cores--; } + for_each_cpu(sibling, topology_die_cpumask(cpu)) + cpumask_clear_cpu(cpu, topology_die_cpumask(sibling)); for_each_cpu(sibling, topology_sibling_cpumask(cpu)) cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) @@ -1496,6 +1570,7 @@ static void remove_siblinginfo(int cpu) cpumask_clear(cpu_llc_shared_mask(cpu)); cpumask_clear(topology_sibling_cpumask(cpu)); cpumask_clear(topology_core_cpumask(cpu)); + cpumask_clear(topology_die_cpumask(cpu)); c->cpu_core_id = 0; c->booted_cores = 0; cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 0e14f6c0d35e..7ce29cee9f9e 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -37,8 +37,7 @@ unsigned long profile_pc(struct pt_regs *regs) #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + sizeof(long)); #else - unsigned long *sp = - (unsigned long *)kernel_stack_pointer(regs); + unsigned long *sp = (unsigned long *)regs->sp; /* * Return address is either directly at stack pointer * or above a saved flags. Eflags has bits 22-31 zero, @@ -82,8 +81,11 @@ static void __init setup_default_timer_irq(void) /* Default timer init function */ void __init hpet_time_init(void) { - if (!hpet_enable()) - setup_pit_timer(); + if (!hpet_enable()) { + if (!pit_timer_init()) + return; + } + setup_default_timer_irq(); } diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index a5b802a12212..71d3fef1edc9 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -5,6 +5,7 @@ #include <linux/user.h> #include <linux/regset.h> #include <linux/syscalls.h> +#include <linux/nospec.h> #include <linux/uaccess.h> #include <asm/desc.h> @@ -220,6 +221,7 @@ int do_get_thread_area(struct task_struct *p, int idx, struct user_desc __user *u_info) { struct user_desc info; + int index; if (idx == -1 && get_user(idx, &u_info->entry_number)) return -EFAULT; @@ -227,8 +229,11 @@ int do_get_thread_area(struct task_struct *p, int idx, if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) return -EINVAL; - fill_user_desc(&info, idx, - &p->thread.tls_array[idx - GDT_ENTRY_TLS_MIN]); + index = idx - GDT_ENTRY_TLS_MIN; + index = array_index_nospec(index, + GDT_ENTRY_TLS_MAX - GDT_ENTRY_TLS_MIN + 1); + + fill_user_desc(&info, idx, &p->thread.tls_array[index]); if (copy_to_user(u_info, &info, sizeof(info))) return -EFAULT; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8b6d03e55d2f..87095a477154 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -254,9 +254,9 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, show_signal(tsk, signr, "trap ", str, regs, error_code); if (!sicode) - force_sig(signr, tsk); + force_sig(signr); else - force_sig_fault(signr, sicode, addr, tsk); + force_sig_fault(signr, sicode, addr); } NOKPROBE_SYMBOL(do_trap); @@ -566,7 +566,7 @@ do_general_protection(struct pt_regs *regs, long error_code) show_signal(tsk, SIGSEGV, "", desc, regs, error_code); - force_sig(SIGSEGV, tsk); + force_sig(SIGSEGV); } NOKPROBE_SYMBOL(do_general_protection); @@ -805,7 +805,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) } si_code = get_si_code(tsk->thread.debugreg6); if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) - send_sigtrap(tsk, regs, error_code, si_code); + send_sigtrap(regs, error_code, si_code); cond_local_irq_disable(regs); debug_stack_usage_dec(); @@ -856,7 +856,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) return; force_sig_fault(SIGFPE, si_code, - (void __user *)uprobe_get_trap_addr(regs), task); + (void __user *)uprobe_get_trap_addr(regs)); } dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 0b29e58f288e..57d87f79558f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -59,7 +59,7 @@ struct cyc2ns { static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); -void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data) +__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data) { int seq, idx; @@ -76,7 +76,7 @@ void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data) } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence))); } -void __always_inline cyc2ns_read_end(void) +__always_inline void cyc2ns_read_end(void) { preempt_enable_notrace(); } @@ -632,31 +632,38 @@ unsigned long native_calibrate_tsc(void) crystal_khz = ecx_hz / 1000; - if (crystal_khz == 0) { - switch (boot_cpu_data.x86_model) { - case INTEL_FAM6_SKYLAKE_MOBILE: - case INTEL_FAM6_SKYLAKE_DESKTOP: - case INTEL_FAM6_KABYLAKE_MOBILE: - case INTEL_FAM6_KABYLAKE_DESKTOP: - crystal_khz = 24000; /* 24.0 MHz */ - break; - case INTEL_FAM6_ATOM_GOLDMONT_X: - crystal_khz = 25000; /* 25.0 MHz */ - break; - case INTEL_FAM6_ATOM_GOLDMONT: - crystal_khz = 19200; /* 19.2 MHz */ - break; - } - } + /* + * Denverton SoCs don't report crystal clock, and also don't support + * CPUID.0x16 for the calculation below, so hardcode the 25MHz crystal + * clock. + */ + if (crystal_khz == 0 && + boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT_X) + crystal_khz = 25000; - if (crystal_khz == 0) - return 0; /* - * TSC frequency determined by CPUID is a "hardware reported" + * TSC frequency reported directly by CPUID is a "hardware reported" * frequency and is the most accurate one so far we have. This * is considered a known frequency. */ - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + if (crystal_khz != 0) + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + + /* + * Some Intel SoCs like Skylake and Kabylake don't report the crystal + * clock, but we can easily calculate it to a high degree of accuracy + * by considering the crystal ratio and the CPU speed. + */ + if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= 0x16) { + unsigned int eax_base_mhz, ebx, ecx, edx; + + cpuid(0x16, &eax_base_mhz, &ebx, &ecx, &edx); + crystal_khz = eax_base_mhz * 1000 * + eax_denominator / ebx_numerator; + } + + if (crystal_khz == 0) + return 0; /* * For Atom SoCs TSC is the only reliable clocksource. @@ -665,6 +672,16 @@ unsigned long native_calibrate_tsc(void) if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT) setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); +#ifdef CONFIG_X86_LOCAL_APIC + /* + * The local APIC appears to be fed by the core crystal clock + * (which sounds entirely sensible). We can set the global + * lapic_timer_period here to avoid having to calibrate the APIC + * timer later. + */ + lapic_timer_period = crystal_khz * 1000 / HZ; +#endif + return crystal_khz * ebx_numerator / eax_denominator; } diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 3d0e9aeea7c8..067858fe4db8 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -71,7 +71,7 @@ static const struct x86_cpu_id tsc_msr_cpu_ids[] = { /* * MSR-based CPU/TSC frequency discovery for certain CPUs. * - * Set global "lapic_timer_frequency" to bus_clock_cycles/jiffy + * Set global "lapic_timer_period" to bus_clock_cycles/jiffy * Return processor base frequency in KHz, or 0 on failure. */ unsigned long cpu_khz_from_msr(void) @@ -104,7 +104,7 @@ unsigned long cpu_khz_from_msr(void) res = freq * ratio; #ifdef CONFIG_X86_LOCAL_APIC - lapic_timer_frequency = (freq * 1000) / HZ; + lapic_timer_period = (freq * 1000) / HZ; #endif /* diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index f8f3cfda01ae..5b345add550f 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -277,7 +277,7 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs) tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE; tsk->thread.trap_nr = X86_TRAP_PF; - force_sig_fault(SIGSEGV, SEGV_MAPERR, addr, tsk); + force_sig_fault(SIGSEGV, SEGV_MAPERR, addr); if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV))) return; diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 6106760de716..a224b5ab103f 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -70,15 +70,6 @@ static void unwind_dump(struct unwind_state *state) } } -static size_t regs_size(struct pt_regs *regs) -{ - /* x86_32 regs from kernel mode are two words shorter: */ - if (IS_ENABLED(CONFIG_X86_32) && !user_mode(regs)) - return sizeof(*regs) - 2*sizeof(long); - - return sizeof(*regs); -} - static bool in_entry_code(unsigned long ip) { char *addr = (char *)ip; @@ -198,12 +189,6 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp) } #endif -#ifdef CONFIG_X86_32 -#define KERNEL_REGS_SIZE (sizeof(struct pt_regs) - 2*sizeof(long)) -#else -#define KERNEL_REGS_SIZE (sizeof(struct pt_regs)) -#endif - static bool update_stack_state(struct unwind_state *state, unsigned long *next_bp) { @@ -214,7 +199,7 @@ static bool update_stack_state(struct unwind_state *state, size_t len; if (state->regs) - prev_frame_end = (void *)state->regs + regs_size(state->regs); + prev_frame_end = (void *)state->regs + sizeof(*state->regs); else prev_frame_end = (void *)state->bp + FRAME_HEADER_SIZE; @@ -222,7 +207,7 @@ static bool update_stack_state(struct unwind_state *state, regs = decode_frame_pointer(next_bp); if (regs) { frame = (unsigned long *)regs; - len = KERNEL_REGS_SIZE; + len = sizeof(*regs); state->got_irq = true; } else { frame = next_bp; @@ -246,14 +231,6 @@ static bool update_stack_state(struct unwind_state *state, frame < prev_frame_end) return false; - /* - * On 32-bit with user mode regs, make sure the last two regs are safe - * to access: - */ - if (IS_ENABLED(CONFIG_X86_32) && regs && user_mode(regs) && - !on_stack(info, frame, len + 2*sizeof(long))) - return false; - /* Move state to the next frame: */ if (regs) { state->regs = regs; @@ -412,10 +389,9 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, * Pretend that the frame is complete and that BP points to it, but save * the real BP so that we can use it when looking for the next frame. */ - if (regs && regs->ip == 0 && - (unsigned long *)kernel_stack_pointer(regs) >= first_frame) { + if (regs && regs->ip == 0 && (unsigned long *)regs->sp >= first_frame) { state->next_bp = bp; - bp = ((unsigned long *)kernel_stack_pointer(regs)) - 1; + bp = ((unsigned long *)regs->sp) - 1; } /* Initialize stack info and make sure the frame data is accessible: */ diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 33b66b5c5aec..332ae6530fa8 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -82,9 +82,9 @@ static struct orc_entry *orc_find(unsigned long ip); * But they are copies of the ftrace entries that are static and * defined in ftrace_*.S, which do have orc entries. * - * If the undwinder comes across a ftrace trampoline, then find the + * If the unwinder comes across a ftrace trampoline, then find the * ftrace function that was used to create it, and use that ftrace - * function's orc entrie, as the placement of the return code in + * function's orc entry, as the placement of the return code in * the stack will be identical. */ static struct orc_entry *orc_ftrace_find(unsigned long ip) @@ -128,6 +128,16 @@ static struct orc_entry null_orc_entry = { .type = ORC_TYPE_CALL }; +/* Fake frame pointer entry -- used as a fallback for generated code */ +static struct orc_entry orc_fp_entry = { + .type = ORC_TYPE_CALL, + .sp_reg = ORC_REG_BP, + .sp_offset = 16, + .bp_reg = ORC_REG_PREV_SP, + .bp_offset = -16, + .end = 0, +}; + static struct orc_entry *orc_find(unsigned long ip) { static struct orc_entry *orc; @@ -392,8 +402,16 @@ bool unwind_next_frame(struct unwind_state *state) * calls and calls to noreturn functions. */ orc = orc_find(state->signal ? state->ip : state->ip - 1); - if (!orc) - goto err; + if (!orc) { + /* + * As a fallback, try to assume this code uses a frame pointer. + * This is useful for generated code, like BPF, which ORC + * doesn't know about. This is just a guess, so the rest of + * the unwind is no longer considered reliable. + */ + orc = &orc_fp_entry; + state->error = true; + } /* End-of-stack check for kernel threads: */ if (orc->sp_reg == ORC_REG_UNDEFINED) { @@ -580,7 +598,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, goto done; state->ip = regs->ip; - state->sp = kernel_stack_pointer(regs); + state->sp = regs->sp; state->bp = regs->bp; state->regs = regs; state->full_regs = true; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 918b5092a85f..d8359ebeea70 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -1074,7 +1074,7 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs pr_err("return address clobbered: pid=%d, %%sp=%#lx, %%ip=%#lx\n", current->pid, regs->sp, regs->ip); - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); } return -1; diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 3d3c2f71f617..a024c4f7ba56 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * * verify_cpu.S - Code for cpu long mode and SSE verification. This @@ -9,9 +10,6 @@ * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com) * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. - * * This is a common code for verification whether CPU supports * long mode and SSE or not. It is not called directly instead this * file is included at various places and compiled in that context. diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 6a38717d179c..a76c12b38e92 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -583,7 +583,7 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) return 1; /* we let this handle by the calling routine */ current->thread.trap_nr = trapno; current->thread.error_code = error_code; - force_sig(SIGTRAP, current); + force_sig(SIGTRAP); return 0; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0850b5149345..147cd020516a 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -368,6 +368,14 @@ SECTIONS __bss_stop = .; } + /* + * The memory occupied from _text to here, __end_of_kernel_reserve, is + * automatically reserved in setup_arch(). Anything after here must be + * explicitly reserved using memblock_reserve() or it will be discarded + * and treated as available memory. + */ + __end_of_kernel_reserve = .; + . = ALIGN(PAGE_SIZE); .brk : AT(ADDR(.brk) - LOAD_OFFSET) { __brk_base = .; @@ -379,10 +387,34 @@ SECTIONS . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */ _end = .; +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* + * Early scratch/workarea section: Lives outside of the kernel proper + * (_text - _end). + * + * Resides after _end because even though the .brk section is after + * __end_of_kernel_reserve, the .brk section is later reserved as a + * part of the kernel. Since it is located after __end_of_kernel_reserve + * it will be discarded and become part of the available memory. As + * such, it can only be used by very early boot code and must not be + * needed afterwards. + * + * Currently used by SME for performing in-place encryption of the + * kernel during boot. Resides on a 2MB boundary to simplify the + * pagetable setup used for SME in-place encryption. + */ + . = ALIGN(HPAGE_SIZE); + .init.scratch : AT(ADDR(.init.scratch) - LOAD_OFFSET) { + __init_scratch_begin = .; + *(.init.scratch) + . = ALIGN(HPAGE_SIZE); + __init_scratch_end = .; + } +#endif + STABS_DEBUG DWARF_DEBUG - /* Sections to be discarded */ DISCARDS /DISCARD/ : { *(.eh_frame) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index e18a9f9f65b5..4992e7c99588 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * cpuid support routines @@ -6,10 +7,6 @@ * * Copyright 2011 Red Hat, Inc. and/or its affiliates. * Copyright IBM Corporation, 2008 - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * */ #include <linux/kvm_host.h> diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 9a327d5b6d1f..d78a61408243 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -47,8 +47,6 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX}, [CPUID_7_0_EBX] = { 7, 0, CPUID_EBX}, [CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX}, - [CPUID_F_0_EDX] = { 0xf, 0, CPUID_EDX}, - [CPUID_F_1_EDX] = { 0xf, 1, CPUID_EDX}, [CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX}, [CPUID_6_EAX] = { 6, 0, CPUID_EAX}, [CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX}, diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index a2f3432ce090..329361b69d5e 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -1,11 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * * Copyright 2016 Red Hat, Inc. and/or its affiliates. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * */ #include <linux/kvm_host.h> #include <linux/debugfs.h> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d0d5dd44b4f4..4a387a235424 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /****************************************************************************** * emulate.c * @@ -14,9 +15,6 @@ * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 */ diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 8ca4b39918e0..a39e38f13029 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * KVM Microsoft Hyper-V emulation * @@ -15,10 +16,6 @@ * Amit Shah <amit.shah@qumranet.com> * Ben-Ami Yassour <benami@il.ibm.com> * Andrey Smetanin <asmetanin@virtuozzo.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * */ #include "x86.h" diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index fd7cf13a2144..757cb578101c 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * KVM Microsoft Hyper-V emulation * @@ -15,10 +16,6 @@ * Amit Shah <amit.shah@qumranet.com> * Ben-Ami Yassour <benami@il.ibm.com> * Andrey Smetanin <asmetanin@virtuozzo.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * */ #ifndef __ARCH_X86_KVM_HYPERV_H__ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4924f83ed4f3..4dabc318adb8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Local APIC virtualization @@ -13,9 +14,6 @@ * Yaozu (Eddie) Dong <eddie.dong@intel.com> * * Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. */ #include <linux/kvm_host.h> @@ -2341,7 +2339,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu) struct kvm_lapic *apic = vcpu->arch.apic; u32 ppr; - if (!apic_enabled(apic)) + if (!kvm_apic_hw_enabled(apic)) return -1; __apic_update_ppr(apic, &ppr); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1e9ba81accba..98f6e4f88b04 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * @@ -12,10 +13,6 @@ * Authors: * Yaniv Kamay <yaniv@qumranet.com> * Avi Kivity <avi@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * */ #include "irq.h" @@ -5602,14 +5599,18 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) struct page *page; int i; - if (tdp_enabled) - return 0; - /* - * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. - * Therefore we need to allocate shadow page tables in the first - * 4GB of memory, which happens to fit the DMA32 zone. + * When using PAE paging, the four PDPTEs are treated as 'root' pages, + * while the PDP table is a per-vCPU construct that's allocated at MMU + * creation. When emulating 32-bit mode, cr3 is only 32 bits even on + * x86_64. Therefore we need to allocate the PDP table in the first + * 4GB of memory, which happens to fit the DMA32 zone. Except for + * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can + * skip allocating the PDP table. */ + if (tdp_enabled && kvm_x86_ops->get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) + return 0; + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); if (!page) return -ENOMEM; diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index abac7e208853..ca39f62aabc6 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * mmu_audit.c: * @@ -11,10 +12,6 @@ * Avi Kivity <avi@qumranet.com> * Marcelo Tosatti <mtosatti@redhat.com> * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * */ #include <linux/ratelimit.h> diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 9f72cc427158..25ce3edd1872 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * vMTRR implementation * @@ -11,9 +12,6 @@ * Marcelo Tosatti <mtosatti@redhat.com> * Paolo Bonzini <pbonzini@redhat.com> * Xiao Guangrong <guangrong.xiao@linux.intel.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. */ #include <linux/kvm_host.h> diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index fd04d462fdae..3521e2d176f2 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Support KVM gust page tracking * @@ -8,9 +9,6 @@ * * Author: * Xiao Guangrong <guangrong.xiao@linux.intel.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. */ #include <linux/kvm_host.h> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 367a47df4ba0..d583bcd119fc 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Kernel-based Virtual Machine driver for Linux * @@ -12,10 +13,6 @@ * Authors: * Yaniv Kamay <yaniv@qumranet.com> * Avi Kivity <avi@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * */ /* diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index dd745b58ffd8..ab73a9a639ae 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine -- Performance Monitoring Unit support * @@ -7,10 +8,6 @@ * Avi Kivity <avi@redhat.com> * Gleb Natapov <gleb@redhat.com> * Wei Huang <wei@redhat.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * */ #include <linux/types.h> @@ -264,10 +261,10 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) ctr_val = rdtsc(); break; case VMWARE_BACKDOOR_PMC_REAL_TIME: - ctr_val = ktime_get_boot_ns(); + ctr_val = ktime_get_boottime_ns(); break; case VMWARE_BACKDOOR_PMC_APPARENT_TIME: - ctr_val = ktime_get_boot_ns() + + ctr_val = ktime_get_boottime_ns() + vcpu->kvm->arch.kvmclock_offset; break; default: diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c index d3118088f1cd..c8388389a3b0 100644 --- a/arch/x86/kvm/pmu_amd.c +++ b/arch/x86/kvm/pmu_amd.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * KVM PMU support for AMD * @@ -6,9 +7,6 @@ * Author: * Wei Huang <wei@redhat.com> * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * * Implementation is based on pmu_intel.c file */ #include <linux/types.h> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 735b8c01895e..48c865a4e5dd 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * @@ -9,10 +10,6 @@ * Authors: * Yaniv Kamay <yaniv@qumranet.com> * Avi Kivity <avi@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * */ #define pr_fmt(fmt) "SVM: " fmt diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1032f068f0b9..46af3a5e9209 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1397,7 +1397,7 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) } if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { vmcs12->exception_bitmap = evmcs->exception_bitmap; } @@ -1437,7 +1437,7 @@ static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx) } if (unlikely(!(evmcs->hv_clean_fields & - HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { + HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { vmcs12->pin_based_vm_exec_control = evmcs->pin_based_vm_exec_control; vmcs12->vm_exit_controls = evmcs->vm_exit_controls; @@ -5226,40 +5226,42 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12; struct kvm_nested_state kvm_state = { .flags = 0, - .format = 0, + .format = KVM_STATE_NESTED_FORMAT_VMX, .size = sizeof(kvm_state), - .vmx.vmxon_pa = -1ull, - .vmx.vmcs_pa = -1ull, + .hdr.vmx.vmxon_pa = -1ull, + .hdr.vmx.vmcs12_pa = -1ull, }; + struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = + &user_kvm_nested_state->data.vmx[0]; if (!vcpu) - return kvm_state.size + 2 * VMCS12_SIZE; + return kvm_state.size + sizeof(*user_vmx_nested_state); vmx = to_vmx(vcpu); vmcs12 = get_vmcs12(vcpu); - if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled) - kvm_state.flags |= KVM_STATE_NESTED_EVMCS; - if (nested_vmx_allowed(vcpu) && (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { - kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr; - kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr; + kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; + kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; if (vmx_has_valid_vmcs12(vcpu)) { - kvm_state.size += VMCS12_SIZE; + kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); + + if (vmx->nested.hv_evmcs) + kvm_state.flags |= KVM_STATE_NESTED_EVMCS; if (is_guest_mode(vcpu) && nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != -1ull) - kvm_state.size += VMCS12_SIZE; + kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); } if (vmx->nested.smm.vmxon) - kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; + kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; if (vmx->nested.smm.guest_mode) - kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; + kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; if (is_guest_mode(vcpu)) { kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; @@ -5294,16 +5296,19 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, copy_shadow_to_vmcs12(vmx); } + BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); + BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); + /* * Copy over the full allocated size of vmcs12 rather than just the size * of the struct. */ - if (copy_to_user(user_kvm_nested_state->data, vmcs12, VMCS12_SIZE)) + if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) return -EFAULT; if (nested_cpu_has_shadow_vmcs(vmcs12) && vmcs12->vmcs_link_pointer != -1ull) { - if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE, + if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, get_shadow_vmcs12(vcpu), VMCS12_SIZE)) return -EFAULT; } @@ -5331,33 +5336,44 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12; u32 exit_qual; + struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = + &user_kvm_nested_state->data.vmx[0]; int ret; - if (kvm_state->format != 0) + if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) return -EINVAL; - if (!nested_vmx_allowed(vcpu)) - return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL; + if (kvm_state->hdr.vmx.vmxon_pa == -1ull) { + if (kvm_state->hdr.vmx.smm.flags) + return -EINVAL; - if (kvm_state->vmx.vmxon_pa == -1ull) { - if (kvm_state->vmx.smm.flags) + if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) return -EINVAL; - if (kvm_state->vmx.vmcs_pa != -1ull) + /* + * KVM_STATE_NESTED_EVMCS used to signal that KVM should + * enable eVMCS capability on vCPU. However, since then + * code was changed such that flag signals vmcs12 should + * be copied into eVMCS in guest memory. + * + * To preserve backwards compatability, allow user + * to set this flag even when there is no VMXON region. + */ + if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) + return -EINVAL; + } else { + if (!nested_vmx_allowed(vcpu)) return -EINVAL; - vmx_leave_nested(vcpu); - return 0; + if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) + return -EINVAL; } - if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa)) - return -EINVAL; - - if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) return -EINVAL; - if (kvm_state->vmx.smm.flags & + if (kvm_state->hdr.vmx.smm.flags & ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; @@ -5366,21 +5382,26 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags * must be zero. */ - if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags) + if (is_smm(vcpu) ? + (kvm_state->flags & + (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) + : kvm_state->hdr.vmx.smm.flags) return -EINVAL; - if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && - !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) + if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && + !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; + if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && + (!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled)) + return -EINVAL; + vmx_leave_nested(vcpu); - if (kvm_state->vmx.vmxon_pa == -1ull) - return 0; - if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) - nested_enable_evmcs(vcpu, NULL); + if (kvm_state->hdr.vmx.vmxon_pa == -1ull) + return 0; - vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa; + vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; ret = enter_vmx_operation(vcpu); if (ret) return ret; @@ -5389,12 +5410,12 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) return 0; - if (kvm_state->vmx.vmcs_pa != -1ull) { - if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa || - !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa)) + if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) { + if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || + !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) return -EINVAL; - set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa); + set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { /* * Sync eVMCS upon entry as we may not have @@ -5405,16 +5426,16 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return -EINVAL; } - if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { + if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { vmx->nested.smm.vmxon = true; vmx->nested.vmxon = false; - if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) + if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) vmx->nested.smm.guest_mode = true; } vmcs12 = get_vmcs12(vcpu); - if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12))) + if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) return -EFAULT; if (vmcs12->hdr.revision_id != VMCS12_REVISION) @@ -5431,12 +5452,14 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, vmcs12->vmcs_link_pointer != -1ull) { struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); - if (kvm_state->size < sizeof(*kvm_state) + VMCS12_SIZE + sizeof(*vmcs12)) + if (kvm_state->size < + sizeof(*kvm_state) + + sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) goto error_guest_mode; if (copy_from_user(shadow_vmcs12, - user_kvm_nested_state->data + VMCS12_SIZE, - sizeof(*vmcs12))) { + user_vmx_nested_state->shadow_vmcs12, + sizeof(*shadow_vmcs12))) { ret = -EFAULT; goto error_guest_mode; } diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index a99613a060dd..68d231d49c7a 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * KVM PMU support for Intel CPUs * @@ -6,10 +7,6 @@ * Authors: * Avi Kivity <avi@redhat.com> * Gleb Natapov <gleb@redhat.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * */ #include <linux/types.h> #include <linux/kvm_host.h> diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 3a742428ad17..337718fc8a36 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -201,9 +201,10 @@ struct __packed vmcs12 { /* * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region * and any VMCS region. Although only sizeof(struct vmcs12) are used by the - * current implementation, 4K are reserved to avoid future complications. + * current implementation, 4K are reserved to avoid future complications and + * to preserve userspace ABI. */ -#define VMCS12_SIZE 0x1000 +#define VMCS12_SIZE KVM_STATE_NESTED_VMX_VMCS_SIZE /* * VMCS12_MAX_FIELD_INDEX is the highest index value used in any diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b93e36ddee5e..d98eac371c0a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * @@ -10,10 +11,6 @@ * Authors: * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * */ #include <linux/frame.h> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 83aefd759846..63bb1ee8258e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * @@ -13,10 +14,6 @@ * Yaniv Kamay <yaniv@qumranet.com> * Amit Shah <amit.shah@qumranet.com> * Ben-Ami Yassour <benami@il.ibm.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * */ #include <linux/kvm_host.h> @@ -70,6 +67,7 @@ #include <asm/mshyperv.h> #include <asm/hypervisor.h> #include <asm/intel_pt.h> +#include <clocksource/hyperv_timer.h> #define CREATE_TRACE_POINTS #include "trace.h" @@ -1557,7 +1555,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) vcpu->arch.tsc_always_catchup = 1; return 0; } else { - WARN(1, "user requested TSC rate below hardware speed\n"); + pr_warn_ratelimited("user requested TSC rate below hardware speed\n"); return -1; } } @@ -1567,8 +1565,8 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) user_tsc_khz, tsc_khz); if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { - WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", - user_tsc_khz); + pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", + user_tsc_khz); return -1; } @@ -1731,7 +1729,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data); - ns = ktime_get_boot_ns(); + ns = ktime_get_boottime_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { @@ -2073,7 +2071,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) spin_lock(&ka->pvclock_gtod_sync_lock); if (!ka->use_master_clock) { spin_unlock(&ka->pvclock_gtod_sync_lock); - return ktime_get_boot_ns() + ka->kvmclock_offset; + return ktime_get_boottime_ns() + ka->kvmclock_offset; } hv_clock.tsc_timestamp = ka->master_cycle_now; @@ -2089,7 +2087,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) &hv_clock.tsc_to_system_mul); ret = __pvclock_read_cycles(&hv_clock, rdtsc()); } else - ret = ktime_get_boot_ns() + ka->kvmclock_offset; + ret = ktime_get_boottime_ns() + ka->kvmclock_offset; put_cpu(); @@ -2188,7 +2186,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) } if (!use_master_clock) { host_tsc = rdtsc(); - kernel_ns = ktime_get_boot_ns(); + kernel_ns = ktime_get_boottime_ns(); } tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); @@ -9018,7 +9016,7 @@ int kvm_arch_hardware_enable(void) * before any KVM threads can be running. Unfortunately, we can't * bring the TSCs fully up to date with real time, as we aren't yet far * enough into CPU bringup that we know how much real time has actually - * elapsed; our helper function, ktime_get_boot_ns() will be using boot + * elapsed; our helper function, ktime_get_boottime_ns() will be using boot * variables that haven't been updated yet. * * So we simply find the maximum observed TSC above, then record the @@ -9246,7 +9244,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) mutex_init(&kvm->arch.apic_map_lock); spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); - kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); + kvm->arch.kvmclock_offset = -ktime_get_boottime_ns(); pvclock_update_vm_gtod_copy(kvm); kvm->arch.guest_can_read_msr_platform_info = true; diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c index 1811fa4a1b1a..7c48ff4ae8d1 100644 --- a/arch/x86/lib/cache-smp.c +++ b/arch/x86/lib/cache-smp.c @@ -15,6 +15,7 @@ EXPORT_SYMBOL(wbinvd_on_cpu); int wbinvd_on_all_cpus(void) { - return on_each_cpu(__wbinvd, NULL, 1); + on_each_cpu(__wbinvd, NULL, 1); + return 0; } EXPORT_SYMBOL(wbinvd_on_all_cpus); diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 3261abb21ef4..4f1719e22d3c 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -1,6 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * This file is part of the Linux kernel, and is made available under - * the terms of the GNU General Public License version 2. * * Misc librarized functions for cmdline poking. */ diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S index 33147fef3452..a9bdf0805be0 100644 --- a/arch/x86/lib/iomap_copy_64.S +++ b/arch/x86/lib/iomap_copy_64.S @@ -1,18 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright 2006 PathScale, Inc. All Rights Reserved. - * - * This file is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include <linux/linkage.h> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 46df4c6aae46..794f364cb882 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -710,6 +710,10 @@ static void set_signal_archinfo(unsigned long address, * To avoid leaking information about the kernel page * table layout, pretend that user-mode accesses to * kernel addresses are always protection faults. + * + * NB: This means that failed vsyscalls with vsyscall=none + * will have the PROT bit. This doesn't leak any + * information and does not appear to cause any problems. */ if (address >= TASK_SIZE_MAX) error_code |= X86_PF_PROT; @@ -756,8 +760,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, set_signal_archinfo(address, error_code); /* XXX: hwpoison faults will set the wrong code. */ - force_sig_fault(signal, si_code, (void __user *)address, - tsk); + force_sig_fault(signal, si_code, (void __user *)address); } /* @@ -918,7 +921,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (si_code == SEGV_PKUERR) force_sig_pkuerr((void __user *)address, pkey); - force_sig_fault(SIGSEGV, si_code, (void __user *)address, tsk); + force_sig_fault(SIGSEGV, si_code, (void __user *)address); return; } @@ -1015,8 +1018,6 @@ static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, vm_fault_t fault) { - struct task_struct *tsk = current; - /* Kernel mode? Handle exceptions or die: */ if (!(error_code & X86_PF_USER)) { no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); @@ -1031,6 +1032,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, #ifdef CONFIG_MEMORY_FAILURE if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { + struct task_struct *tsk = current; unsigned lsb = 0; pr_err( @@ -1040,11 +1042,11 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); if (fault & VM_FAULT_HWPOISON) lsb = PAGE_SHIFT; - force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, tsk); + force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb); return; } #endif - force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, tsk); + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); } static noinline void @@ -1369,16 +1371,18 @@ void do_user_addr_fault(struct pt_regs *regs, #ifdef CONFIG_X86_64 /* - * Instruction fetch faults in the vsyscall page might need - * emulation. The vsyscall page is at a high address - * (>PAGE_OFFSET), but is considered to be part of the user - * address space. + * Faults in the vsyscall page might need emulation. The + * vsyscall page is at a high address (>PAGE_OFFSET), but is + * considered to be part of the user address space. * * The vsyscall page does not have a "real" VMA, so do this * emulation before we go searching for VMAs. + * + * PKRU never rejects instruction fetches, so we don't need + * to consider the PF_PK bit. */ - if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) { - if (emulate_vsyscall(regs, address)) + if (is_vsyscall_vaddr(address)) { + if (emulate_vsyscall(hw_error_code, regs, address)) return; } #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 693aaf28d5fe..0f01c7b1d217 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -671,23 +671,25 @@ static unsigned long __meminit phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, unsigned long page_size_mask, bool init) { - unsigned long paddr_next, paddr_last = paddr_end; - unsigned long vaddr = (unsigned long)__va(paddr); - int i = p4d_index(vaddr); + unsigned long vaddr, vaddr_end, vaddr_next, paddr_next, paddr_last; + + paddr_last = paddr_end; + vaddr = (unsigned long)__va(paddr); + vaddr_end = (unsigned long)__va(paddr_end); if (!pgtable_l5_enabled()) return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask, init); - for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) { - p4d_t *p4d; + for (; vaddr < vaddr_end; vaddr = vaddr_next) { + p4d_t *p4d = p4d_page + p4d_index(vaddr); pud_t *pud; - vaddr = (unsigned long)__va(paddr); - p4d = p4d_page + p4d_index(vaddr); - paddr_next = (paddr & P4D_MASK) + P4D_SIZE; + vaddr_next = (vaddr & P4D_MASK) + P4D_SIZE; + paddr = __pa(vaddr); if (paddr >= paddr_end) { + paddr_next = __pa(vaddr_next); if (!after_bootmem && !e820__mapped_any(paddr & P4D_MASK, paddr_next, E820_TYPE_RAM) && @@ -699,13 +701,13 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end, if (!p4d_none(*p4d)) { pud = pud_offset(p4d, 0); - paddr_last = phys_pud_init(pud, paddr, paddr_end, - page_size_mask, init); + paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end), + page_size_mask, init); continue; } pud = alloc_low_page(); - paddr_last = phys_pud_init(pud, paddr, paddr_end, + paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end), page_size_mask, init); spin_lock(&init_mm.page_table_lock); diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 4b6423e7bd21..e500f1df1140 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -28,9 +28,11 @@ #include "physaddr.h" -struct ioremap_mem_flags { - bool system_ram; - bool desc_other; +/* + * Descriptor controlling ioremap() behavior. + */ +struct ioremap_desc { + unsigned int flags; }; /* @@ -62,13 +64,14 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, return err; } -static bool __ioremap_check_ram(struct resource *res) +/* Does the range (or a subset of) contain normal RAM? */ +static unsigned int __ioremap_check_ram(struct resource *res) { unsigned long start_pfn, stop_pfn; unsigned long i; if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM) - return false; + return 0; start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT; stop_pfn = (res->end + 1) >> PAGE_SHIFT; @@ -76,28 +79,44 @@ static bool __ioremap_check_ram(struct resource *res) for (i = 0; i < (stop_pfn - start_pfn); ++i) if (pfn_valid(start_pfn + i) && !PageReserved(pfn_to_page(start_pfn + i))) - return true; + return IORES_MAP_SYSTEM_RAM; } - return false; + return 0; } -static int __ioremap_check_desc_other(struct resource *res) +/* + * In a SEV guest, NONE and RESERVED should not be mapped encrypted because + * there the whole memory is already encrypted. + */ +static unsigned int __ioremap_check_encrypted(struct resource *res) { - return (res->desc != IORES_DESC_NONE); + if (!sev_active()) + return 0; + + switch (res->desc) { + case IORES_DESC_NONE: + case IORES_DESC_RESERVED: + break; + default: + return IORES_MAP_ENCRYPTED; + } + + return 0; } -static int __ioremap_res_check(struct resource *res, void *arg) +static int __ioremap_collect_map_flags(struct resource *res, void *arg) { - struct ioremap_mem_flags *flags = arg; + struct ioremap_desc *desc = arg; - if (!flags->system_ram) - flags->system_ram = __ioremap_check_ram(res); + if (!(desc->flags & IORES_MAP_SYSTEM_RAM)) + desc->flags |= __ioremap_check_ram(res); - if (!flags->desc_other) - flags->desc_other = __ioremap_check_desc_other(res); + if (!(desc->flags & IORES_MAP_ENCRYPTED)) + desc->flags |= __ioremap_check_encrypted(res); - return flags->system_ram && flags->desc_other; + return ((desc->flags & (IORES_MAP_SYSTEM_RAM | IORES_MAP_ENCRYPTED)) == + (IORES_MAP_SYSTEM_RAM | IORES_MAP_ENCRYPTED)); } /* @@ -106,15 +125,15 @@ static int __ioremap_res_check(struct resource *res, void *arg) * resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES). */ static void __ioremap_check_mem(resource_size_t addr, unsigned long size, - struct ioremap_mem_flags *flags) + struct ioremap_desc *desc) { u64 start, end; start = (u64)addr; end = start + size - 1; - memset(flags, 0, sizeof(*flags)); + memset(desc, 0, sizeof(struct ioremap_desc)); - walk_mem_res(start, end, flags, __ioremap_res_check); + walk_mem_res(start, end, desc, __ioremap_collect_map_flags); } /* @@ -131,15 +150,15 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size, * have to convert them into an offset in a page-aligned mapping, but the * caller shouldn't need to know that small detail. */ -static void __iomem *__ioremap_caller(resource_size_t phys_addr, - unsigned long size, enum page_cache_mode pcm, - void *caller, bool encrypted) +static void __iomem * +__ioremap_caller(resource_size_t phys_addr, unsigned long size, + enum page_cache_mode pcm, void *caller, bool encrypted) { unsigned long offset, vaddr; resource_size_t last_addr; const resource_size_t unaligned_phys_addr = phys_addr; const unsigned long unaligned_size = size; - struct ioremap_mem_flags mem_flags; + struct ioremap_desc io_desc; struct vm_struct *area; enum page_cache_mode new_pcm; pgprot_t prot; @@ -158,12 +177,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return NULL; } - __ioremap_check_mem(phys_addr, size, &mem_flags); + __ioremap_check_mem(phys_addr, size, &io_desc); /* * Don't allow anybody to remap normal RAM that we're using.. */ - if (mem_flags.system_ram) { + if (io_desc.flags & IORES_MAP_SYSTEM_RAM) { WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n", &phys_addr, &last_addr); return NULL; @@ -201,7 +220,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * resulting mapping. */ prot = PAGE_KERNEL_IO; - if ((sev_active() && mem_flags.desc_other) || encrypted) + if ((io_desc.flags & IORES_MAP_ENCRYPTED) || encrypted) prot = pgprot_encrypted(prot); switch (pcm) { diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 8dc0fc0b1382..296da58f3013 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -199,7 +199,7 @@ static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr) if (!pgtable_l5_enabled()) return (p4d_t *)pgd; - p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK; + p4d = pgd_val(*pgd) & PTE_PFN_MASK; p4d += __START_KERNEL_map - phys_base; return (p4d_t *)p4d + p4d_index(addr); } diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index dc3f058bdf9b..dc6182eecefa 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -52,7 +52,7 @@ static __initdata struct kaslr_memory_region { } kaslr_regions[] = { { &page_offset_base, 0 }, { &vmalloc_base, 0 }, - { &vmemmap_base, 1 }, + { &vmemmap_base, 0 }, }; /* Get size in bytes used by the memory region */ @@ -78,6 +78,7 @@ void __init kernel_randomize_memory(void) unsigned long rand, memory_tb; struct rnd_state rand_state; unsigned long remain_entropy; + unsigned long vmemmap_size; vaddr_start = pgtable_l5_enabled() ? __PAGE_OFFSET_BASE_L5 : __PAGE_OFFSET_BASE_L4; vaddr = vaddr_start; @@ -109,6 +110,14 @@ void __init kernel_randomize_memory(void) if (memory_tb < kaslr_regions[0].size_tb) kaslr_regions[0].size_tb = memory_tb; + /* + * Calculate the vmemmap region size in TBs, aligned to a TB + * boundary. + */ + vmemmap_size = (kaslr_regions[0].size_tb << (TB_SHIFT - PAGE_SHIFT)) * + sizeof(struct page); + kaslr_regions[2].size_tb = DIV_ROUND_UP(vmemmap_size, 1UL << TB_SHIFT); + /* Calculate entropy available between regions */ remain_entropy = vaddr_end - vaddr_start; for (i = 0; i < ARRAY_SIZE(kaslr_regions); i++) diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 51f50a7a07ef..e0df96fdfe46 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -1,13 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * AMD Memory Encryption Support * * Copyright (C) 2016 Advanced Micro Devices, Inc. * * Author: Tom Lendacky <thomas.lendacky@amd.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #define DISABLE_BRANCH_PROFILING diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S index 40a6085063d6..6d71481a1e70 100644 --- a/arch/x86/mm/mem_encrypt_boot.S +++ b/arch/x86/mm/mem_encrypt_boot.S @@ -1,13 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * AMD Memory Encryption Support * * Copyright (C) 2016 Advanced Micro Devices, Inc. * * Author: Tom Lendacky <thomas.lendacky@amd.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/linkage.h> diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 4aa9b1480866..e2b0e2ac07bb 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -1,13 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * AMD Memory Encryption Support * * Copyright (C) 2016 Advanced Micro Devices, Inc. * * Author: Tom Lendacky <thomas.lendacky@amd.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #define DISABLE_BRANCH_PROFILING @@ -73,6 +70,19 @@ struct sme_populate_pgd_data { unsigned long vaddr_end; }; +/* + * This work area lives in the .init.scratch section, which lives outside of + * the kernel proper. It is sized to hold the intermediate copy buffer and + * more than enough pagetable pages. + * + * By using this section, the kernel can be encrypted in place and it + * avoids any possibility of boot parameters or initramfs images being + * placed such that the in-place encryption logic overwrites them. This + * section is 2MB aligned to allow for simple pagetable setup using only + * PMD entries (see vmlinux.lds.S). + */ +static char sme_workarea[2 * PMD_PAGE_SIZE] __section(.init.scratch); + static char sme_cmdline_arg[] __initdata = "mem_encrypt"; static char sme_cmdline_on[] __initdata = "on"; static char sme_cmdline_off[] __initdata = "off"; @@ -314,8 +324,13 @@ void __init sme_encrypt_kernel(struct boot_params *bp) } #endif - /* Set the encryption workarea to be immediately after the kernel */ - workarea_start = kernel_end; + /* + * We're running identity mapped, so we must obtain the address to the + * SME encryption workarea using rip-relative addressing. + */ + asm ("lea sme_workarea(%%rip), %0" + : "=r" (workarea_start) + : "p" (sme_workarea)); /* * Calculate required number of workarea bytes needed: diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 0d1c47cbbdd6..895fb7a9294d 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -912,7 +912,7 @@ void mpx_notify_unmap(struct mm_struct *mm, unsigned long start, ret = mpx_unmap_tables(mm, start, end); if (ret) - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); } /* MPX cannot handle addresses above 47 bits yet. */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 32bfab4e21eb..eaaed5bfc4a4 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -186,9 +186,7 @@ struct jit_context { #define BPF_MAX_INSN_SIZE 128 #define BPF_INSN_SAFETY 64 -#define AUX_STACK_SPACE 40 /* Space for RBX, R13, R14, R15, tailcnt */ - -#define PROLOGUE_SIZE 37 +#define PROLOGUE_SIZE 20 /* * Emit x86-64 prologue code for BPF program and check its size. @@ -199,44 +197,19 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) u8 *prog = *pprog; int cnt = 0; - /* push rbp */ - EMIT1(0x55); - - /* mov rbp,rsp */ - EMIT3(0x48, 0x89, 0xE5); - - /* sub rsp, rounded_stack_depth + AUX_STACK_SPACE */ - EMIT3_off32(0x48, 0x81, 0xEC, - round_up(stack_depth, 8) + AUX_STACK_SPACE); - - /* sub rbp, AUX_STACK_SPACE */ - EMIT4(0x48, 0x83, 0xED, AUX_STACK_SPACE); - - /* mov qword ptr [rbp+0],rbx */ - EMIT4(0x48, 0x89, 0x5D, 0); - /* mov qword ptr [rbp+8],r13 */ - EMIT4(0x4C, 0x89, 0x6D, 8); - /* mov qword ptr [rbp+16],r14 */ - EMIT4(0x4C, 0x89, 0x75, 16); - /* mov qword ptr [rbp+24],r15 */ - EMIT4(0x4C, 0x89, 0x7D, 24); - + EMIT1(0x55); /* push rbp */ + EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ + /* sub rsp, rounded_stack_depth */ + EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8)); + EMIT1(0x53); /* push rbx */ + EMIT2(0x41, 0x55); /* push r13 */ + EMIT2(0x41, 0x56); /* push r14 */ + EMIT2(0x41, 0x57); /* push r15 */ if (!ebpf_from_cbpf) { - /* - * Clear the tail call counter (tail_call_cnt): for eBPF tail - * calls we need to reset the counter to 0. It's done in two - * instructions, resetting RAX register to 0, and moving it - * to the counter location. - */ - - /* xor eax, eax */ - EMIT2(0x31, 0xc0); - /* mov qword ptr [rbp+32], rax */ - EMIT4(0x48, 0x89, 0x45, 32); - + /* zero init tail_call_cnt */ + EMIT2(0x6a, 0x00); BUILD_BUG_ON(cnt != PROLOGUE_SIZE); } - *pprog = prog; } @@ -281,13 +254,13 @@ static void emit_bpf_tail_call(u8 **pprog) * if (tail_call_cnt > MAX_TAIL_CALL_CNT) * goto out; */ - EMIT2_off32(0x8B, 0x85, 36); /* mov eax, dword ptr [rbp + 36] */ + EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ #define OFFSET2 (30 + RETPOLINE_RAX_BPF_JIT_SIZE) EMIT2(X86_JA, OFFSET2); /* ja out */ label2 = cnt; EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ - EMIT2_off32(0x89, 0x85, 36); /* mov dword ptr [rbp + 36], eax */ + EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */ /* prog = array->ptrs[index]; */ EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */ @@ -1036,19 +1009,14 @@ emit_jmp: seen_exit = true; /* Update cleanup_addr */ ctx->cleanup_addr = proglen; - /* mov rbx, qword ptr [rbp+0] */ - EMIT4(0x48, 0x8B, 0x5D, 0); - /* mov r13, qword ptr [rbp+8] */ - EMIT4(0x4C, 0x8B, 0x6D, 8); - /* mov r14, qword ptr [rbp+16] */ - EMIT4(0x4C, 0x8B, 0x75, 16); - /* mov r15, qword ptr [rbp+24] */ - EMIT4(0x4C, 0x8B, 0x7D, 24); - - /* add rbp, AUX_STACK_SPACE */ - EMIT4(0x48, 0x83, 0xC5, AUX_STACK_SPACE); - EMIT1(0xC9); /* leave */ - EMIT1(0xC3); /* ret */ + if (!bpf_prog_was_classic(bpf_prog)) + EMIT1(0x5B); /* get rid of tail_call_cnt */ + EMIT2(0x41, 0x5F); /* pop r15 */ + EMIT2(0x41, 0x5E); /* pop r14 */ + EMIT2(0x41, 0x5D); /* pop r13 */ + EMIT1(0x5B); /* pop rbx */ + EMIT1(0xC9); /* leave */ + EMIT1(0xC3); /* ret */ break; default: diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 632b83885867..3b9fd679cea9 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -728,7 +728,7 @@ void efi_recover_from_page_fault(unsigned long phys_addr) * Address range 0x0000 - 0x0fff is always mapped in the efi_pgd, so * page faulting on these addresses isn't expected. */ - if (phys_addr >= 0x0000 && phys_addr <= 0x0fff) + if (phys_addr <= 0x0fff) return; /* diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c index 1865c196f136..c33f744b5388 100644 --- a/arch/x86/platform/geode/alix.c +++ b/arch/x86/platform/geode/alix.c @@ -1,9 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * System Specific setup for PCEngines ALIX. * At the moment this means setup of GPIO control of LEDs * on Alix.2/3/6 boards. * - * * Copyright (C) 2008 Constantin Baranov <const@mimas.ru> * Copyright (C) 2011 Ed Wildgoose <kernel@wildgooses.com> * and Philip Prindeville <philipp@redfish-solutions.com> @@ -11,10 +11,6 @@ * TODO: There are large similarities with leds-net5501.c * by Alessandro Zummo <a.zummo@towertech.it> * In the future leds-net5501.c should be migrated over to platform - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. */ #include <linux/kernel.h> @@ -24,7 +20,6 @@ #include <linux/moduleparam.h> #include <linux/leds.h> #include <linux/platform_device.h> -#include <linux/gpio.h> #include <linux/input.h> #include <linux/gpio_keys.h> #include <linux/dmi.h> diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c index 4fcdb91318a0..73a3f49b4eb6 100644 --- a/arch/x86/platform/geode/geos.c +++ b/arch/x86/platform/geode/geos.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * System Specific setup for Traverse Technologies GEOS. * At the moment this means setup of GPIO control of LEDs. @@ -9,10 +10,6 @@ * TODO: There are large similarities with leds-net5501.c * by Alessandro Zummo <a.zummo@towertech.it> * In the future leds-net5501.c should be migrated over to platform - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. */ #include <linux/kernel.h> @@ -21,7 +18,6 @@ #include <linux/string.h> #include <linux/leds.h> #include <linux/platform_device.h> -#include <linux/gpio.h> #include <linux/input.h> #include <linux/gpio_keys.h> #include <linux/dmi.h> diff --git a/arch/x86/platform/geode/net5501.c b/arch/x86/platform/geode/net5501.c index a2f6b982a729..163e1b545517 100644 --- a/arch/x86/platform/geode/net5501.c +++ b/arch/x86/platform/geode/net5501.c @@ -1,19 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * System Specific setup for Soekris net5501 * At the moment this means setup of GPIO control of LEDs and buttons * on net5501 boards. * - * * Copyright (C) 2008-2009 Tower Technologies * Written by Alessandro Zummo <a.zummo@towertech.it> * * Copyright (C) 2008 Constantin Baranov <const@mimas.ru> * Copyright (C) 2011 Ed Wildgoose <kernel@wildgooses.com> * and Philip Prindeville <philipp@redfish-solutions.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. */ #include <linux/kernel.h> @@ -22,7 +18,6 @@ #include <linux/string.h> #include <linux/leds.h> #include <linux/platform_device.h> -#include <linux/gpio.h> #include <linux/input.h> #include <linux/gpio_keys.h> diff --git a/arch/x86/purgatory/entry64.S b/arch/x86/purgatory/entry64.S index d1a4291d3568..275a646d1048 100644 --- a/arch/x86/purgatory/entry64.S +++ b/arch/x86/purgatory/entry64.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com) * Copyright (C) 2014 Red Hat Inc. @@ -5,9 +6,6 @@ * Author(s): Vivek Goyal <vgoyal@redhat.com> * * This code has been taken from kexec-tools. - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ .text diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index 025c34ac0d84..6d8d5a34c377 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * purgatory: Runs between two kernels * @@ -5,9 +6,6 @@ * * Author: * Vivek Goyal <vgoyal@redhat.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #include <linux/bug.h> diff --git a/arch/x86/purgatory/setup-x86_64.S b/arch/x86/purgatory/setup-x86_64.S index dfae9b9e60b5..321146be741d 100644 --- a/arch/x86/purgatory/setup-x86_64.S +++ b/arch/x86/purgatory/setup-x86_64.S @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * purgatory: setup code * @@ -5,9 +6,6 @@ * Copyright (C) 2014 Red Hat Inc. * * This code has been taken from kexec-tools. - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #include <asm/purgatory.h> diff --git a/arch/x86/purgatory/stack.S b/arch/x86/purgatory/stack.S index 50a4147f91fb..8b1427422dfc 100644 --- a/arch/x86/purgatory/stack.S +++ b/arch/x86/purgatory/stack.S @@ -1,10 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * purgatory: stack * * Copyright (C) 2014 Red Hat Inc. - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ /* A stack for the loaded kernel. diff --git a/arch/x86/purgatory/string.c b/arch/x86/purgatory/string.c index 795ca4f2cb3c..01ad43873ad9 100644 --- a/arch/x86/purgatory/string.c +++ b/arch/x86/purgatory/string.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Simple string functions. * @@ -5,9 +6,6 @@ * * Author: * Vivek Goyal <vgoyal@redhat.com> - * - * This source code is licensed under the GNU General Public License, - * Version 2. See the file COPYING for more details. */ #include <linux/types.h> diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig index a9c3db125222..9ad6842de4b4 100644 --- a/arch/x86/ras/Kconfig +++ b/arch/x86/ras/Kconfig @@ -11,3 +11,13 @@ config RAS_CEC Bear in mind that this is absolutely useless if your platform doesn't have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS. + +config RAS_CEC_DEBUG + bool "CEC debugging machinery" + default n + depends on RAS_CEC + help + Add extra files to (debugfs)/ras/cec to test the correctable error + collector feature. "pfn" is a writable file that allows user to + simulate an error in a particular page frame. "array" is a read-only + file that dumps out the current state of all pages logged so far. diff --git a/arch/x86/tools/insn_decoder_test.c b/arch/x86/tools/insn_decoder_test.c index e455349e0ab5..34eda63c124b 100644 --- a/arch/x86/tools/insn_decoder_test.c +++ b/arch/x86/tools/insn_decoder_test.c @@ -111,7 +111,7 @@ static void parse_args(int argc, char **argv) int main(int argc, char **argv) { char line[BUFSIZE], sym[BUFSIZE] = "<unknown>"; - unsigned char insn_buf[16]; + unsigned char insn_buff[16]; struct insn insn; int insns = 0; int warnings = 0; @@ -130,7 +130,7 @@ int main(int argc, char **argv) } insns++; - memset(insn_buf, 0, 16); + memset(insn_buff, 0, 16); strcpy(copy, line); tab1 = strchr(copy, '\t'); if (!tab1) @@ -143,13 +143,13 @@ int main(int argc, char **argv) *tab2 = '\0'; /* Characters beyond tab2 aren't examined */ while (s < tab2) { if (sscanf(s, "%x", &b) == 1) { - insn_buf[nb++] = (unsigned char) b; + insn_buff[nb++] = (unsigned char) b; s += 3; } else break; } /* Decode an instruction */ - insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64); + insn_init(&insn, insn_buff, sizeof(insn_buff), x86_64); insn_get_length(&insn); if (insn.length != nb) { warnings++; diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c index 14cf07916081..185ceba9d289 100644 --- a/arch/x86/tools/insn_sanity.c +++ b/arch/x86/tools/insn_sanity.c @@ -83,7 +83,7 @@ static void dump_insn(FILE *fp, struct insn *insn) } static void dump_stream(FILE *fp, const char *msg, unsigned long nr_iter, - unsigned char *insn_buf, struct insn *insn) + unsigned char *insn_buff, struct insn *insn) { int i; @@ -96,7 +96,7 @@ static void dump_stream(FILE *fp, const char *msg, unsigned long nr_iter, /* Input a decoded instruction sequence directly */ fprintf(fp, " $ echo "); for (i = 0; i < MAX_INSN_SIZE; i++) - fprintf(fp, " %02x", insn_buf[i]); + fprintf(fp, " %02x", insn_buff[i]); fprintf(fp, " | %s -i -\n", prog); if (!input_file) { @@ -124,7 +124,7 @@ fail: } /* Read given instruction sequence from the input file */ -static int read_next_insn(unsigned char *insn_buf) +static int read_next_insn(unsigned char *insn_buff) { char buf[256] = "", *tmp; int i; @@ -134,7 +134,7 @@ static int read_next_insn(unsigned char *insn_buf) return 0; for (i = 0; i < MAX_INSN_SIZE; i++) { - insn_buf[i] = (unsigned char)strtoul(tmp, &tmp, 16); + insn_buff[i] = (unsigned char)strtoul(tmp, &tmp, 16); if (*tmp != ' ') break; } @@ -142,19 +142,19 @@ static int read_next_insn(unsigned char *insn_buf) return i; } -static int generate_insn(unsigned char *insn_buf) +static int generate_insn(unsigned char *insn_buff) { int i; if (input_file) - return read_next_insn(insn_buf); + return read_next_insn(insn_buff); /* Fills buffer with random binary up to MAX_INSN_SIZE */ for (i = 0; i < MAX_INSN_SIZE - 1; i += 2) - *(unsigned short *)(&insn_buf[i]) = random() & 0xffff; + *(unsigned short *)(&insn_buff[i]) = random() & 0xffff; while (i < MAX_INSN_SIZE) - insn_buf[i++] = random() & 0xff; + insn_buff[i++] = random() & 0xff; return i; } @@ -226,31 +226,31 @@ int main(int argc, char **argv) int insns = 0; int errors = 0; unsigned long i; - unsigned char insn_buf[MAX_INSN_SIZE * 2]; + unsigned char insn_buff[MAX_INSN_SIZE * 2]; parse_args(argc, argv); /* Prepare stop bytes with NOPs */ - memset(insn_buf + MAX_INSN_SIZE, INSN_NOP, MAX_INSN_SIZE); + memset(insn_buff + MAX_INSN_SIZE, INSN_NOP, MAX_INSN_SIZE); for (i = 0; i < iter_end; i++) { - if (generate_insn(insn_buf) <= 0) + if (generate_insn(insn_buff) <= 0) break; if (i < iter_start) /* Skip to given iteration number */ continue; /* Decode an instruction */ - insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64); + insn_init(&insn, insn_buff, sizeof(insn_buff), x86_64); insn_get_length(&insn); if (insn.next_byte <= insn.kaddr || insn.kaddr + MAX_INSN_SIZE < insn.next_byte) { /* Access out-of-range memory */ - dump_stream(stderr, "Error: Found an access violation", i, insn_buf, &insn); + dump_stream(stderr, "Error: Found an access violation", i, insn_buff, &insn); errors++; } else if (verbose && !insn_complete(&insn)) - dump_stream(stdout, "Info: Found an undecodable input", i, insn_buf, &insn); + dump_stream(stdout, "Info: Found an undecodable input", i, insn_buff, &insn); else if (verbose >= 2) dump_insn(stdout, &insn); insns++; diff --git a/arch/x86/um/delay.c b/arch/x86/um/delay.c index a8fb7ca4822b..8d510ceb43fb 100644 --- a/arch/x86/um/delay.c +++ b/arch/x86/um/delay.c @@ -1,10 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 Richard Weinberger <richrd@nod.at> * Mostly copied from arch/x86/lib/delay.c - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/export.h> diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c index 56c44d865f7b..19c5dbd46770 100644 --- a/arch/x86/um/mem_32.c +++ b/arch/x86/um/mem_32.c @@ -1,9 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 Richard Weinberger <richrd@nod.at> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/mm.h> diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 8b4a71efe7ee..7c11c9e5d7ea 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -471,7 +471,7 @@ long sys_sigreturn(void) return PT_REGS_SYSCALL_RET(¤t->thread.regs); segfault: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } @@ -577,6 +577,6 @@ long sys_rt_sigreturn(void) return PT_REGS_SYSCALL_RET(¤t->thread.regs); segfault: - force_sig(SIGSEGV, current); + force_sig(SIGSEGV); return 0; } diff --git a/arch/x86/um/vdso/um_vdso.c b/arch/x86/um/vdso/um_vdso.c index 7c441b59d375..ac9c02b9d92c 100644 --- a/arch/x86/um/vdso/um_vdso.c +++ b/arch/x86/um/vdso/um_vdso.c @@ -1,10 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 Richard Weinberger <richrd@nod.at> * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * * This vDSO turns all calls into a syscall so that UML can trap them. */ diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index 6be22f991b59..9e7c4aba6c3a 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -1,9 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 Richard Weinberger <richrd@nod.at> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include <linux/slab.h> diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index e07abefd3d26..ba5a41828e9d 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -7,6 +7,7 @@ config XEN bool "Xen guest support" depends on PARAVIRT select PARAVIRT_CLOCK + select X86_HV_CALLBACK_VECTOR depends on X86_64 || (X86_32 && X86_PAE) depends on X86_LOCAL_APIC && X86_TSC help diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 590fcf863006..77d81c1a63e9 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -251,6 +251,7 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus) for_each_possible_cpu(i) { zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); } set_cpu_sibling_map(0); |