From d4e0340919fb9190a57e879fb3125c4acce0d9b2 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Mon, 22 Jun 2020 18:18:02 -0700 Subject: arm64/module: Optimize module load time by optimizing PLT counting When loading a module, module_frob_arch_sections() tries to figure out the number of PLTs that'll be needed to handle all the RELAs. While doing this, it tries to dedupe PLT allocations for multiple R_AARCH64_CALL26 relocations to the same symbol. It does the same for R_AARCH64_JUMP26 relocations. To make checks for duplicates easier/faster, it sorts the relocation list by type, symbol and addend. That way, to check for a duplicate relocation, it just needs to compare with the previous entry. However, sorting the entire relocation array is unnecessary and expensive (O(n log n)) because there are a lot of other relocation types that don't need deduping or can't be deduped. So this commit partitions the array into entries that need deduping and those that don't. And then sorts just the part that needs deduping. And when CONFIG_RANDOMIZE_BASE is disabled, the sorting is skipped entirely because PLTs are not allocated for R_AARCH64_CALL26 and R_AARCH64_JUMP26 if it's disabled. This gives significant reduction in module load time for modules with large number of relocations with no measurable impact on modules with a small number of relocations. In my test setup with CONFIG_RANDOMIZE_BASE enabled, these were the results for a few downstream modules: Module Size (MB) wlan 14 video codec 3.8 drm 1.8 IPA 2.5 audio 1.2 gpu 1.8 Without this patch: Module Number of entries sorted Module load time (ms) wlan 243739 283 video codec 74029 138 drm 53837 67 IPA 42800 90 audio 21326 27 gpu 20967 32 Total time to load all these module: 637 ms With this patch: Module Number of entries sorted Module load time (ms) wlan 22454 61 video codec 10150 47 drm 13014 40 IPA 8097 63 audio 4606 16 gpu 6527 20 Total time to load all these modules: 247 Time saved during boot for just these 6 modules: 390 ms Signed-off-by: Saravana Kannan Acked-by: Will Deacon Cc: Ard Biesheuvel Link: https://lore.kernel.org/r/20200623011803.91232-1-saravanak@google.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/module-plts.c | 46 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c index 65b08a74aec6..0ce3a28e3347 100644 --- a/arch/arm64/kernel/module-plts.c +++ b/arch/arm64/kernel/module-plts.c @@ -253,6 +253,40 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num, return ret; } +static bool branch_rela_needs_plt(Elf64_Sym *syms, Elf64_Rela *rela, + Elf64_Word dstidx) +{ + + Elf64_Sym *s = syms + ELF64_R_SYM(rela->r_info); + + if (s->st_shndx == dstidx) + return false; + + return ELF64_R_TYPE(rela->r_info) == R_AARCH64_JUMP26 || + ELF64_R_TYPE(rela->r_info) == R_AARCH64_CALL26; +} + +/* Group branch PLT relas at the front end of the array. 
*/ +static int partition_branch_plt_relas(Elf64_Sym *syms, Elf64_Rela *rela, + int numrels, Elf64_Word dstidx) +{ + int i = 0, j = numrels - 1; + + if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + return 0; + + while (i < j) { + if (branch_rela_needs_plt(syms, &rela[i], dstidx)) + i++; + else if (branch_rela_needs_plt(syms, &rela[j], dstidx)) + swap(rela[i], rela[j]); + else + j--; + } + + return i; +} + int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod) { @@ -290,7 +324,7 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, for (i = 0; i < ehdr->e_shnum; i++) { Elf64_Rela *rels = (void *)ehdr + sechdrs[i].sh_offset; - int numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela); + int nents, numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela); Elf64_Shdr *dstsec = sechdrs + sechdrs[i].sh_info; if (sechdrs[i].sh_type != SHT_RELA) @@ -300,8 +334,14 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, if (!(dstsec->sh_flags & SHF_EXECINSTR)) continue; - /* sort by type, symbol index and addend */ - sort(rels, numrels, sizeof(Elf64_Rela), cmp_rela, NULL); + /* + * sort branch relocations requiring a PLT by type, symbol index + * and addend + */ + nents = partition_branch_plt_relas(syms, rels, numrels, + sechdrs[i].sh_info); + if (nents) + sort(rels, nents, sizeof(Elf64_Rela), cmp_rela, NULL); if (!str_has_prefix(secstrings + dstsec->sh_name, ".init")) core_plts += count_plts(syms, rels, numrels, -- cgit v1.2.3 From 638d503130098e234b002942b33a4d886ef6f270 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 29 Jun 2020 10:08:31 +0530 Subject: arm64/panic: Unify all three existing notifier blocks Currently there are three different registered panic notifier blocks. This unifies all of them into a single one i.e arm64_panic_block, hence reducing code duplication and required calling sequence during panic. This preserves the existing dump sequence. While here, just use device_initcall() directly instead of __initcall() which has been a legacy alias for the earlier. This replacement is a pure cleanup with no functional implications. 
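Distilled from the hunks below, the unified shape is roughly the following (a sketch only; the full change also removes the two old notifier blocks from cpufeature.c and mm/init.c):

static int arm64_panic_block_dump(struct notifier_block *self,
				  unsigned long v, void *p)
{
	/* preserve the existing dump order: offset, CPU features, mem limit */
	dump_kernel_offset();
	dump_cpu_features();
	dump_mem_limit();
	return 0;
}

static struct notifier_block arm64_panic_block = {
	.notifier_call = arm64_panic_block_dump
};

static int __init register_arm64_panic_block(void)
{
	atomic_notifier_chain_register(&panic_notifier_list,
				       &arm64_panic_block);
	return 0;
}
device_initcall(register_arm64_panic_block);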
Signed-off-by: Anshuman Khandual Acked-by: Mark Rutland Cc: Will Deacon Cc: Steve Capper Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1593405511-7625-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpufeature.h | 1 + arch/arm64/include/asm/memory.h | 1 + arch/arm64/kernel/cpufeature.c | 15 +-------------- arch/arm64/kernel/setup.c | 24 ++++++++++++++---------- arch/arm64/mm/init.c | 18 +----------------- 5 files changed, 18 insertions(+), 41 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 5d1f4ae42799..e375529ca9fc 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -774,6 +774,7 @@ static inline unsigned int get_vmid_bits(u64 mmfr1) } u32 get_kvm_ipa_limit(void); +void dump_cpu_features(void); #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index a1871bb32bb1..2a88cb734d06 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -322,6 +322,7 @@ static inline void *phys_to_virt(phys_addr_t x) __is_lm_address(__addr) && pfn_valid(virt_to_pfn(__addr)); \ }) +void dump_mem_limit(void); #endif /* !ASSEMBLY */ /* diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9f63053a63a9..9b79df930396 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -119,24 +119,11 @@ static inline void finalize_system_capabilities(void) static_branch_enable(&arm64_const_caps_ready); } -static int dump_cpu_hwcaps(struct notifier_block *self, unsigned long v, void *p) +void dump_cpu_features(void) { /* file-wide pr_fmt adds "CPU features: " prefix */ pr_emerg("0x%*pb\n", ARM64_NCAPS, &cpu_hwcaps); - return 0; -} - -static struct notifier_block cpu_hwcaps_notifier = { - .notifier_call = dump_cpu_hwcaps -}; - -static int __init register_cpu_hwcaps_dumper(void) -{ - atomic_notifier_chain_register(&panic_notifier_list, - &cpu_hwcaps_notifier); - return 0; } -__initcall(register_cpu_hwcaps_dumper); DEFINE_STATIC_KEY_ARRAY_FALSE(cpu_hwcap_keys, ARM64_NCAPS); EXPORT_SYMBOL(cpu_hwcap_keys); diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 93b3844cf442..c793276ec7ad 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -400,11 +400,7 @@ static int __init topology_init(void) } subsys_initcall(topology_init); -/* - * Dump out kernel offset information on panic. 
- */ -static int dump_kernel_offset(struct notifier_block *self, unsigned long v, - void *p) +static void dump_kernel_offset(void) { const unsigned long offset = kaslr_offset(); @@ -415,17 +411,25 @@ static int dump_kernel_offset(struct notifier_block *self, unsigned long v, } else { pr_emerg("Kernel Offset: disabled\n"); } +} + +static int arm64_panic_block_dump(struct notifier_block *self, + unsigned long v, void *p) +{ + dump_kernel_offset(); + dump_cpu_features(); + dump_mem_limit(); return 0; } -static struct notifier_block kernel_offset_notifier = { - .notifier_call = dump_kernel_offset +static struct notifier_block arm64_panic_block = { + .notifier_call = arm64_panic_block_dump }; -static int __init register_kernel_offset_dumper(void) +static int __init register_arm64_panic_block(void) { atomic_notifier_chain_register(&panic_notifier_list, - &kernel_offset_notifier); + &arm64_panic_block); return 0; } -__initcall(register_kernel_offset_dumper); +device_initcall(register_arm64_panic_block); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 1e93cfc7c47a..6c3eb424c613 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -563,27 +563,11 @@ void free_initmem(void) unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin)); } -/* - * Dump out memory limit information on panic. - */ -static int dump_mem_limit(struct notifier_block *self, unsigned long v, void *p) +void dump_mem_limit(void) { if (memory_limit != PHYS_ADDR_MAX) { pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20); } else { pr_emerg("Memory Limit: none\n"); } - return 0; -} - -static struct notifier_block mem_limit_notifier = { - .notifier_call = dump_mem_limit, -}; - -static int __init register_mem_limit_dumper(void) -{ - atomic_notifier_chain_register(&panic_notifier_list, - &mem_limit_notifier); - return 0; } -__initcall(register_mem_limit_dumper); -- cgit v1.2.3 From 1d50e5d0c5052446cb85a3bf11fe8ba4e8d770ca Mon Sep 17 00:00:00 2001 From: Bhupesh Sharma Date: Thu, 14 May 2020 00:22:36 +0530 Subject: crash_core, vmcoreinfo: Append 'MAX_PHYSMEM_BITS' to vmcoreinfo Right now user-space tools like 'makedumpfile' and 'crash' need to rely on a best-guess method of determining value of 'MAX_PHYSMEM_BITS' supported by underlying kernel. This value is used in user-space code to calculate the bit-space required to store a section for SPARESMEM (similar to the existing calculation method used in the kernel implementation): #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) Now, regressions have been reported in user-space utilities like 'makedumpfile' and 'crash' on arm64, with the recently added kernel support for 52-bit physical address space, as there is no clear method of determining this value in user-space (other than reading kernel CONFIG flags). As per suggestion from makedumpfile maintainer (Kazu), it makes more sense to append 'MAX_PHYSMEM_BITS' to vmcoreinfo in the core code itself rather than in arch-specific code, so that the user-space code for other archs can also benefit from this addition to the vmcoreinfo and use it as a standard way of determining 'SECTIONS_SHIFT' value in user-land. A reference 'makedumpfile' implementation which reads the 'MAX_PHYSMEM_BITS' value from vmcoreinfo in a arch-independent fashion is available here: While at it also update vmcoreinfo documentation for 'MAX_PHYSMEM_BITS' variable being added to vmcoreinfo. 'MAX_PHYSMEM_BITS' defines the maximum supported physical address space memory. 
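For illustration only (the helper name here is made up, not the actual makedumpfile code), the user-space side can then do something like:

	/* read NUMBER(MAX_PHYSMEM_BITS) exported via the vmcoreinfo note ... */
	unsigned long max_physmem_bits = vmcoreinfo_read_number("MAX_PHYSMEM_BITS");

	/*
	 * ... and derive SECTIONS_SHIFT the same way the kernel does,
	 * instead of guessing MAX_PHYSMEM_BITS from kernel version/config.
	 */
	unsigned long sections_shift = max_physmem_bits - SECTION_SIZE_BITS;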
Signed-off-by: Bhupesh Sharma Tested-by: John Donnelly Acked-by: Dave Young Cc: Boris Petkov Cc: Ingo Molnar Cc: Thomas Gleixner Cc: James Morse Cc: Mark Rutland Cc: Will Deacon Cc: Michael Ellerman Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Dave Anderson Cc: Kazuhito Hagio Cc: x86@kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: kexec@lists.infradead.org Link: https://lore.kernel.org/r/1589395957-24628-2-git-send-email-bhsharma@redhat.com Signed-off-by: Catalin Marinas --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 5 +++++ kernel/crash_core.c | 1 + 2 files changed, 6 insertions(+) diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index e4ee8b2db604..2a632020f809 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -93,6 +93,11 @@ It exists in the sparse memory mapping model, and it is also somewhat similar to the mem_map variable, both of them are used to translate an address. +MAX_PHYSMEM_BITS +---------------- + +Defines the maximum supported physical address space memory. + page ---- diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 9f1557b98468..18175687133a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -413,6 +413,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); + VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); #endif VMCOREINFO_STRUCT_SIZE(page); VMCOREINFO_STRUCT_SIZE(pglist_data); -- cgit v1.2.3 From bbdbc11804ff0b4130e7550113b452e96a74d16e Mon Sep 17 00:00:00 2001 From: Bhupesh Sharma Date: Thu, 14 May 2020 00:22:37 +0530 Subject: arm64/crash_core: Export TCR_EL1.T1SZ in vmcoreinfo TCR_EL1.TxSZ, which controls the VA space size, is configured by a single kernel image to support either 48-bit or 52-bit VA space. If the ARMv8.2-LVA optional feature is present and we are running with a 64KB page size, then it is possible to use 52-bits of address space for both userspace and kernel addresses. However, any kernel binary that supports 52-bit must also be able to fall back to 48-bit at early boot time if the hardware feature is not present. Since TCR_EL1.T1SZ indicates the size of the memory region addressed by TTBR1_EL1, export the same in vmcoreinfo. User-space utilities like makedumpfile and crash-utility need to read this value from vmcoreinfo for determining if a virtual address lies in the linear map range. While at it also add documentation for TCR_EL1.T1SZ variable being added to vmcoreinfo. It indicates the size offset of the memory region addressed by TTBR1_EL1. 
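As a rough sketch (illustrative variable names, not actual crash/makedumpfile code), a dump tool that has read NUMBER(TCR_EL1_T1SZ) from vmcoreinfo can recover the TTBR1_EL1 address range like this:

	/* T1SZ is a size offset: the TTBR1_EL1 region is 2^(64 - T1SZ) bytes */
	unsigned long va_bits_actual = 64 - tcr_el1_t1sz;
	unsigned long long ttbr1_range = 1ULL << va_bits_actual;

which is what the tool needs in order to decide whether a given virtual address falls in the linear map range.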
Signed-off-by: Bhupesh Sharma Tested-by: John Donnelly Tested-by: Kamlakant Patel Tested-by: Amit Daniel Kachhap Reviewed-by: James Morse Reviewed-by: Amit Daniel Kachhap Cc: James Morse Cc: Mark Rutland Cc: Will Deacon Cc: Steve Capper Cc: Ard Biesheuvel Cc: Dave Anderson Cc: Kazuhito Hagio Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: kexec@lists.infradead.org Link: https://lore.kernel.org/r/1589395957-24628-3-git-send-email-bhsharma@redhat.com [catalin.marinas@arm.com: removed vabits_actual from the commit log] Signed-off-by: Catalin Marinas --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 11 +++++++++++ arch/arm64/include/asm/pgtable-hwdef.h | 1 + arch/arm64/kernel/crash_core.c | 10 ++++++++++ 3 files changed, 22 insertions(+) diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index 2a632020f809..2baad0bfb09d 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -404,6 +404,17 @@ KERNELPACMASK The mask to extract the Pointer Authentication Code from a kernel virtual address. +TCR_EL1.T1SZ +------------ + +Indicates the size offset of the memory region addressed by TTBR1_EL1. +The region size is 2^(64-T1SZ) bytes. + +TTBR1_EL1 is the table base address register specified by ARMv8-A +architecture which is used to lookup the page-tables for the Virtual +addresses in the higher VA range (refer to ARMv8 ARM document for +more details). + arm === diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 9c91a8f93a0e..9a757d724974 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -216,6 +216,7 @@ #define TCR_TxSZ(x) (TCR_T0SZ(x) | TCR_T1SZ(x)) #define TCR_TxSZ_WIDTH 6 #define TCR_T0SZ_MASK (((UL(1) << TCR_TxSZ_WIDTH) - 1) << TCR_T0SZ_OFFSET) +#define TCR_T1SZ_MASK (((UL(1) << TCR_TxSZ_WIDTH) - 1) << TCR_T1SZ_OFFSET) #define TCR_EPD0_SHIFT 7 #define TCR_EPD0_MASK (UL(1) << TCR_EPD0_SHIFT) diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c index 1f646b07e3e9..314391a156ee 100644 --- a/arch/arm64/kernel/crash_core.c +++ b/arch/arm64/kernel/crash_core.c @@ -7,6 +7,14 @@ #include #include #include +#include + +static inline u64 get_tcr_el1_t1sz(void); + +static inline u64 get_tcr_el1_t1sz(void) +{ + return (read_sysreg(tcr_el1) & TCR_T1SZ_MASK) >> TCR_T1SZ_OFFSET; +} void arch_crash_save_vmcoreinfo(void) { @@ -16,6 +24,8 @@ void arch_crash_save_vmcoreinfo(void) kimage_voffset); vmcoreinfo_append_str("NUMBER(PHYS_OFFSET)=0x%llx\n", PHYS_OFFSET); + vmcoreinfo_append_str("NUMBER(TCR_EL1_T1SZ)=0x%llx\n", + get_tcr_el1_t1sz()); vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); vmcoreinfo_append_str("NUMBER(KERNELPACMASK)=0x%llx\n", system_supports_address_auth() ? -- cgit v1.2.3 From dd72078466ecd525f4d489e7b0093cd9b5044c8e Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 25 Jun 2020 14:15:07 +0100 Subject: arm64: Document sysctls for emulated deprecated instructions We have support for emulating a number of deprecated instructions in the kernel with individual Kconfig options enabling this support per instruction. In addition to the Kconfig options we also provide runtime control via sysctls but this is not currently mentioned in the Kconfig so not very discoverable for users. 
This is particularly important for SWP/SWPB since this is disabled by default at runtime and must be enabled via the sysctl, causing considerable frustration for users who have enabled the config option and are then confused to find that the instruction is still faulting. Add a reference to the sysctls in the help text for each of the config options, noting that SWP/SWPB is disabled by default, to improve the user experience. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20200625131507.32334-1-broonie@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 66dc41fd49f2..6c560caf9503 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1327,6 +1327,8 @@ config SWP_EMULATION ARMv8 obsoletes the use of A32 SWP/SWPB instructions such that they are always undefined. Say Y here to enable software emulation of these instructions for userspace using LDXR/STXR. + This feature can be controlled at runtime with the abi.swp + sysctl which is disabled by default. In some older versions of glibc [<=2.8] SWP is used during futex trylock() operations with the assumption that the code will not @@ -1353,7 +1355,8 @@ config CP15_BARRIER_EMULATION Say Y here to enable software emulation of these instructions for AArch32 userspace code. When this option is enabled, CP15 barrier usage is traced which can help - identify software that needs updating. + identify software that needs updating. This feature can be + controlled at runtime with the abi.cp15_barrier sysctl. If unsure, say Y @@ -1364,7 +1367,8 @@ config SETEND_EMULATION AArch32 EL0, and is deprecated in ARMv8. Say Y here to enable software emulation of the instruction - for AArch32 userspace code. + for AArch32 userspace code. This feature can be controlled + at runtime with the abi.setend sysctl. Note: All the cpus on the system must have mixed endian support at EL0 for this feature to be enabled. If a new CPU - which doesn't support mixed -- cgit v1.2.3 From bc67f10ad1d76a30e01c539c0043417fa34648d7 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 3 Jul 2020 09:21:34 +0530 Subject: arm64/cpufeature: Add remaining feature bits in ID_AA64MMFR0 register Enable EVC, FGT, EXS features bits in ID_AA64MMFR0 register as per ARM DDI 0487F.a specification. 
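For readers unfamiliar with the cpufeature table format, here is an annotated reading of one of the new entries added below; the comments paraphrase the flag semantics and are not authoritative:

	ARM64_FTR_BITS(FTR_HIDDEN,		/* not exposed to userspace ID register emulation */
		       FTR_STRICT,		/* a mismatch between CPUs is treated as unsafe */
		       FTR_LOWER_SAFE,		/* the lower of two differing values is the safe one */
		       ID_AA64MMFR0_ECV_SHIFT,	/* bit position of the field */
		       4, 0),			/* 4-bit field, safe default value 0 */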
Suggested-by: Will Deacon Signed-off-by: Anshuman Khandual Reviewed-by: Suzuki K Poulose Cc: Will Deacon Cc: Mark Rutland Cc: Suzuki K Poulose Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1593748297-1965-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sysreg.h | 3 +++ arch/arm64/kernel/cpufeature.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 463175f80341..2e36dfde2570 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -706,6 +706,9 @@ #define ID_AA64ZFR0_SVEVER_SVE2 0x1 /* id_aa64mmfr0 */ +#define ID_AA64MMFR0_ECV_SHIFT 60 +#define ID_AA64MMFR0_FGT_SHIFT 56 +#define ID_AA64MMFR0_EXS_SHIFT 44 #define ID_AA64MMFR0_TGRAN4_2_SHIFT 40 #define ID_AA64MMFR0_TGRAN64_2_SHIFT 36 #define ID_AA64MMFR0_TGRAN16_2_SHIFT 32 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9f63053a63a9..7a84f5f31527 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -269,6 +269,9 @@ static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_ECV_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_FGT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EXS_SHIFT, 4, 0), /* * Page size not being supported at Stage-2 is not fatal. You * just give up KVM if PAGE_SIZE isn't supported there. Go fix -- cgit v1.2.3 From 853772ba8023c25b1caae56b6426ca76dae1eaff Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 3 Jul 2020 09:21:35 +0530 Subject: arm64/cpufeature: Add remaining feature bits in ID_AA64MMFR1 register Enable ETS, TWED, XNX and SPECSEI features bits in ID_AA64MMFR1 register as per ARM DDI 0487F.a specification. 
Suggested-by: Will Deacon Signed-off-by: Anshuman Khandual Reviewed-by: Suzuki K Poulose Cc: Will Deacon Cc: Mark Rutland Cc: Suzuki K Poulose Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1593748297-1965-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sysreg.h | 4 ++++ arch/arm64/kernel/cpufeature.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 2e36dfde2570..889fa7729719 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -737,6 +737,10 @@ #endif /* id_aa64mmfr1 */ +#define ID_AA64MMFR1_ETS_SHIFT 36 +#define ID_AA64MMFR1_TWED_SHIFT 32 +#define ID_AA64MMFR1_XNX_SHIFT 28 +#define ID_AA64MMFR1_SPECSEI_SHIFT 24 #define ID_AA64MMFR1_PAN_SHIFT 20 #define ID_AA64MMFR1_LOR_SHIFT 16 #define ID_AA64MMFR1_HPD_SHIFT 12 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 7a84f5f31527..764793c4a188 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -315,6 +315,10 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { }; static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_ETS_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_TWED_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_XNX_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64MMFR1_SPECSEI_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_PAN_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_LOR_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_HPD_SHIFT, 4, 0), -- cgit v1.2.3 From 356fdfbe8761da55c4100bd543259f349fc1ca3a Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 3 Jul 2020 09:21:36 +0530 Subject: arm64/cpufeature: Add remaining feature bits in ID_AA64MMFR2 register Enable EVT, BBM, TTL, IDS, ST, NV and CCIDX features bits in ID_AA64MMFR2 register as per ARM DDI 0487F.a specification. 
Suggested-by: Will Deacon Signed-off-by: Anshuman Khandual Reviewed-by: Suzuki K Poulose Cc: Will Deacon Cc: Mark Rutland Cc: Suzuki K Poulose Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1593748297-1965-4-git-send-email-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sysreg.h | 7 +++++++ arch/arm64/kernel/cpufeature.c | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 889fa7729719..9ee324936ea2 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -753,8 +753,15 @@ /* id_aa64mmfr2 */ #define ID_AA64MMFR2_E0PD_SHIFT 60 +#define ID_AA64MMFR2_EVT_SHIFT 56 +#define ID_AA64MMFR2_BBM_SHIFT 52 +#define ID_AA64MMFR2_TTL_SHIFT 48 #define ID_AA64MMFR2_FWB_SHIFT 40 +#define ID_AA64MMFR2_IDS_SHIFT 36 #define ID_AA64MMFR2_AT_SHIFT 32 +#define ID_AA64MMFR2_ST_SHIFT 28 +#define ID_AA64MMFR2_NV_SHIFT 24 +#define ID_AA64MMFR2_CCIDX_SHIFT 20 #define ID_AA64MMFR2_LVA_SHIFT 16 #define ID_AA64MMFR2_IESB_SHIFT 12 #define ID_AA64MMFR2_LSM_SHIFT 8 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 764793c4a188..93797d9bb931 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -330,8 +330,15 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_E0PD_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_BBM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_TTL_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_FWB_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_IDS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_AT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_ST_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_NV_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_CCIDX_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LVA_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_IESB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LSM_SHIFT, 4, 0), -- cgit v1.2.3 From 8d3154afc10dd474265b62752cd169f66f40ae0d Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 3 Jul 2020 09:21:37 +0530 Subject: arm64/cpufeature: Replace all open bits shift encodings with macros There are many open bits shift encodings for various CPU ID registers that are scattered across cpufeature. This replaces them with register specific sensible macro definitions. This should not have any functional change. 
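The pattern of the change, shown on a single entry taken from the hunks below:

	/* before: bare shift, meaning only recoverable from the trailing comment */
	S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0xf),	/* InnerShr */

	/* after: the shift gets a named definition in sysreg.h ... */
	#define ID_MMFR0_INNERSHR_SHIFT		28

	/* ... and the table entry becomes self-describing */
	S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_INNERSHR_SHIFT, 4, 0xf),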
Signed-off-by: Anshuman Khandual Reviewed-by: Suzuki K Poulose Cc: Will Deacon Cc: Marc Zyngier Cc: Mark Rutland Cc: James Morse Cc: Suzuki K Poulose Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1593748297-1965-5-git-send-email-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sysreg.h | 28 ++++++++++++++++++++++ arch/arm64/kernel/cpufeature.c | 53 +++++++++++++++++++++-------------------- 2 files changed, 55 insertions(+), 26 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 9ee324936ea2..b74c727c3bcd 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -769,6 +769,7 @@ #define ID_AA64MMFR2_CNP_SHIFT 0 /* id_aa64dfr0 */ +#define ID_AA64DFR0_DOUBLELOCK_SHIFT 36 #define ID_AA64DFR0_PMSVER_SHIFT 32 #define ID_AA64DFR0_CTX_CMPS_SHIFT 28 #define ID_AA64DFR0_WRPS_SHIFT 20 @@ -821,18 +822,40 @@ #define ID_ISAR6_DP_SHIFT 4 #define ID_ISAR6_JSCVT_SHIFT 0 +#define ID_MMFR0_INNERSHR_SHIFT 28 +#define ID_MMFR0_FCSE_SHIFT 24 +#define ID_MMFR0_AUXREG_SHIFT 20 +#define ID_MMFR0_TCM_SHIFT 16 +#define ID_MMFR0_SHARELVL_SHIFT 12 +#define ID_MMFR0_OUTERSHR_SHIFT 8 +#define ID_MMFR0_PMSA_SHIFT 4 +#define ID_MMFR0_VMSA_SHIFT 0 + #define ID_MMFR4_EVT_SHIFT 28 #define ID_MMFR4_CCIDX_SHIFT 24 #define ID_MMFR4_LSM_SHIFT 20 #define ID_MMFR4_HPDS_SHIFT 16 #define ID_MMFR4_CNP_SHIFT 12 #define ID_MMFR4_XNX_SHIFT 8 +#define ID_MMFR4_AC2_SHIFT 4 #define ID_MMFR4_SPECSEI_SHIFT 0 #define ID_MMFR5_ETS_SHIFT 0 #define ID_PFR0_DIT_SHIFT 24 #define ID_PFR0_CSV2_SHIFT 16 +#define ID_PFR0_STATE3_SHIFT 12 +#define ID_PFR0_STATE2_SHIFT 8 +#define ID_PFR0_STATE1_SHIFT 4 +#define ID_PFR0_STATE0_SHIFT 0 + +#define ID_DFR0_PERFMON_SHIFT 24 +#define ID_DFR0_MPROFDBG_SHIFT 20 +#define ID_DFR0_MMAPTRC_SHIFT 16 +#define ID_DFR0_COPTRC_SHIFT 12 +#define ID_DFR0_MMAPDBG_SHIFT 8 +#define ID_DFR0_COPSDBG_SHIFT 4 +#define ID_DFR0_COPDBG_SHIFT 0 #define ID_PFR2_SSBS_SHIFT 4 #define ID_PFR2_CSV3_SHIFT 0 @@ -875,6 +898,11 @@ #define ID_AA64MMFR0_TGRAN_SUPPORTED ID_AA64MMFR0_TGRAN64_SUPPORTED #endif +#define MVFR2_FPMISC_SHIFT 4 +#define MVFR2_SIMDMISC_SHIFT 0 + +#define DCZID_DZP_SHIFT 4 +#define DCZID_BS_SHIFT 0 /* * The ZCR_ELx_LEN_* definitions intentionally include bits [8:4] which diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 93797d9bb931..19146bd338b4 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -359,7 +359,7 @@ static const struct arm64_ftr_bits ftr_ctr[] = { * make use of *minLine. * If we have differing I-cache policies, report it as the weakest - VIPT. 
*/ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_EXACT, 14, 2, ICACHE_POLICY_VIPT), /* L1Ip */ + ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_EXACT, CTR_L1IP_SHIFT, 2, ICACHE_POLICY_VIPT), /* L1Ip */ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_IMINLINE_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -370,19 +370,19 @@ struct arm64_ftr_reg arm64_ftr_reg_ctrel0 = { }; static const struct arm64_ftr_bits ftr_id_mmfr0[] = { - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0xf), /* InnerShr */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 24, 4, 0), /* FCSE */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, 20, 4, 0), /* AuxReg */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 0), /* TCM */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0), /* ShareLvl */ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0xf), /* OuterShr */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* PMSA */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* VMSA */ + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_INNERSHR_SHIFT, 4, 0xf), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_FCSE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_MMFR0_AUXREG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_TCM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_SHARELVL_SHIFT, 4, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_OUTERSHR_SHIFT, 4, 0xf), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_PMSA_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_VMSA_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = { - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 36, 4, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_DOUBLELOCK_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64DFR0_PMSVER_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_CTX_CMPS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_WRPS_SHIFT, 4, 0), @@ -398,14 +398,14 @@ static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = { }; static const struct arm64_ftr_bits ftr_mvfr2[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* FPMisc */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* SIMDMisc */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR2_FPMISC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR2_SIMDMISC_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_dczid[] = { - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 4, 1, 1), /* DZP */ - ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* BS */ + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, DCZID_DZP_SHIFT, 1, 1), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, DCZID_BS_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -437,7 +437,8 @@ static const struct arm64_ftr_bits ftr_id_mmfr4[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_HPDS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_CNP_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_XNX_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* ac2 */ 
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_AC2_SHIFT, 4, 0), + /* * SpecSEI = 1 indicates that the PE might generate an SError on an * external abort on speculative read. It is safe to assume that an @@ -479,10 +480,10 @@ static const struct arm64_ftr_bits ftr_id_isar6[] = { static const struct arm64_ftr_bits ftr_id_pfr0[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_DIT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR0_CSV2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0), /* State3 */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0), /* State2 */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), /* State1 */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), /* State0 */ + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_STATE3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_STATE2_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_STATE1_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_STATE0_SHIFT, 4, 0), ARM64_FTR_END, }; @@ -506,13 +507,13 @@ static const struct arm64_ftr_bits ftr_id_pfr2[] = { static const struct arm64_ftr_bits ftr_id_dfr0[] = { /* [31:28] TraceFilt */ - S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 24, 4, 0xf), /* PerfMon */ - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0), + S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_PERFMON_SHIFT, 4, 0xf), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_MPROFDBG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_MMAPTRC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_COPTRC_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_MMAPDBG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_COPSDBG_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_COPDBG_SHIFT, 4, 0), ARM64_FTR_END, }; -- cgit v1.2.3 From 2a379716f3d76aebc5574155de247b547a0214cc Mon Sep 17 00:00:00 2001 From: Bhupesh Sharma Date: Tue, 7 Apr 2020 04:01:40 +0530 Subject: arm64/defconfig: Enable CONFIG_KEXEC_FILE kexec_file_load() syscall interface is now supported for arm64 architecture as well via commits: 3751e728cef2 ("arm64: kexec_file: add crash dump support") and 3ddd9992a590 ("arm64: enable KEXEC_FILE config")]. This patch enables config KEXEC_FILE by default in the arm64 defconfig, so that user-space tools like kexec-tools can use the same as the default interface for kexec/kdump on arm64. 
Cc: AKASHI Takahiro Cc: James Morse Cc: Mark Rutland Cc: Will Deacon Cc: kexec@lists.infradead.org Signed-off-by: Bhupesh Sharma Acked-by: Mark Rutland Link: https://lore.kernel.org/r/1586212300-30797-1-git-send-email-bhsharma@redhat.com Signed-off-by: Catalin Marinas --- arch/arm64/configs/defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 883e8bace3ed..1a33697a8492 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -66,6 +66,7 @@ CONFIG_SCHED_SMT=y CONFIG_NUMA=y CONFIG_SECCOMP=y CONFIG_KEXEC=y +CONFIG_KEXEC_FILE=y CONFIG_CRASH_DUMP=y CONFIG_XEN=y CONFIG_COMPAT=y -- cgit v1.2.3 From a1634a542f74309f843742fa849208bb26e279e4 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 30 Jun 2020 16:24:28 +1000 Subject: arm64/mm: Redefine CONT_{PTE, PMD}_SHIFT Currently, the value of CONT_{PTE, PMD}_SHIFT is off from standard {PAGE, PMD}_SHIFT. In turn, we have to consider adding {PAGE, PMD}_SHIFT when using CONT_{PTE, PMD}_SHIFT in the function hugetlbpage_init(). It's a bit confusing. This redefines CONT_{PTE, PMD}_SHIFT with {PAGE, PMD}_SHIFT included so that the later values needn't be added when using the former ones in function hugetlbpage_init(). Note that the values of CONT_{PTES, PMDS} are unchanged. Suggested-by: Will Deacon Signed-off-by: Gavin Shan Reviewed-by: Anshuman Khandual Link: https://lkml.org/lkml/2020/5/6/190 Link: https://lore.kernel.org/r/20200630062428.194235-1-gshan@redhat.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-hwdef.h | 16 ++++++++-------- arch/arm64/mm/hugetlbpage.c | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 9c91a8f93a0e..ce3d14abb360 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -82,20 +82,20 @@ * Contiguous page definitions. 
*/ #ifdef CONFIG_ARM64_64K_PAGES -#define CONT_PTE_SHIFT 5 -#define CONT_PMD_SHIFT 5 +#define CONT_PTE_SHIFT (5 + PAGE_SHIFT) +#define CONT_PMD_SHIFT (5 + PMD_SHIFT) #elif defined(CONFIG_ARM64_16K_PAGES) -#define CONT_PTE_SHIFT 7 -#define CONT_PMD_SHIFT 5 +#define CONT_PTE_SHIFT (7 + PAGE_SHIFT) +#define CONT_PMD_SHIFT (5 + PMD_SHIFT) #else -#define CONT_PTE_SHIFT 4 -#define CONT_PMD_SHIFT 4 +#define CONT_PTE_SHIFT (4 + PAGE_SHIFT) +#define CONT_PMD_SHIFT (4 + PMD_SHIFT) #endif -#define CONT_PTES (1 << CONT_PTE_SHIFT) +#define CONT_PTES (1 << (CONT_PTE_SHIFT - PAGE_SHIFT)) #define CONT_PTE_SIZE (CONT_PTES * PAGE_SIZE) #define CONT_PTE_MASK (~(CONT_PTE_SIZE - 1)) -#define CONT_PMDS (1 << CONT_PMD_SHIFT) +#define CONT_PMDS (1 << (CONT_PMD_SHIFT - PMD_SHIFT)) #define CONT_PMD_SIZE (CONT_PMDS * PMD_SIZE) #define CONT_PMD_MASK (~(CONT_PMD_SIZE - 1)) /* the the numerical offset of the PTE within a range of CONT_PTES */ diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 0a52ce46f020..c79084739096 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -457,9 +457,9 @@ static int __init hugetlbpage_init(void) #ifdef CONFIG_ARM64_4K_PAGES hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); #endif - hugetlb_add_hstate((CONT_PMD_SHIFT + PMD_SHIFT) - PAGE_SHIFT); + hugetlb_add_hstate(CONT_PMD_SHIFT - PAGE_SHIFT); hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - hugetlb_add_hstate((CONT_PTE_SHIFT + PAGE_SHIFT) - PAGE_SHIFT); + hugetlb_add_hstate(CONT_PTE_SHIFT - PAGE_SHIFT); return 0; } -- cgit v1.2.3 From 552ae76face5584085845646c5f57e10c1a4ebdc Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 22 Dec 2018 12:00:10 +0000 Subject: arm64: Detect the ARMv8.4 TTL feature In order to reduce the cost of TLB invalidation, the ARMv8.4 TTL feature allows TLBs to be issued with a level allowing for quicker invalidation. Let's detect the feature for now. Further patches will implement its actual usage. 
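Once the capability is registered, later code can gate TTL usage on it. A minimal sketch of the intended consumer (the shape is taken from the __tlbi_level() helper added later in this series):

	/* only emit a hinted invalidation when the capability is present */
	if (cpus_have_const_cap(ARM64_HAS_ARMv8_4_TTL) && level) {
		u64 ttl = level & 3;
		/*
		 * ... encode the translation granule and OR the TTL hint
		 * into bits [47:44] of the TLBI argument ...
		 */
	}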
Reviewed-by : Suzuki K Polose Reviewed-by: Catalin Marinas Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/cpucaps.h | 3 ++- arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/kernel/cpufeature.c | 11 +++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index d7b3bb0cb180..d44ba903d11d 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -62,7 +62,8 @@ #define ARM64_HAS_GENERIC_AUTH 52 #define ARM64_HAS_32BIT_EL1 53 #define ARM64_BTI 54 +#define ARM64_HAS_ARMv8_4_TTL 55 -#define ARM64_NCAPS 55 +#define ARM64_NCAPS 56 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 463175f80341..8c209aa17273 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -746,6 +746,7 @@ /* id_aa64mmfr2 */ #define ID_AA64MMFR2_E0PD_SHIFT 60 +#define ID_AA64MMFR2_TTL_SHIFT 48 #define ID_AA64MMFR2_FWB_SHIFT 40 #define ID_AA64MMFR2_AT_SHIFT 32 #define ID_AA64MMFR2_LVA_SHIFT 16 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9f63053a63a9..e877f56ff1ab 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -323,6 +323,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = { static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = { ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_E0PD_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_TTL_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_FWB_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_AT_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LVA_SHIFT, 4, 0), @@ -1882,6 +1883,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, .cpu_enable = cpu_has_fwb, }, + { + .desc = "ARMv8.4 Translation Table Level", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_ARMv8_4_TTL, + .sys_reg = SYS_ID_AA64MMFR2_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64MMFR2_TTL_SHIFT, + .min_field_value = 1, + .matches = has_cpuid_feature, + }, #ifdef CONFIG_ARM64_HW_AFDBM { /* -- cgit v1.2.3 From 6fcfdf6d72898d1c5118d7dd3d3d38690e2f6a64 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 28 Dec 2018 09:11:50 +0000 Subject: arm64: Document SW reserved PTE/PMD bits in Stage-2 descriptors Advertise bits [58:55] as reserved for SW in the S2 descriptors. 
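A sketch of how software might use the field; the shift and the two helpers here are hypothetical, only the PTE_S2_SW_RESVD mask (added below) is real:

#define PTE_S2_SW_SHIFT		55	/* hypothetical, matches the [58:55] field */

static inline pteval_t pte_s2_sw_bits(pteval_t pte)
{
	/* extract the 4 software-defined bits */
	return (pte & PTE_S2_SW_RESVD) >> PTE_S2_SW_SHIFT;
}

static inline pteval_t pte_s2_set_sw_bits(pteval_t pte, pteval_t val)
{
	/* replace the 4 software-defined bits with 'val' */
	return (pte & ~PTE_S2_SW_RESVD) |
	       ((val << PTE_S2_SW_SHIFT) & PTE_S2_SW_RESVD);
}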
Reviewed-by: Andrew Scull Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/pgtable-hwdef.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 9c91a8f93a0e..de0b603955f4 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -178,10 +178,12 @@ #define PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[2:1] */ #define PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ #define PTE_S2_XN (_AT(pteval_t, 2) << 53) /* XN[1:0] */ +#define PTE_S2_SW_RESVD (_AT(pteval_t, 15) << 55) /* Reserved for SW */ #define PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[2:1] */ #define PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ #define PMD_S2_XN (_AT(pmdval_t, 2) << 53) /* XN[1:0] */ +#define PMD_S2_SW_RESVD (_AT(pmdval_t, 15) << 55) /* Reserved for SW */ #define PUD_S2_RDONLY (_AT(pudval_t, 1) << 6) /* HAP[2:1] */ #define PUD_S2_RDWR (_AT(pudval_t, 3) << 6) /* HAP[2:1] */ -- cgit v1.2.3 From c10bc62ae4d2135c9db40e96a8e994164faee531 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 2 Jan 2019 10:21:29 +0000 Subject: arm64: Add level-hinted TLB invalidation helper Add a level-hinted TLB invalidation helper that only gets used if ARMv8.4-TTL gets detected. Reviewed-by: Alexandru Elisei Reviewed-by: Catalin Marinas Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/stage2_pgtable.h | 9 +++++++ arch/arm64/include/asm/tlbflush.h | 45 +++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index b767904f28b1..996bf98f0cab 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -256,4 +256,13 @@ stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) return (boundary - 1 < end - 1) ? boundary : end; } +/* + * Level values for the ARMv8.4-TTL extension, mapping PUD/PMD/PTE and + * the architectural page-table level. + */ +#define S2_NO_LEVEL_HINT 0 +#define S2_PUD_LEVEL 1 +#define S2_PMD_LEVEL 2 +#define S2_PTE_LEVEL 3 + #endif /* __ARM64_S2_PGTABLE_H_ */ diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index bc3949064725..3353f26302de 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -10,6 +10,7 @@ #ifndef __ASSEMBLY__ +#include #include #include #include @@ -59,6 +60,50 @@ __ta; \ }) +/* + * Level-based TLBI operations. + * + * When ARMv8.4-TTL exists, TLBI operations take an additional hint for + * the level at which the invalidation must take place. If the level is + * wrong, no invalidation may take place. In the case where the level + * cannot be easily determined, a 0 value for the level parameter will + * perform a non-hinted invalidation. + * + * For Stage-2 invalidation, use the level values provided to that effect + * in asm/stage2_pgtable.h. 
+ */ +#define TLBI_TTL_MASK GENMASK_ULL(47, 44) +#define TLBI_TTL_TG_4K 1 +#define TLBI_TTL_TG_16K 2 +#define TLBI_TTL_TG_64K 3 + +#define __tlbi_level(op, addr, level) \ + do { \ + u64 arg = addr; \ + \ + if (cpus_have_const_cap(ARM64_HAS_ARMv8_4_TTL) && \ + level) { \ + u64 ttl = level & 3; \ + \ + switch (PAGE_SIZE) { \ + case SZ_4K: \ + ttl |= TLBI_TTL_TG_4K << 2; \ + break; \ + case SZ_16K: \ + ttl |= TLBI_TTL_TG_16K << 2; \ + break; \ + case SZ_64K: \ + ttl |= TLBI_TTL_TG_64K << 2; \ + break; \ + } \ + \ + arg &= ~TLBI_TTL_MASK; \ + arg |= FIELD_PREP(TLBI_TTL_MASK, ttl); \ + } \ + \ + __tlbi(op, arg); \ + } while(0) + /* * TLB Invalidation * ================ -- cgit v1.2.3 From 7af928851508fb25207806f57e287272dd498981 Mon Sep 17 00:00:00 2001 From: Andrew Scull Date: Thu, 18 Jun 2020 15:55:11 +0100 Subject: smccc: Make constants available to assembly Move constants out of the C-only section of the header next to the other constants that are available to assembly. Signed-off-by: Andrew Scull Reviewed-by: Sudeep Holla Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200618145511.69203-1-ascull@google.com Signed-off-by: Catalin Marinas --- include/linux/arm-smccc.h | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h index 56d6a5c6e353..efcbde731f03 100644 --- a/include/linux/arm-smccc.h +++ b/include/linux/arm-smccc.h @@ -81,6 +81,28 @@ ARM_SMCCC_SMC_32, \ 0, 0x7fff) +/* Paravirtualised time calls (defined by ARM DEN0057A) */ +#define ARM_SMCCC_HV_PV_TIME_FEATURES \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_STANDARD_HYP, \ + 0x20) + +#define ARM_SMCCC_HV_PV_TIME_ST \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ + ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_STANDARD_HYP, \ + 0x21) + +/* + * Return codes defined in ARM DEN 0070A + * ARM DEN 0070A is now merged/consolidated into ARM DEN 0028 C + */ +#define SMCCC_RET_SUCCESS 0 +#define SMCCC_RET_NOT_SUPPORTED -1 +#define SMCCC_RET_NOT_REQUIRED -2 +#define SMCCC_RET_INVALID_PARAMETER -3 + #ifndef __ASSEMBLY__ #include @@ -331,15 +353,6 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, */ #define arm_smccc_1_1_hvc(...) __arm_smccc_1_1(SMCCC_HVC_INST, __VA_ARGS__) -/* - * Return codes defined in ARM DEN 0070A - * ARM DEN 0070A is now merged/consolidated into ARM DEN 0028 C - */ -#define SMCCC_RET_SUCCESS 0 -#define SMCCC_RET_NOT_SUPPORTED -1 -#define SMCCC_RET_NOT_REQUIRED -2 -#define SMCCC_RET_INVALID_PARAMETER -3 - /* * Like arm_smccc_1_1* but always returns SMCCC_RET_NOT_SUPPORTED. * Used when the SMCCC conduit is not defined. The empty asm statement @@ -385,18 +398,5 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1, method; \ }) -/* Paravirtualised time calls (defined by ARM DEN0057A) */ -#define ARM_SMCCC_HV_PV_TIME_FEATURES \ - ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ - ARM_SMCCC_SMC_64, \ - ARM_SMCCC_OWNER_STANDARD_HYP, \ - 0x20) - -#define ARM_SMCCC_HV_PV_TIME_ST \ - ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ - ARM_SMCCC_SMC_64, \ - ARM_SMCCC_OWNER_STANDARD_HYP, \ - 0x21) - #endif /*__ASSEMBLY__*/ #endif /*__LINUX_ARM_SMCCC_H*/ -- cgit v1.2.3 From e735b98a5fe08c0f50f9fdc3e3a844e3638e6649 Mon Sep 17 00:00:00 2001 From: Zhenyu Ye Date: Thu, 25 Jun 2020 16:03:11 +0800 Subject: arm64: Add tlbi_user_level TLB invalidation helper Add a level-hinted parameter to __tlbi_user, which only gets used if ARMv8.4-TTL gets detected. 
ARMv8.4-TTL provides the TTL field in tlbi instruction to indicate the level of translation table walk holding the leaf entry for the address that is being invalidated. This patch set the default level value of flush_tlb_range() to 0, which will be updated in future patches. And set the ttl value of flush_tlb_page_nosync() to 3 because it is only called to flush a single pte page. Signed-off-by: Zhenyu Ye Link: https://lore.kernel.org/r/20200625080314.230-4-yezhenyu2@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 3353f26302de..e1d07612e147 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -104,6 +104,11 @@ __tlbi(op, arg); \ } while(0) +#define __tlbi_user_level(op, arg, level) do { \ + if (arm64_kernel_unmapped_at_el0()) \ + __tlbi_level(op, (arg | USER_ASID_FLAG), level); \ +} while (0) + /* * TLB Invalidation * ================ @@ -205,8 +210,9 @@ static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, unsigned long addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm)); dsb(ishst); - __tlbi(vale1is, addr); - __tlbi_user(vale1is, addr); + /* This function is only called on a small page */ + __tlbi_level(vale1is, addr, 3); + __tlbi_user_level(vale1is, addr, 3); } static inline void flush_tlb_page(struct vm_area_struct *vma, @@ -246,11 +252,11 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, dsb(ishst); for (addr = start; addr < end; addr += stride) { if (last_level) { - __tlbi(vale1is, addr); - __tlbi_user(vale1is, addr); + __tlbi_level(vale1is, addr, 0); + __tlbi_user_level(vale1is, addr, 0); } else { - __tlbi(vae1is, addr); - __tlbi_user(vae1is, addr); + __tlbi_level(vae1is, addr, 0); + __tlbi_user_level(vae1is, addr, 0); } } dsb(ish); -- cgit v1.2.3 From 2631ed00b0498810f8d5c2163c6b5270d893687b Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Thu, 25 Jun 2020 16:03:12 +0800 Subject: tlb: mmu_gather: add tlb_flush_*_range APIs tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end, then set corresponding cleared_*. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Zhenyu Ye Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20200625080314.230-5-yezhenyu2@huawei.com Signed-off-by: Catalin Marinas --- include/asm-generic/tlb.h | 55 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 3f1649a8cf55..ef75ec86f865 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -512,6 +512,38 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm } #endif +/* + * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end, + * and set corresponding cleared_*. 
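The conversion for callers is mechanical; for example (taken from the tlb_remove_tlb_entry() hunk below):

	/* before */
	__tlb_adjust_range(tlb, address, PAGE_SIZE);
	tlb->cleared_ptes = 1;

	/* after */
	tlb_flush_pte_range(tlb, address, PAGE_SIZE);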
+ */ +static inline void tlb_flush_pte_range(struct mmu_gather *tlb, + unsigned long address, unsigned long size) +{ + __tlb_adjust_range(tlb, address, size); + tlb->cleared_ptes = 1; +} + +static inline void tlb_flush_pmd_range(struct mmu_gather *tlb, + unsigned long address, unsigned long size) +{ + __tlb_adjust_range(tlb, address, size); + tlb->cleared_pmds = 1; +} + +static inline void tlb_flush_pud_range(struct mmu_gather *tlb, + unsigned long address, unsigned long size) +{ + __tlb_adjust_range(tlb, address, size); + tlb->cleared_puds = 1; +} + +static inline void tlb_flush_p4d_range(struct mmu_gather *tlb, + unsigned long address, unsigned long size) +{ + __tlb_adjust_range(tlb, address, size); + tlb->cleared_p4ds = 1; +} + #ifndef __tlb_remove_tlb_entry #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) #endif @@ -525,19 +557,17 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm */ #define tlb_remove_tlb_entry(tlb, ptep, address) \ do { \ - __tlb_adjust_range(tlb, address, PAGE_SIZE); \ - tlb->cleared_ptes = 1; \ + tlb_flush_pte_range(tlb, address, PAGE_SIZE); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ do { \ unsigned long _sz = huge_page_size(h); \ - __tlb_adjust_range(tlb, address, _sz); \ if (_sz == PMD_SIZE) \ - tlb->cleared_pmds = 1; \ + tlb_flush_pmd_range(tlb, address, _sz); \ else if (_sz == PUD_SIZE) \ - tlb->cleared_puds = 1; \ + tlb_flush_pud_range(tlb, address, _sz); \ __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) @@ -551,8 +581,7 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm #define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ do { \ - __tlb_adjust_range(tlb, address, HPAGE_PMD_SIZE); \ - tlb->cleared_pmds = 1; \ + tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE); \ __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ } while (0) @@ -566,8 +595,7 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm #define tlb_remove_pud_tlb_entry(tlb, pudp, address) \ do { \ - __tlb_adjust_range(tlb, address, HPAGE_PUD_SIZE); \ - tlb->cleared_puds = 1; \ + tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE); \ __tlb_remove_pud_tlb_entry(tlb, pudp, address); \ } while (0) @@ -592,9 +620,8 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm #ifndef pte_free_tlb #define pte_free_tlb(tlb, ptep, address) \ do { \ - __tlb_adjust_range(tlb, address, PAGE_SIZE); \ + tlb_flush_pmd_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ - tlb->cleared_pmds = 1; \ __pte_free_tlb(tlb, ptep, address); \ } while (0) #endif @@ -602,9 +629,8 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm #ifndef pmd_free_tlb #define pmd_free_tlb(tlb, pmdp, address) \ do { \ - __tlb_adjust_range(tlb, address, PAGE_SIZE); \ + tlb_flush_pud_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ - tlb->cleared_puds = 1; \ __pmd_free_tlb(tlb, pmdp, address); \ } while (0) #endif @@ -612,9 +638,8 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm #ifndef pud_free_tlb #define pud_free_tlb(tlb, pudp, address) \ do { \ - __tlb_adjust_range(tlb, address, PAGE_SIZE); \ + tlb_flush_p4d_range(tlb, address, PAGE_SIZE); \ tlb->freed_tables = 1; \ - tlb->cleared_p4ds = 1; \ __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif -- cgit v1.2.3 From c4ab2cbc1d8768eb505708a58c54c277dfe4a93d Mon Sep 17 00:00:00 2001 From: 
Zhenyu Ye Date: Thu, 25 Jun 2020 16:03:13 +0800 Subject: arm64: tlb: Set the TTL field in flush_tlb_range This patch uses the cleared_* in struct mmu_gather to set the TTL field in flush_tlb_range(). Signed-off-by: Zhenyu Ye Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20200625080314.230-6-yezhenyu2@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlb.h | 29 ++++++++++++++++++++++++++++- arch/arm64/include/asm/tlbflush.h | 14 ++++++++------ 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index b76df828e6b7..61c97d3b58c7 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -21,11 +21,37 @@ static void tlb_flush(struct mmu_gather *tlb); #include +/* + * get the tlbi levels in arm64. Default value is 0 if more than one + * of cleared_* is set or neither is set. + * Arm64 doesn't support p4ds now. + */ +static inline int tlb_get_level(struct mmu_gather *tlb) +{ + if (tlb->cleared_ptes && !(tlb->cleared_pmds || + tlb->cleared_puds || + tlb->cleared_p4ds)) + return 3; + + if (tlb->cleared_pmds && !(tlb->cleared_ptes || + tlb->cleared_puds || + tlb->cleared_p4ds)) + return 2; + + if (tlb->cleared_puds && !(tlb->cleared_ptes || + tlb->cleared_pmds || + tlb->cleared_p4ds)) + return 1; + + return 0; +} + static inline void tlb_flush(struct mmu_gather *tlb) { struct vm_area_struct vma = TLB_FLUSH_VMA(tlb->mm, 0); bool last_level = !tlb->freed_tables; unsigned long stride = tlb_get_unmap_size(tlb); + int tlb_level = tlb_get_level(tlb); /* * If we're tearing down the address space then we only care about @@ -38,7 +64,8 @@ static inline void tlb_flush(struct mmu_gather *tlb) return; } - __flush_tlb_range(&vma, tlb->start, tlb->end, stride, last_level); + __flush_tlb_range(&vma, tlb->start, tlb->end, stride, + last_level, tlb_level); } static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index e1d07612e147..3505f6fbfca3 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -230,7 +230,8 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, static inline void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long stride, bool last_level) + unsigned long stride, bool last_level, + int tlb_level) { unsigned long asid = ASID(vma->vm_mm); unsigned long addr; @@ -252,11 +253,11 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, dsb(ishst); for (addr = start; addr < end; addr += stride) { if (last_level) { - __tlbi_level(vale1is, addr, 0); - __tlbi_user_level(vale1is, addr, 0); + __tlbi_level(vale1is, addr, tlb_level); + __tlbi_user_level(vale1is, addr, tlb_level); } else { - __tlbi_level(vae1is, addr, 0); - __tlbi_user_level(vae1is, addr, 0); + __tlbi_level(vae1is, addr, tlb_level); + __tlbi_user_level(vae1is, addr, tlb_level); } } dsb(ish); @@ -268,8 +269,9 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, /* * We cannot use leaf-only invalidation here, since we may be invalidating * table entries as part of collapsing hugepages or moving page tables. + * Set the tlb_level to 0 because we can not get enough information here. 
*/ - __flush_tlb_range(vma, start, end, PAGE_SIZE, false); + __flush_tlb_range(vma, start, end, PAGE_SIZE, false, 0); } static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) -- cgit v1.2.3 From a7ac1cfa4c0510217e74c2ba807ead549f80d82c Mon Sep 17 00:00:00 2001 From: Zhenyu Ye Date: Thu, 25 Jun 2020 16:03:14 +0800 Subject: arm64: tlb: Set the TTL field in flush_*_tlb_range This patch implement flush_{pmd|pud}_tlb_range() in arm64 by calling __flush_tlb_range() with the corresponding stride and tlb_level values. Signed-off-by: Zhenyu Ye Link: https://lore.kernel.org/r/20200625080314.230-7-yezhenyu2@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 758e2d1577d0..d5d3fbe73953 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -40,6 +40,16 @@ extern void __pmd_error(const char *file, int line, unsigned long val); extern void __pud_error(const char *file, int line, unsigned long val); extern void __pgd_error(const char *file, int line, unsigned long val); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE + +/* Set stride and tlb_level in flush_*_tlb_range */ +#define flush_pmd_tlb_range(vma, addr, end) \ + __flush_tlb_range(vma, addr, end, PMD_SIZE, false, 2) +#define flush_pud_tlb_range(vma, addr, end) \ + __flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1) +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. -- cgit v1.2.3 From 34e36d81a0ef76047fa12a0f8e0dce4369b435cf Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 7 Jul 2020 11:26:14 +0100 Subject: arm64: Shift the __tlbi_level() indentation left This is for consistency with the other __tlbi macros in this file. No functional change. 
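For context, the TTL hint that __tlbi_level() packs into the TLBI operand (bits [47:44] of the address argument) is simply the translation granule code in bits [3:2] and the page-table level in bits [1:0]. The following is a minimal userspace-style sketch of that encoding, assuming only the TLBI_TTL_TG_* values quoted in the hunk below; it is an illustration, not the kernel macro itself.

	/*
	 * Illustration only: compose the 4-bit TTL hint from a page-table
	 * level and the page size, mirroring the logic of __tlbi_level().
	 * Returns 0 when no usable hint can be formed.
	 */
	static unsigned long ttl_hint(int level, unsigned long page_size)
	{
		unsigned long tg;

		switch (page_size) {
		case 4096:	tg = 1; break;	/* TLBI_TTL_TG_4K  */
		case 16384:	tg = 2; break;	/* TLBI_TTL_TG_16K */
		case 65536:	tg = 3; break;	/* TLBI_TTL_TG_64K */
		default:	return 0;	/* no hint */
		}

		return (tg << 2) | (level & 3);	/* goes into operand bits [47:44] */
	}

With 4K pages and a last-level (PTE) invalidation, for example, this yields 0b0111, i.e. TG = 1 and level = 3.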
Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 43 +++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 3505f6fbfca3..39aed2efd21b 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -77,32 +77,31 @@ #define TLBI_TTL_TG_16K 2 #define TLBI_TTL_TG_64K 3 -#define __tlbi_level(op, addr, level) \ - do { \ - u64 arg = addr; \ +#define __tlbi_level(op, addr, level) do { \ + u64 arg = addr; \ \ - if (cpus_have_const_cap(ARM64_HAS_ARMv8_4_TTL) && \ - level) { \ - u64 ttl = level & 3; \ + if (cpus_have_const_cap(ARM64_HAS_ARMv8_4_TTL) && \ + level) { \ + u64 ttl = level & 3; \ \ - switch (PAGE_SIZE) { \ - case SZ_4K: \ - ttl |= TLBI_TTL_TG_4K << 2; \ - break; \ - case SZ_16K: \ - ttl |= TLBI_TTL_TG_16K << 2; \ - break; \ - case SZ_64K: \ - ttl |= TLBI_TTL_TG_64K << 2; \ - break; \ - } \ - \ - arg &= ~TLBI_TTL_MASK; \ - arg |= FIELD_PREP(TLBI_TTL_MASK, ttl); \ + switch (PAGE_SIZE) { \ + case SZ_4K: \ + ttl |= TLBI_TTL_TG_4K << 2; \ + break; \ + case SZ_16K: \ + ttl |= TLBI_TTL_TG_16K << 2; \ + break; \ + case SZ_64K: \ + ttl |= TLBI_TTL_TG_64K << 2; \ + break; \ } \ \ - __tlbi(op, arg); \ - } while(0) + arg &= ~TLBI_TTL_MASK; \ + arg |= FIELD_PREP(TLBI_TTL_MASK, ttl); \ + } \ + \ + __tlbi(op, arg); \ +} while(0) #define __tlbi_user_level(op, arg, level) do { \ if (arm64_kernel_unmapped_at_el0()) \ -- cgit v1.2.3 From c6c83d757a13a5df51428a6fe133c9193810507b Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 7 Jul 2020 19:53:13 +0530 Subject: arm64/cpufeature: Validate feature bits spacing in arm64_ftr_regs[] arm64_feature_bits for a register in arm64_ftr_regs[] are in a descending order as per their shift values. Validate that these features bits are defined correctly and do not overlap with each other. This check protects against any inadvertent erroneous changes to the register definitions. Signed-off-by: Anshuman Khandual Reviewed-by: Suzuki K Poulose Cc: Will Deacon Cc: Suzuki K Poulose Cc: Mark Brown Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1594131793-9498-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 47 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 19146bd338b4..d9b51cb9cb8c 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -712,11 +712,52 @@ static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, static void __init sort_ftr_regs(void) { - int i; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(arm64_ftr_regs); i++) { + const struct arm64_ftr_reg *ftr_reg = arm64_ftr_regs[i].reg; + const struct arm64_ftr_bits *ftr_bits = ftr_reg->ftr_bits; + unsigned int j = 0; + + /* + * Features here must be sorted in descending order with respect + * to their shift values and should not overlap with each other. + */ + for (; ftr_bits->width != 0; ftr_bits++, j++) { + unsigned int width = ftr_reg->ftr_bits[j].width; + unsigned int shift = ftr_reg->ftr_bits[j].shift; + unsigned int prev_shift; + + WARN((shift + width) > 64, + "%s has invalid feature at shift %d\n", + ftr_reg->name, shift); + + /* + * Skip the first feature. There is nothing to + * compare against for now. 
+ */ + if (j == 0) + continue; - /* Check that the array is sorted so that we can do the binary search */ - for (i = 1; i < ARRAY_SIZE(arm64_ftr_regs); i++) + prev_shift = ftr_reg->ftr_bits[j - 1].shift; + WARN((shift + width) > prev_shift, + "%s has feature overlap at shift %d\n", + ftr_reg->name, shift); + } + + /* + * Skip the first register. There is nothing to + * compare against for now. + */ + if (i == 0) + continue; + /* + * Registers here must be sorted in ascending order with respect + * to sys_id for subsequent binary search in get_arm64_ftr_reg() + * to work correctly. + */ BUG_ON(arm64_ftr_regs[i].sys_id < arm64_ftr_regs[i - 1].sys_id); + } } /* -- cgit v1.2.3 From 61c11656b67b0a30f702f240aabe81fd93e702ac Mon Sep 17 00:00:00 2001 From: Zhenyu Ye Date: Fri, 10 Jul 2020 17:41:58 +0800 Subject: arm64: tlb: don't set the ttl value in flush_tlb_page_nosync flush_tlb_page_nosync() may be called from pmd level, so we can not set the ttl = 3 here. The callstack is as follows: pmdp_set_access_flags ptep_set_access_flags flush_tlb_fix_spurious_fault flush_tlb_page flush_tlb_page_nosync Fixes: e735b98a5fe0 ("arm64: Add tlbi_user_level TLB invalidation helper") Reported-by: Catalin Marinas Signed-off-by: Zhenyu Ye Link: https://lore.kernel.org/r/20200710094158.468-1-yezhenyu2@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 39aed2efd21b..2cb275efcea3 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -209,9 +209,8 @@ static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, unsigned long addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm)); dsb(ishst); - /* This function is only called on a small page */ - __tlbi_level(vale1is, addr, 3); - __tlbi_user_level(vale1is, addr, 3); + __tlbi(vale1is, addr); + __tlbi_user(vale1is, addr); } static inline void flush_tlb_page(struct vm_area_struct *vma, -- cgit v1.2.3 From f011856ce7b600fdc2d1102d56873b787ff6d1bb Mon Sep 17 00:00:00 2001 From: Jay Chen Date: Mon, 6 Jul 2020 19:22:45 +0800 Subject: perf/smmuv3: To simplify code for ioremap page in pmcg Use the devm_platform_get_and_ioremap_resource to simplify the code a bit. Signed-off-by: Jay Chen Link: https://lore.kernel.org/r/20200706112246.92220-2-jkchen@linux.alibaba.com Signed-off-by: Will Deacon --- drivers/perf/arm_smmuv3_pmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c index 48e28ef93a70..2d09f3e47d12 100644 --- a/drivers/perf/arm_smmuv3_pmu.c +++ b/drivers/perf/arm_smmuv3_pmu.c @@ -755,8 +755,7 @@ static int smmu_pmu_probe(struct platform_device *pdev) .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }; - res_0 = platform_get_resource(pdev, IORESOURCE_MEM, 0); - smmu_pmu->reg_base = devm_ioremap_resource(dev, res_0); + smmu_pmu->reg_base = devm_platform_get_and_ioremap_resource(pdev, 0, &res_0); if (IS_ERR(smmu_pmu->reg_base)) return PTR_ERR(smmu_pmu->reg_base); -- cgit v1.2.3 From 1583052d111f8ea43f9954c5e749164fd2b954af Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 26 Jun 2020 17:58:31 +0200 Subject: arm64/acpi: disallow AML memory opregions to access kernel memory AML uses SystemMemory opregions to allow AML handlers to access MMIO registers of, e.g., GPIO controllers, or access reserved regions of memory that are owned by the firmware. 
Currently, we also allow AML access to memory that is owned by the kernel and mapped via the linear region, which does not seem to be supported by a valid use case, and exposes the kernel's internal state to AML methods that may be buggy and exploitable. On arm64, ACPI support requires booting in EFI mode, and so we can cross reference the requested region against the EFI memory map, rather than just do a minimal check on the first page. So let's only permit regions to be remapped by the ACPI core if - they don't appear in the EFI memory map at all (which is the case for most MMIO), or - they are covered by a single region in the EFI memory map, which is not of a type that describes memory that is given to the kernel at boot. Reported-by: Jason A. Donenfeld Signed-off-by: Ard Biesheuvel Acked-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20200626155832.2323789-2-ardb@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/acpi.h | 15 +--------- arch/arm64/kernel/acpi.c | 66 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index a45366c3909b..bd68e1b7f29f 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -47,20 +47,7 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr); /* ACPI table mapping after acpi_permanent_mmap is set */ -static inline void __iomem *acpi_os_ioremap(acpi_physical_address phys, - acpi_size size) -{ - /* For normal memory we already have a cacheable mapping. */ - if (memblock_is_map_memory(phys)) - return (void __iomem *)__phys_to_virt(phys); - - /* - * We should still honor the memory's attribute here because - * crash dump kernel possibly excludes some ACPI (reclaim) - * regions from memblock list. - */ - return __ioremap(phys, size, __acpi_get_mem_attribute(phys)); -} +void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size); #define acpi_os_ioremap acpi_os_ioremap typedef u64 phys_cpuid_t; diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index a7586a4db142..01b861e225b0 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -261,6 +261,72 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr) return __pgprot(PROT_DEVICE_nGnRnE); } +void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) +{ + efi_memory_desc_t *md, *region = NULL; + pgprot_t prot; + + if (WARN_ON_ONCE(!efi_enabled(EFI_MEMMAP))) + return NULL; + + for_each_efi_memory_desc(md) { + u64 end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); + + if (phys < md->phys_addr || phys >= end) + continue; + + if (phys + size > end) { + pr_warn(FW_BUG "requested region covers multiple EFI memory regions\n"); + return NULL; + } + region = md; + break; + } + + /* + * It is fine for AML to remap regions that are not represented in the + * EFI memory map at all, as it only describes normal memory, and MMIO + * regions that require a virtual mapping to make them accessible to + * the EFI runtime services. 
+ */ + prot = __pgprot(PROT_DEVICE_nGnRnE); + if (region) { + switch (region->type) { + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + case EFI_PERSISTENT_MEMORY: + pr_warn(FW_BUG "requested region covers kernel memory @ %pa\n", &phys); + return NULL; + + case EFI_ACPI_RECLAIM_MEMORY: + /* + * ACPI reclaim memory is used to pass firmware tables + * and other data that is intended for consumption by + * the OS only, which may decide it wants to reclaim + * that memory and use it for something else. We never + * do that, but we usually add it to the linear map + * anyway, in which case we should use the existing + * mapping. + */ + if (memblock_is_map_memory(phys)) + return (void __iomem *)__phys_to_virt(phys); + /* fall through */ + + default: + if (region->attribute & EFI_MEMORY_WB) + prot = PAGE_KERNEL; + else if (region->attribute & EFI_MEMORY_WT) + prot = __pgprot(PROT_NORMAL_WT); + else if (region->attribute & EFI_MEMORY_WC) + prot = __pgprot(PROT_NORMAL_NC); + } + } + return __ioremap(phys, size, prot); +} + /* * Claim Synchronous External Aborts as a firmware first notification. * -- cgit v1.2.3 From 325f5585ec36953a3fe2e000451f690440fe1bf5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 26 Jun 2020 17:58:32 +0200 Subject: arm64/acpi: disallow writeable AML opregion mapping for EFI code regions Given that the contents of EFI runtime code and data regions are provided by the firmware, as well as the DSDT, it is not unimaginable that AML code exists today that accesses EFI runtime code regions using a SystemMemory OpRegion. There is nothing fundamentally wrong with that, but since we take great care to ensure that executable code is never mapped writeable and executable at the same time, we should not permit AML to create writable mapping. Signed-off-by: Ard Biesheuvel Acked-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20200626155832.2323789-3-ardb@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/acpi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index 01b861e225b0..455966401102 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -301,6 +301,15 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) pr_warn(FW_BUG "requested region covers kernel memory @ %pa\n", &phys); return NULL; + case EFI_RUNTIME_SERVICES_CODE: + /* + * This would be unusual, but not problematic per se, + * as long as we take care not to create a writable + * mapping for executable code. + */ + prot = PAGE_KERNEL_RO; + break; + case EFI_ACPI_RECLAIM_MEMORY: /* * ACPI reclaim memory is used to pass firmware tables -- cgit v1.2.3 From 0de674afe83cb23676ec391470251aaa9700f21a Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 10 Jul 2020 19:24:02 +0100 Subject: arm64: stacktrace: Move export for save_stack_trace_tsk() Due to refactoring way back in bb53c820c5b0f1 ("arm64: stacktrace: avoid listing stacktrace functions in stacktrace") the EXPORT_SYMBOL_GPL() for save_stack_trace_tsk() is at the end of __save_stack_trace() rather than the function it exports. Move it to the expected location. 
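The convention being restored is that the EXPORT_SYMBOL*() macro sits immediately after the closing brace of the function it exports. A generic, hypothetical fragment (not the arm64 code itself) to show the expected shape:

	void my_stack_helper(struct task_struct *tsk, struct stack_trace *trace)
	{
		collect_stack(tsk, trace);	/* hypothetical body */
	}
	EXPORT_SYMBOL_GPL(my_stack_helper);	/* export follows its own function */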
Signed-off-by: Mark Brown Acked-by: Will Deacon Link: https://lore.kernel.org/r/20200710182402.50473-1-broonie@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 139679c745bf..2dd8e3b8b94b 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -199,12 +199,12 @@ static noinline void __save_stack_trace(struct task_struct *tsk, put_task_stack(tsk); } -EXPORT_SYMBOL_GPL(save_stack_trace_tsk); void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { __save_stack_trace(tsk, trace, 1); } +EXPORT_SYMBOL_GPL(save_stack_trace_tsk); void save_stack_trace(struct stack_trace *trace) { -- cgit v1.2.3 From abb7962adc80ab4f4313e8a065302525b6a9c2dc Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 1 Jul 2020 10:12:01 +0530 Subject: arm64/hugetlb: Reserve CMA areas for gigantic pages on 16K and 64K configs Currently 'hugetlb_cma=' command line argument does not create CMA area on ARM64_16K_PAGES and ARM64_64K_PAGES based platforms. Instead, it just ends up with the following warning message. Reason being, hugetlb_cma_reserve() never gets called for these huge page sizes. [ 64.255669] hugetlb_cma: the option isn't supported by current arch This enables CMA areas reservation on ARM64_16K_PAGES and ARM64_64K_PAGES configs by defining an unified arm64_hugetlb_cma_reseve() that is wrapped in CONFIG_CMA. Call site for arm64_hugetlb_cma_reserve() is also protected as is conditionally included and hence cannot contain stub for the inverse config i.e !(CONFIG_HUGETLB_PAGE && CONFIG_CMA). Signed-off-by: Anshuman Khandual Cc: Will Deacon Cc: Mark Rutland Cc: Mike Kravetz Cc: Barry Song Cc: Andrew Morton Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/1593578521-24672-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/hugetlb.h | 2 ++ arch/arm64/mm/hugetlbpage.c | 38 ++++++++++++++++++++++++++++++++++++++ arch/arm64/mm/init.c | 4 ++-- 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 94ba0c5bced2..5abf91e3494c 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -49,6 +49,8 @@ extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz); #define set_huge_swap_pte_at set_huge_swap_pte_at +void __init arm64_hugetlb_cma_reserve(void); + #include #endif /* __ASM_HUGETLB_H */ diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index c79084739096..aa421bf4956e 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -19,6 +19,44 @@ #include #include +/* + * HugeTLB Support Matrix + * + * --------------------------------------------------- + * | Page Size | CONT PTE | PMD | CONT PMD | PUD | + * --------------------------------------------------- + * | 4K | 64K | 2M | 32M | 1G | + * | 16K | 2M | 32M | 1G | | + * | 64K | 2M | 512M | 16G | | + * --------------------------------------------------- + */ + +/* + * Reserve CMA areas for the largest supported gigantic + * huge page when requested. Any other smaller gigantic + * huge pages could still be served from those areas. 
+ */ +#ifdef CONFIG_CMA +void __init arm64_hugetlb_cma_reserve(void) +{ + int order; + +#ifdef CONFIG_ARM64_4K_PAGES + order = PUD_SHIFT - PAGE_SHIFT; +#else + order = CONT_PMD_SHIFT + PMD_SHIFT - PAGE_SHIFT; +#endif + /* + * HugeTLB CMA reservation is required for gigantic + * huge pages which could not be allocated via the + * page allocator. Just warn if there is any change + * breaking this assumption. + */ + WARN_ON(order <= MAX_ORDER); + hugetlb_cma_reserve(order); +} +#endif /* CONFIG_CMA */ + #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION bool arch_hugetlb_migration_supported(struct hstate *h) { diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 6c3eb424c613..f8c19c6c8e71 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -425,8 +425,8 @@ void __init bootmem_init(void) * initialize node_online_map that gets used in hugetlb_cma_reserve() * while allocating required CMA size across online nodes. */ -#ifdef CONFIG_ARM64_4K_PAGES - hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT); +#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA) + arm64_hugetlb_cma_reserve(); #endif /* -- cgit v1.2.3 From b620ba54547cd0f98e35c1be102eec2cc25fda5d Mon Sep 17 00:00:00 2001 From: Zhenyu Ye Date: Wed, 15 Jul 2020 15:19:43 +0800 Subject: arm64: tlb: Detect the ARMv8.4 TLBI RANGE feature ARMv8.4-TLBI provides TLBI invalidation instruction that apply to a range of input addresses. This patch detect this feature. Signed-off-by: Zhenyu Ye Link: https://lore.kernel.org/r/20200715071945.897-2-yezhenyu2@huawei.com [catalin.marinas@arm.com: some renaming for consistency] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpucaps.h | 3 ++- arch/arm64/include/asm/sysreg.h | 3 +++ arch/arm64/kernel/cpufeature.c | 10 ++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index d44ba903d11d..07b643a70710 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -63,7 +63,8 @@ #define ARM64_HAS_32BIT_EL1 53 #define ARM64_BTI 54 #define ARM64_HAS_ARMv8_4_TTL 55 +#define ARM64_HAS_TLB_RANGE 56 -#define ARM64_NCAPS 56 +#define ARM64_NCAPS 57 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 8c209aa17273..551f30ace4db 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -617,6 +617,9 @@ #define ID_AA64ISAR0_SHA1_SHIFT 8 #define ID_AA64ISAR0_AES_SHIFT 4 +#define ID_AA64ISAR0_TLB_RANGE_NI 0x0 +#define ID_AA64ISAR0_TLB_RANGE 0x2 + /* id_aa64isar1 */ #define ID_AA64ISAR1_I8MM_SHIFT 52 #define ID_AA64ISAR1_DGH_SHIFT 48 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index e877f56ff1ab..2f5adefef34d 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1893,6 +1893,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .min_field_value = 1, .matches = has_cpuid_feature, }, + { + .desc = "TLB range maintenance instructions", + .capability = ARM64_HAS_TLB_RANGE, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64ISAR0_EL1, + .field_pos = ID_AA64ISAR0_TLB_SHIFT, + .sign = FTR_UNSIGNED, + .min_field_value = ID_AA64ISAR0_TLB_RANGE, + }, #ifdef CONFIG_ARM64_HW_AFDBM { /* -- cgit v1.2.3 From 7c78f67e9bd97478d56157c2ad53823668b5b822 Mon Sep 17 00:00:00 2001 From: Zhenyu Ye Date: Wed, 15 Jul 2020 15:19:44 +0800 Subject: arm64: enable tlbi range instructions TLBI RANGE feature 
instoduces new assembly instructions and only support by binutils >= 2.30. Add necessary Kconfig logic to allow this to be enabled and pass '-march=armv8.4-a' to KBUILD_CFLAGS. Signed-off-by: Zhenyu Ye Link: https://lore.kernel.org/r/20200715071945.897-3-yezhenyu2@huawei.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 14 ++++++++++++++ arch/arm64/Makefile | 7 +++++++ 2 files changed, 21 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 66dc41fd49f2..0f39468dbc60 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1596,6 +1596,20 @@ config ARM64_AMU_EXTN correctly reflect reality. Most commonly, the value read will be 0, indicating that the counter is not enabled. +config AS_HAS_ARMV8_4 + def_bool $(cc-option,-Wa$(comma)-march=armv8.4-a) + +config ARM64_TLB_RANGE + bool "Enable support for tlbi range feature" + default y + depends on AS_HAS_ARMV8_4 + help + ARMv8.4-TLBI provides TLBI invalidation instruction that apply to a + range of input addresses. + + The feature introduces new assembly instructions, and they were + support when binutils >= 2.30. + endmenu menu "ARMv8.5 architectural features" diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index a0d94d063fa8..4e823b97c92e 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -82,11 +82,18 @@ endif # compiler to generate them and consequently to break the single image contract # we pass it only to the assembler. This option is utilized only in case of non # integrated assemblers. +ifneq ($(CONFIG_AS_HAS_ARMV8_4), y) branch-prot-flags-$(CONFIG_AS_HAS_PAC) += -Wa,-march=armv8.3-a endif +endif KBUILD_CFLAGS += $(branch-prot-flags-y) +ifeq ($(CONFIG_AS_HAS_ARMV8_4), y) +# make sure to pass the newest target architecture to -march. +KBUILD_CFLAGS += -Wa,-march=armv8.4-a +endif + ifeq ($(CONFIG_SHADOW_CALL_STACK), y) KBUILD_CFLAGS += -ffixed-x18 endif -- cgit v1.2.3 From d1d3aa98b1d4826a19adfefb69b96142a0cac633 Mon Sep 17 00:00:00 2001 From: Zhenyu Ye Date: Wed, 15 Jul 2020 15:19:45 +0800 Subject: arm64: tlb: Use the TLBI RANGE feature in arm64 Add __TLBI_VADDR_RANGE macro and rewrite __flush_tlb_range(). When cpu supports TLBI feature, the minimum range granularity is decided by 'scale', so we can not flush all pages by one instruction in some cases. For example, when the pages = 0xe81a, let's start 'scale' from maximum, and find right 'num' for each 'scale': 1. scale = 3, we can flush no pages because the minimum range is 2^(5*3 + 1) = 0x10000. 2. scale = 2, the minimum range is 2^(5*2 + 1) = 0x800, we can flush 0xe800 pages this time, the num = 0xe800/0x800 - 1 = 0x1c. Remaining pages is 0x1a; 3. scale = 1, the minimum range is 2^(5*1 + 1) = 0x40, no page can be flushed. 4. scale = 0, we flush the remaining 0x1a pages, the num = 0x1a/0x2 - 1 = 0xd. However, in most scenarios, the pages = 1 when flush_tlb_range() is called. Start from scale = 3 or other proper value (such as scale = ilog2(pages)), will incur extra overhead. So increase 'scale' from 0 to maximum, the flush order is exactly opposite to the example. 
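Below is a standalone sketch of that ascending-scale decomposition in plain C, outside the kernel; the arithmetic follows the NUM/SCALE formula quoted earlier and reuses the example page count 0xe81a. Note that the trailing 0x1a pages encode as num = 0xc at scale 0, since (num + 1) * 2^1 = 0x1a.

	/*
	 * Illustration only: decompose a page count into TLBI RANGE
	 * operations with 'scale' increasing from 0, as the new
	 * __flush_tlb_range() loop does. A single odd page is peeled
	 * off with a non-range (single page) operation first.
	 */
	#include <stdio.h>

	int main(void)
	{
		unsigned long pages = 0xe81a;	/* example from the commit text */
		int scale = 0;

		while (pages > 0) {
			if (pages % 2 == 1) {
				pages--;	/* one non-range TLBI */
				printf("single-page op, %#lx pages left\n", pages);
				continue;
			}

			long num = (long)((pages >> (5 * scale + 1)) & 0x1f) - 1;

			if (num >= 0) {
				unsigned long chunk =
					(unsigned long)(num + 1) << (5 * scale + 1);

				printf("range op: scale=%d num=%ld -> %#lx pages\n",
				       scale, num, chunk);
				pages -= chunk;
			}
			scale++;
		}
		return 0;
	}

Running this prints a scale-0 range covering 0x1a pages followed by a scale-2 range covering 0xe800 pages, i.e. the same two range operations as the worked example above, just issued in the opposite order.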
Signed-off-by: Zhenyu Ye Link: https://lore.kernel.org/r/20200715071945.897-4-yezhenyu2@huawei.com [catalin.marinas@arm.com: removed unnecessary masks in __TLBI_VADDR_RANGE] [catalin.marinas@arm.com: __TLB_RANGE_NUM subtracts 1] [catalin.marinas@arm.com: minor adjustments to the comments] [catalin.marinas@arm.com: introduce system_supports_tlb_range()] Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cpufeature.h | 6 ++ arch/arm64/include/asm/tlbflush.h | 154 +++++++++++++++++++++++++++++------- 2 files changed, 131 insertions(+), 29 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 5d1f4ae42799..cf56daa95a7d 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -692,6 +692,12 @@ static inline bool system_supports_bti(void) return IS_ENABLED(CONFIG_ARM64_BTI) && cpus_have_const_cap(ARM64_BTI); } +static inline bool system_supports_tlb_range(void) +{ + return IS_ENABLED(CONFIG_ARM64_TLB_RANGE) && + cpus_have_const_cap(ARM64_HAS_TLB_RANGE); +} + #define ARM64_BP_HARDEN_UNKNOWN -1 #define ARM64_BP_HARDEN_WA_NEEDED 0 #define ARM64_BP_HARDEN_NOT_REQUIRED 1 diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 2cb275efcea3..d493174415db 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -60,6 +60,31 @@ __ta; \ }) +/* + * Get translation granule of the system, which is decided by + * PAGE_SIZE. Used by TTL. + * - 4KB : 1 + * - 16KB : 2 + * - 64KB : 3 + */ +#define TLBI_TTL_TG_4K 1 +#define TLBI_TTL_TG_16K 2 +#define TLBI_TTL_TG_64K 3 + +static inline unsigned long get_trans_granule(void) +{ + switch (PAGE_SIZE) { + case SZ_4K: + return TLBI_TTL_TG_4K; + case SZ_16K: + return TLBI_TTL_TG_16K; + case SZ_64K: + return TLBI_TTL_TG_64K; + default: + return 0; + } +} + /* * Level-based TLBI operations. * @@ -73,9 +98,6 @@ * in asm/stage2_pgtable.h. */ #define TLBI_TTL_MASK GENMASK_ULL(47, 44) -#define TLBI_TTL_TG_4K 1 -#define TLBI_TTL_TG_16K 2 -#define TLBI_TTL_TG_64K 3 #define __tlbi_level(op, addr, level) do { \ u64 arg = addr; \ @@ -83,19 +105,7 @@ if (cpus_have_const_cap(ARM64_HAS_ARMv8_4_TTL) && \ level) { \ u64 ttl = level & 3; \ - \ - switch (PAGE_SIZE) { \ - case SZ_4K: \ - ttl |= TLBI_TTL_TG_4K << 2; \ - break; \ - case SZ_16K: \ - ttl |= TLBI_TTL_TG_16K << 2; \ - break; \ - case SZ_64K: \ - ttl |= TLBI_TTL_TG_64K << 2; \ - break; \ - } \ - \ + ttl |= get_trans_granule() << 2; \ arg &= ~TLBI_TTL_MASK; \ arg |= FIELD_PREP(TLBI_TTL_MASK, ttl); \ } \ @@ -108,6 +118,44 @@ __tlbi_level(op, (arg | USER_ASID_FLAG), level); \ } while (0) +/* + * This macro creates a properly formatted VA operand for the TLB RANGE. + * The value bit assignments are: + * + * +----------+------+-------+-------+-------+----------------------+ + * | ASID | TG | SCALE | NUM | TTL | BADDR | + * +-----------------+-------+-------+-------+----------------------+ + * |63 48|47 46|45 44|43 39|38 37|36 0| + * + * The address range is determined by below formula: + * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE) + * + */ +#define __TLBI_VADDR_RANGE(addr, asid, scale, num, ttl) \ + ({ \ + unsigned long __ta = (addr) >> PAGE_SHIFT; \ + __ta &= GENMASK_ULL(36, 0); \ + __ta |= (unsigned long)(ttl) << 37; \ + __ta |= (unsigned long)(num) << 39; \ + __ta |= (unsigned long)(scale) << 44; \ + __ta |= get_trans_granule() << 46; \ + __ta |= (unsigned long)(asid) << 48; \ + __ta; \ + }) + +/* These macros are used by the TLBI RANGE feature. 
*/ +#define __TLBI_RANGE_PAGES(num, scale) \ + ((unsigned long)((num) + 1) << (5 * (scale) + 1)) +#define MAX_TLBI_RANGE_PAGES __TLBI_RANGE_PAGES(31, 3) + +/* + * Generate 'num' values from -1 to 30 with -1 rejected by the + * __flush_tlb_range() loop below. + */ +#define TLBI_RANGE_MASK GENMASK_ULL(4, 0) +#define __TLBI_RANGE_NUM(pages, scale) \ + ((((pages) >> (5 * (scale) + 1)) & TLBI_RANGE_MASK) - 1) + /* * TLB Invalidation * ================ @@ -231,32 +279,80 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, unsigned long stride, bool last_level, int tlb_level) { + int num = 0; + int scale = 0; unsigned long asid = ASID(vma->vm_mm); unsigned long addr; + unsigned long pages; start = round_down(start, stride); end = round_up(end, stride); + pages = (end - start) >> PAGE_SHIFT; - if ((end - start) >= (MAX_TLBI_OPS * stride)) { + /* + * When not uses TLB range ops, we can handle up to + * (MAX_TLBI_OPS - 1) pages; + * When uses TLB range ops, we can handle up to + * (MAX_TLBI_RANGE_PAGES - 1) pages. + */ + if ((!system_supports_tlb_range() && + (end - start) >= (MAX_TLBI_OPS * stride)) || + pages >= MAX_TLBI_RANGE_PAGES) { flush_tlb_mm(vma->vm_mm); return; } - /* Convert the stride into units of 4k */ - stride >>= 12; + dsb(ishst); - start = __TLBI_VADDR(start, asid); - end = __TLBI_VADDR(end, asid); + /* + * When the CPU does not support TLB range operations, flush the TLB + * entries one by one at the granularity of 'stride'. If the the TLB + * range ops are supported, then: + * + * 1. If 'pages' is odd, flush the first page through non-range + * operations; + * + * 2. For remaining pages: the minimum range granularity is decided + * by 'scale', so multiple range TLBI operations may be required. + * Start from scale = 0, flush the corresponding number of pages + * ((num+1)*2^(5*scale+1) starting from 'addr'), then increase it + * until no pages left. + * + * Note that certain ranges can be represented by either num = 31 and + * scale or num = 0 and scale + 1. The loop below favours the latter + * since num is limited to 30 by the __TLBI_RANGE_NUM() macro. + */ + while (pages > 0) { + if (!system_supports_tlb_range() || + pages % 2 == 1) { + addr = __TLBI_VADDR(start, asid); + if (last_level) { + __tlbi_level(vale1is, addr, tlb_level); + __tlbi_user_level(vale1is, addr, tlb_level); + } else { + __tlbi_level(vae1is, addr, tlb_level); + __tlbi_user_level(vae1is, addr, tlb_level); + } + start += stride; + pages -= stride >> PAGE_SHIFT; + continue; + } - dsb(ishst); - for (addr = start; addr < end; addr += stride) { - if (last_level) { - __tlbi_level(vale1is, addr, tlb_level); - __tlbi_user_level(vale1is, addr, tlb_level); - } else { - __tlbi_level(vae1is, addr, tlb_level); - __tlbi_user_level(vae1is, addr, tlb_level); + num = __TLBI_RANGE_NUM(pages, scale); + if (num >= 0) { + addr = __TLBI_VADDR_RANGE(start, asid, scale, + num, tlb_level); + if (last_level) { + __tlbi(rvale1is, addr); + __tlbi_user(rvale1is, addr); + } else { + __tlbi(rvae1is, addr); + __tlbi_user(rvae1is, addr); + } + start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; + pages -= __TLBI_RANGE_PAGES(num, scale); } + scale++; } dsb(ish); } -- cgit v1.2.3 From 539707caa1a89ee4efc57b4e4231c20c46575ccc Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Thu, 18 Jun 2020 21:35:44 +0800 Subject: arm64: perf: Correct the event index in sysfs When PMU event ID is equal or greater than 0x4000, it will be reduced by 0x4000 and it is not the raw number in the sysfs. Let's correct it and obtain the raw event ID. 
Before this patch: cat /sys/bus/event_source/devices/armv8_pmuv3_0/events/sample_feed event=0x001 After this patch: cat /sys/bus/event_source/devices/armv8_pmuv3_0/events/sample_feed event=0x4001 Signed-off-by: Shaokun Zhang Cc: Will Deacon Cc: Mark Rutland Cc: Link: https://lore.kernel.org/r/1592487344-30555-3-git-send-email-zhangshaokun@hisilicon.com [will: fixed formatting of 'if' condition] Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 4d7879484cec..581602413a13 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -155,7 +155,7 @@ armv8pmu_events_sysfs_show(struct device *dev, pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr); - return sprintf(page, "event=0x%03llx\n", pmu_attr->id); + return sprintf(page, "event=0x%04llx\n", pmu_attr->id); } #define ARMV8_EVENT_ATTR(name, config) \ @@ -244,10 +244,13 @@ armv8pmu_event_attr_is_visible(struct kobject *kobj, test_bit(pmu_attr->id, cpu_pmu->pmceid_bitmap)) return attr->mode; - pmu_attr->id -= ARMV8_PMUV3_EXT_COMMON_EVENT_BASE; - if (pmu_attr->id < ARMV8_PMUV3_MAX_COMMON_EVENTS && - test_bit(pmu_attr->id, cpu_pmu->pmceid_ext_bitmap)) - return attr->mode; + if (pmu_attr->id >= ARMV8_PMUV3_EXT_COMMON_EVENT_BASE) { + u64 id = pmu_attr->id - ARMV8_PMUV3_EXT_COMMON_EVENT_BASE; + + if (id < ARMV8_PMUV3_MAX_COMMON_EVENTS && + test_bit(id, cpu_pmu->pmceid_ext_bitmap)) + return attr->mode; + } return 0; } -- cgit v1.2.3 From 1b86abc1c645ad5c9c7bf70910cb3ce73939d2d7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jul 2020 13:11:24 +0800 Subject: sched_clock: Expose struct clock_read_data In order to support perf_event_mmap_page::cap_time features, an architecture needs, aside from a userspace readable counter register, to expose the exact clock data so that userspace can convert the counter register into a correct timestamp. Provide struct clock_read_data and two (seqcount) helpers so that architectures (arm64 in specific) can expose the numbers to userspace. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20200716051130.4359-2-leo.yan@linaro.org Signed-off-by: Will Deacon --- include/linux/sched_clock.h | 28 ++++++++++++++++++++++++++++ kernel/time/sched_clock.c | 41 +++++++++++++---------------------------- 2 files changed, 41 insertions(+), 28 deletions(-) diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h index 0bb04a96a6d4..528718e4ed52 100644 --- a/include/linux/sched_clock.h +++ b/include/linux/sched_clock.h @@ -6,6 +6,34 @@ #define LINUX_SCHED_CLOCK #ifdef CONFIG_GENERIC_SCHED_CLOCK +/** + * struct clock_read_data - data required to read from sched_clock() + * + * @epoch_ns: sched_clock() value at last update + * @epoch_cyc: Clock cycle value at last update. + * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit + * clocks. + * @read_sched_clock: Current clock source (or dummy source when suspended). + * @mult: Multipler for scaled math conversion. + * @shift: Shift value for scaled math conversion. + * + * Care must be taken when updating this structure; it is read by + * some very hot code paths. It occupies <=40 bytes and, when combined + * with the seqcount used to synchronize access, comfortably fits into + * a 64 byte cache line. 
+ */ +struct clock_read_data { + u64 epoch_ns; + u64 epoch_cyc; + u64 sched_clock_mask; + u64 (*read_sched_clock)(void); + u32 mult; + u32 shift; +}; + +extern struct clock_read_data *sched_clock_read_begin(unsigned int *seq); +extern int sched_clock_read_retry(unsigned int seq); + extern void generic_sched_clock_init(void); extern void sched_clock_register(u64 (*read)(void), int bits, diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index fa3f800d7d76..0acaadc3156c 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -19,31 +19,6 @@ #include "timekeeping.h" -/** - * struct clock_read_data - data required to read from sched_clock() - * - * @epoch_ns: sched_clock() value at last update - * @epoch_cyc: Clock cycle value at last update. - * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit - * clocks. - * @read_sched_clock: Current clock source (or dummy source when suspended). - * @mult: Multipler for scaled math conversion. - * @shift: Shift value for scaled math conversion. - * - * Care must be taken when updating this structure; it is read by - * some very hot code paths. It occupies <=40 bytes and, when combined - * with the seqcount used to synchronize access, comfortably fits into - * a 64 byte cache line. - */ -struct clock_read_data { - u64 epoch_ns; - u64 epoch_cyc; - u64 sched_clock_mask; - u64 (*read_sched_clock)(void); - u32 mult; - u32 shift; -}; - /** * struct clock_data - all data needed for sched_clock() (including * registration of a new clock source) @@ -93,6 +68,17 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) return (cyc * mult) >> shift; } +struct clock_read_data *sched_clock_read_begin(unsigned int *seq) +{ + *seq = raw_read_seqcount(&cd.seq); + return cd.read_data + (*seq & 1); +} + +int sched_clock_read_retry(unsigned int seq) +{ + return read_seqcount_retry(&cd.seq, seq); +} + unsigned long long notrace sched_clock(void) { u64 cyc, res; @@ -100,13 +86,12 @@ unsigned long long notrace sched_clock(void) struct clock_read_data *rd; do { - seq = raw_read_seqcount(&cd.seq); - rd = cd.read_data + (seq & 1); + rd = sched_clock_read_begin(&seq); cyc = (rd->read_sched_clock() - rd->epoch_cyc) & rd->sched_clock_mask; res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); - } while (read_seqcount_retry(&cd.seq, seq)); + } while (sched_clock_read_retry(seq)); return res; } -- cgit v1.2.3 From aadd6e5caaacd6feca9691ba30536e7de5a7d152 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Thu, 16 Jul 2020 13:11:25 +0800 Subject: time/sched_clock: Use raw_read_seqcount_latch() sched_clock uses seqcount_t latching to switch between two storage places protected by the sequence counter. This allows it to have interruptible, NMI-safe, seqcount_t write side critical sections. Since 7fc26327b756 ("seqlock: Introduce raw_read_seqcount_latch()"), raw_read_seqcount_latch() became the standardized way for seqcount_t latch read paths. Due to the dependent load, it also has one read memory barrier less than the currently used raw_read_seqcount() API. Use raw_read_seqcount_latch() for the seqcount_t latch read path. Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Leo Yan Link: https://lkml.kernel.org/r/20200625085745.GD117543@hirez.programming.kicks-ass.net Link: https://lkml.kernel.org/r/20200715092345.GA231464@debian-buster-darwi.lab.linutronix.de Link: https://lore.kernel.org/r/20200716051130.4359-3-leo.yan@linaro.org References: 1809bfa44e10 ("timers, sched/clock: Avoid deadlock during read from NMI") Signed-off-by: Will Deacon --- kernel/time/sched_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0acaadc3156c..0deaf4b79fb4 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -70,7 +70,7 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) struct clock_read_data *sched_clock_read_begin(unsigned int *seq) { - *seq = raw_read_seqcount(&cd.seq); + *seq = raw_read_seqcount_latch(&cd.seq); return cd.read_data + (*seq & 1); } -- cgit v1.2.3 From 950b74ddefc4a42add8b1ae0170aa309338ffe73 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jul 2020 13:11:26 +0800 Subject: arm64: perf: Implement correct cap_user_time As reported by Leo; the existing implementation is broken when the clock and counter don't intersect at 0. Use the sched_clock's struct clock_read_data information to correctly implement cap_user_time and cap_user_time_zero. Note that the ARM64 counter is architecturally only guaranteed to be 56bit wide (implementations are allowed to be wider) and the existing perf ABI cannot deal with wrap-around. This implementation should also be faster than the old; seeing how we don't need to recompute mult and shift all the time. [leoyan: Use mul_u64_u32_shr() to convert cyc to ns to avoid overflow] Reported-by: Leo Yan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20200716051130.4359-4-leo.yan@linaro.org Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 581602413a13..3bbbc22a5148 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -19,6 +19,7 @@ #include #include #include +#include #include /* ARMv8 Cortex-A53 specific event types. */ @@ -1168,28 +1169,47 @@ device_initcall(armv8_pmu_driver_init) void arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now) { - u32 freq; - u32 shift; + struct clock_read_data *rd; + unsigned int seq; + u64 ns; /* * Internal timekeeping for enabled/running/stopped times * is always computed with the sched_clock. */ - freq = arch_timer_get_rate(); userpg->cap_user_time = 1; + userpg->cap_user_time_zero = 1; + + do { + rd = sched_clock_read_begin(&seq); + + userpg->time_mult = rd->mult; + userpg->time_shift = rd->shift; + userpg->time_zero = rd->epoch_ns; + + /* + * This isn't strictly correct, the ARM64 counter can be + * 'short' and then we get funnies when it wraps. The correct + * thing would be to extend the perf ABI with a cycle and mask + * value, but because wrapping on ARM64 is very rare in + * practise this 'works'. 
+ */ + ns = mul_u64_u32_shr(rd->epoch_cyc, rd->mult, rd->shift); + userpg->time_zero -= ns; + + } while (sched_clock_read_retry(seq)); + + userpg->time_offset = userpg->time_zero - now; - clocks_calc_mult_shift(&userpg->time_mult, &shift, freq, - NSEC_PER_SEC, 0); /* * time_shift is not expected to be greater than 31 due to * the original published conversion algorithm shifting a * 32-bit value (now specifies a 64-bit value) - refer * perf_event_mmap_page documentation in perf_event.h. */ - if (shift == 32) { - shift = 31; + if (userpg->time_shift == 32) { + userpg->time_shift = 31; userpg->time_mult >>= 1; } - userpg->time_shift = (u16)shift; - userpg->time_offset = -now; + } -- cgit v1.2.3 From 279a811eb520594fac3cd3a541e6c7ea50072ac9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jul 2020 13:11:27 +0800 Subject: arm64: perf: Only advertise cap_user_time for arch_timer When sched_clock is running on anything other than arch_timer, don't advertise cap_user_time*. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20200716051130.4359-5-leo.yan@linaro.org Requested-by: Will Deacon Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 3bbbc22a5148..674edc7ba8ca 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -13,6 +13,8 @@ #include #include +#include + #include #include #include @@ -1173,16 +1175,15 @@ void arch_perf_update_userpage(struct perf_event *event, unsigned int seq; u64 ns; - /* - * Internal timekeeping for enabled/running/stopped times - * is always computed with the sched_clock. - */ - userpg->cap_user_time = 1; - userpg->cap_user_time_zero = 1; + userpg->cap_user_time = 0; + userpg->cap_user_time_zero = 0; do { rd = sched_clock_read_begin(&seq); + if (rd->read_sched_clock != arch_timer_read_counter) + return; + userpg->time_mult = rd->mult; userpg->time_shift = rd->shift; userpg->time_zero = rd->epoch_ns; @@ -1212,4 +1213,10 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->time_mult >>= 1; } + /* + * Internal timekeeping for enabled/running/stopped times + * is always computed with the sched_clock. + */ + userpg->cap_user_time = 1; + userpg->cap_user_time_zero = 1; } -- cgit v1.2.3 From 6c0246a4588d418f72acd40a7b7601be403d80a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jul 2020 13:11:28 +0800 Subject: perf: Add perf_event_mmap_page::cap_user_time_short ABI In order to support short clock counters, provide an ABI extension. 
As a whole: u64 time, delta, cyc = read_cycle_counter(); + if (cap_user_time_short) + cyc = time_cycle + ((cyc - time_cycle) & time_mask); delta = mul_u64_u32_shr(cyc, time_mult, time_shift); if (cap_user_time_zero) time = time_zero + delta; delta += time_offset; Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20200716051130.4359-6-leo.yan@linaro.org Signed-off-by: Will Deacon --- include/uapi/linux/perf_event.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 7b2d6fc9e6ed..21a1edd08cbe 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -532,9 +532,10 @@ struct perf_event_mmap_page { cap_bit0_is_deprecated : 1, /* Always 1, signals that bit 0 is zero */ cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ - cap_user_time : 1, /* The time_* fields are used */ + cap_user_time : 1, /* The time_{shift,mult,offset} fields are used */ cap_user_time_zero : 1, /* The time_zero field is used */ - cap_____res : 59; + cap_user_time_short : 1, /* the time_{cycle,mask} fields are used */ + cap_____res : 58; }; }; @@ -593,13 +594,29 @@ struct perf_event_mmap_page { * ((rem * time_mult) >> time_shift); */ __u64 time_zero; + __u32 size; /* Header size up to __reserved[] fields. */ + __u32 __reserved_1; + + /* + * If cap_usr_time_short, the hardware clock is less than 64bit wide + * and we must compute the 'cyc' value, as used by cap_usr_time, as: + * + * cyc = time_cycles + ((cyc - time_cycles) & time_mask) + * + * NOTE: this form is explicitly chosen such that cap_usr_time_short + * is a correction on top of cap_usr_time, and code that doesn't + * know about cap_usr_time_short still works under the assumption + * the counter doesn't wrap. + */ + __u64 time_cycles; + __u64 time_mask; /* * Hole for extension of the self monitor capabilities */ - __u8 __reserved[118*8+4]; /* align to 1k. */ + __u8 __reserved[116*8]; /* align to 1k. */ /* * Control data for the mmap() data buffer. -- cgit v1.2.3 From c8f9eb0d6ebaa768c9f6eb2ee21b01d74230934d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jul 2020 13:11:29 +0800 Subject: arm64: perf: Add cap_user_time_short This completes the ARM64 cap_user_time support. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20200716051130.4359-7-leo.yan@linaro.org Signed-off-by: Will Deacon --- arch/arm64/kernel/perf_event.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 674edc7ba8ca..fdb6029c9021 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -1177,6 +1177,7 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; + userpg->cap_user_time_short = 0; do { rd = sched_clock_read_begin(&seq); @@ -1187,13 +1188,13 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->time_mult = rd->mult; userpg->time_shift = rd->shift; userpg->time_zero = rd->epoch_ns; + userpg->time_cycles = rd->epoch_cyc; + userpg->time_mask = rd->sched_clock_mask; /* - * This isn't strictly correct, the ARM64 counter can be - * 'short' and then we get funnies when it wraps. The correct - * thing would be to extend the perf ABI with a cycle and mask - * value, but because wrapping on ARM64 is very rare in - * practise this 'works'. 
+ * Subtract the cycle base, such that software that + * doesn't know about cap_user_time_short still 'works' + * assuming no wraps. */ ns = mul_u64_u32_shr(rd->epoch_cyc, rd->mult, rd->shift); userpg->time_zero -= ns; @@ -1219,4 +1220,5 @@ void arch_perf_update_userpage(struct perf_event *event, */ userpg->cap_user_time = 1; userpg->cap_user_time_zero = 1; + userpg->cap_user_time_short = 1; } -- cgit v1.2.3 From 5271d915a99c696a2f16ae59cf6a037be35afa22 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 16 Jul 2020 13:11:30 +0800 Subject: tools headers UAPI: Update tools's copy of linux/perf_event.h To get the changes in the commit: "perf: Add perf_event_mmap_page::cap_user_time_short ABI" This update is a prerequisite to add support for short clock counters related ABI extension. Signed-off-by: Leo Yan Link: https://lore.kernel.org/r/20200716051130.4359-8-leo.yan@linaro.org Signed-off-by: Will Deacon --- tools/include/uapi/linux/perf_event.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 7b2d6fc9e6ed..21a1edd08cbe 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -532,9 +532,10 @@ struct perf_event_mmap_page { cap_bit0_is_deprecated : 1, /* Always 1, signals that bit 0 is zero */ cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ - cap_user_time : 1, /* The time_* fields are used */ + cap_user_time : 1, /* The time_{shift,mult,offset} fields are used */ cap_user_time_zero : 1, /* The time_zero field is used */ - cap_____res : 59; + cap_user_time_short : 1, /* the time_{cycle,mask} fields are used */ + cap_____res : 58; }; }; @@ -593,13 +594,29 @@ struct perf_event_mmap_page { * ((rem * time_mult) >> time_shift); */ __u64 time_zero; + __u32 size; /* Header size up to __reserved[] fields. */ + __u32 __reserved_1; + + /* + * If cap_usr_time_short, the hardware clock is less than 64bit wide + * and we must compute the 'cyc' value, as used by cap_usr_time, as: + * + * cyc = time_cycles + ((cyc - time_cycles) & time_mask) + * + * NOTE: this form is explicitly chosen such that cap_usr_time_short + * is a correction on top of cap_usr_time, and code that doesn't + * know about cap_usr_time_short still works under the assumption + * the counter doesn't wrap. + */ + __u64 time_cycles; + __u64 time_mask; /* * Hole for extension of the self monitor capabilities */ - __u8 __reserved[118*8+4]; /* align to 1k. */ + __u8 __reserved[116*8]; /* align to 1k. */ /* * Control data for the mmap() data buffer. -- cgit v1.2.3 From f143c11bb7b924403ea2d5b5c990717772293620 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 21 Nov 2019 12:41:40 +0000 Subject: tools: bpf: Use local copy of headers including uapi/linux/filter.h Pulling header files directly out of the kernel sources for inclusion in userspace programs is highly error prone, not least because it bypasses the kbuild infrastructure entirely and so may end up referencing other header files that have not been generated. 
Subsequent patches will cause compiler.h to pull in the ungenerated asm/rwonce.h file via filter.h, breaking the build for tools/bpf: | $ make -C tools/bpf | make: Entering directory '/linux/tools/bpf' | CC bpf_jit_disasm.o | LINK bpf_jit_disasm | CC bpf_dbg.o | In file included from /linux/include/uapi/linux/filter.h:9, | from /linux/tools/bpf/bpf_dbg.c:41: | /linux/include/linux/compiler.h:247:10: fatal error: asm/rwonce.h: No such file or directory | #include | ^~~~~~~~~~~~~~ | compilation terminated. | make: *** [Makefile:61: bpf_dbg.o] Error 1 | make: Leaving directory '/linux/tools/bpf' Take a copy of the installed version of linux/filter.h (i.e. the one created by the 'headers_install' target) into tools/include/uapi/linux/ and adjust the BPF tool Makefile to reference the local include directories instead of those in the main source tree. Cc: Masahiro Yamada Acked-by: Peter Zijlstra (Intel) Acked-by: Alexei Starovoitov Suggested-by: Daniel Borkmann Reported-by: Xiao Yang Signed-off-by: Will Deacon --- tools/bpf/Makefile | 3 +- tools/include/uapi/linux/filter.h | 90 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 tools/include/uapi/linux/filter.h diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile index 6df1850f8353..8a69258fd8aa 100644 --- a/tools/bpf/Makefile +++ b/tools/bpf/Makefile @@ -9,7 +9,8 @@ MAKE = make INSTALL ?= install CFLAGS += -Wall -O2 -CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/include/uapi -I$(srctree)/include +CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi \ + -I$(srctree)/tools/include # This will work when bpf is built in tools env. where srctree # isn't set and when invoked from selftests build, where srctree diff --git a/tools/include/uapi/linux/filter.h b/tools/include/uapi/linux/filter.h new file mode 100644 index 000000000000..eaef459e7bd4 --- /dev/null +++ b/tools/include/uapi/linux/filter.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Linux Socket Filter Data Structures + */ + +#ifndef __LINUX_FILTER_H__ +#define __LINUX_FILTER_H__ + + +#include +#include + +/* + * Current version of the filter code architecture. + */ +#define BPF_MAJOR_VERSION 1 +#define BPF_MINOR_VERSION 1 + +/* + * Try and keep these values and structures similar to BSD, especially + * the BPF code definitions which need to match so you can share filters + */ + +struct sock_filter { /* Filter block */ + __u16 code; /* Actual filter code */ + __u8 jt; /* Jump true */ + __u8 jf; /* Jump false */ + __u32 k; /* Generic multiuse field */ +}; + +struct sock_fprog { /* Required for SO_ATTACH_FILTER. */ + unsigned short len; /* Number of filter blocks */ + struct sock_filter *filter; +}; + +/* ret - BPF_K and BPF_X also apply */ +#define BPF_RVAL(code) ((code) & 0x18) +#define BPF_A 0x10 + +/* misc */ +#define BPF_MISCOP(code) ((code) & 0xf8) +#define BPF_TAX 0x00 +#define BPF_TXA 0x80 + +/* + * Macros for filter block array initializers. + */ +#ifndef BPF_STMT +#define BPF_STMT(code, k) { (unsigned short)(code), 0, 0, k } +#endif +#ifndef BPF_JUMP +#define BPF_JUMP(code, k, jt, jf) { (unsigned short)(code), jt, jf, k } +#endif + +/* + * Number of scratch memory words for: BPF_ST and BPF_STX + */ +#define BPF_MEMWORDS 16 + +/* RATIONALE. Negative offsets are invalid in BPF. + We use them to reference ancillary data. + Unlike introduction new instructions, it does not break + existing compilers/optimizers. 
+ */ +#define SKF_AD_OFF (-0x1000) +#define SKF_AD_PROTOCOL 0 +#define SKF_AD_PKTTYPE 4 +#define SKF_AD_IFINDEX 8 +#define SKF_AD_NLATTR 12 +#define SKF_AD_NLATTR_NEST 16 +#define SKF_AD_MARK 20 +#define SKF_AD_QUEUE 24 +#define SKF_AD_HATYPE 28 +#define SKF_AD_RXHASH 32 +#define SKF_AD_CPU 36 +#define SKF_AD_ALU_XOR_X 40 +#define SKF_AD_VLAN_TAG 44 +#define SKF_AD_VLAN_TAG_PRESENT 48 +#define SKF_AD_PAY_OFFSET 52 +#define SKF_AD_RANDOM 56 +#define SKF_AD_VLAN_TPID 60 +#define SKF_AD_MAX 64 + +#define SKF_NET_OFF (-0x100000) +#define SKF_LL_OFF (-0x200000) + +#define BPF_NET_OFF SKF_NET_OFF +#define BPF_LL_OFF SKF_LL_OFF + +#endif /* __LINUX_FILTER_H__ */ -- cgit v1.2.3 From e506ea451254ab17e0bf918ca36232fec2a9b10c Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 15 Oct 2019 16:29:32 -0700 Subject: compiler.h: Split {READ,WRITE}_ONCE definitions out into rwonce.h In preparation for allowing architectures to define their own implementation of the READ_ONCE() macro, move the generic {READ,WRITE}_ONCE() definitions out of the unwieldy 'linux/compiler.h' file and into a new 'rwonce.h' header under 'asm-generic'. Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Signed-off-by: Will Deacon --- include/asm-generic/Kbuild | 1 + include/asm-generic/barrier.h | 2 +- include/asm-generic/rwonce.h | 101 ++++++++++++++++++++++++++++++++++++++++++ include/linux/compiler.h | 93 +------------------------------------- 4 files changed, 105 insertions(+), 92 deletions(-) create mode 100644 include/asm-generic/rwonce.h diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 44ec80e70518..74b0612601dd 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -45,6 +45,7 @@ mandatory-y += pci.h mandatory-y += percpu.h mandatory-y += pgalloc.h mandatory-y += preempt.h +mandatory-y += rwonce.h mandatory-y += sections.h mandatory-y += serial.h mandatory-y += shmparam.h diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 2eacaf7d62f6..8116744bb82c 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -13,7 +13,7 @@ #ifndef __ASSEMBLY__ -#include +#include #ifndef nop #define nop() asm volatile ("nop") diff --git a/include/asm-generic/rwonce.h b/include/asm-generic/rwonce.h new file mode 100644 index 000000000000..87584379da43 --- /dev/null +++ b/include/asm-generic/rwonce.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Prevent the compiler from merging or refetching reads or writes. The + * compiler is also forbidden from reordering successive instances of + * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some + * particular ordering. One way to make the compiler aware of ordering is to + * put the two invocations of READ_ONCE or WRITE_ONCE in different C + * statements. + * + * These two macros will also work on aggregate data types like structs or + * unions. + * + * Their two major use cases are: (1) Mediating communication between + * process-level code and irq/NMI handlers, all running on the same CPU, + * and (2) Ensuring that the compiler does not fold, spindle, or otherwise + * mutilate accesses that either do not require ordering or that interact + * with an explicit memory barrier or atomic instruction that provides the + * required ordering. + */ +#ifndef __ASM_GENERIC_RWONCE_H +#define __ASM_GENERIC_RWONCE_H + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +#include + +/* + * Yes, this permits 64-bit accesses on 32-bit architectures. 
These will + * actually be atomic in some cases (namely Armv7 + LPAE), but for others we + * rely on the access being split into 2x32-bit accesses for a 32-bit quantity + * (e.g. a virtual address) and a strong prevailing wind. + */ +#define compiletime_assert_rwonce_type(t) \ + compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \ + "Unsupported access size for {READ,WRITE}_ONCE().") + +/* + * Use __READ_ONCE() instead of READ_ONCE() if you do not require any + * atomicity or dependency ordering guarantees. Note that this may result + * in tears! + */ +#define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x)) + +#define __READ_ONCE_SCALAR(x) \ +({ \ + __unqual_scalar_typeof(x) __x = __READ_ONCE(x); \ + smp_read_barrier_depends(); \ + (typeof(x))__x; \ +}) + +#define READ_ONCE(x) \ +({ \ + compiletime_assert_rwonce_type(x); \ + __READ_ONCE_SCALAR(x); \ +}) + +#define __WRITE_ONCE(x, val) \ +do { \ + *(volatile typeof(x) *)&(x) = (val); \ +} while (0) + +#define WRITE_ONCE(x, val) \ +do { \ + compiletime_assert_rwonce_type(x); \ + __WRITE_ONCE(x, val); \ +} while (0) + +static __no_sanitize_or_inline +unsigned long __read_once_word_nocheck(const void *addr) +{ + return __READ_ONCE(*(unsigned long *)addr); +} + +/* + * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a + * word from memory atomically but without telling KASAN/KCSAN. This is + * usually used by unwinding code when walking the stack of a running process. + */ +#define READ_ONCE_NOCHECK(x) \ +({ \ + unsigned long __x; \ + compiletime_assert(sizeof(x) == sizeof(__x), \ + "Unsupported access size for READ_ONCE_NOCHECK()."); \ + __x = __read_once_word_nocheck(&(x)); \ + smp_read_barrier_depends(); \ + (typeof(x))__x; \ +}) + +static __no_kasan_or_inline +unsigned long read_word_at_a_time(const void *addr) +{ + kasan_check_read(addr, 1); + return *(unsigned long *)addr; +} + +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_GENERIC_RWONCE_H */ diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 204e76856435..f075a3df4fe2 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -230,28 +230,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, # define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __LINE__) #endif -/* - * Prevent the compiler from merging or refetching reads or writes. The - * compiler is also forbidden from reordering successive instances of - * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some - * particular ordering. One way to make the compiler aware of ordering is to - * put the two invocations of READ_ONCE or WRITE_ONCE in different C - * statements. - * - * These two macros will also work on aggregate data types like structs or - * unions. - * - * Their two major use cases are: (1) Mediating communication between - * process-level code and irq/NMI handlers, all running on the same CPU, - * and (2) Ensuring that the compiler does not fold, spindle, or otherwise - * mutilate accesses that either do not require ordering or that interact - * with an explicit memory barrier or atomic instruction that provides the - * required ordering. - */ -#include -#include -#include - /** * data_race - mark an expression as containing intentional data races * @@ -272,65 +250,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, __v; \ }) -/* - * Use __READ_ONCE() instead of READ_ONCE() if you do not require any - * atomicity or dependency ordering guarantees. 
Note that this may result - * in tears! - */ -#define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x)) - -#define __READ_ONCE_SCALAR(x) \ -({ \ - __unqual_scalar_typeof(x) __x = __READ_ONCE(x); \ - smp_read_barrier_depends(); \ - (typeof(x))__x; \ -}) - -#define READ_ONCE(x) \ -({ \ - compiletime_assert_rwonce_type(x); \ - __READ_ONCE_SCALAR(x); \ -}) - -#define __WRITE_ONCE(x, val) \ -do { \ - *(volatile typeof(x) *)&(x) = (val); \ -} while (0) - -#define WRITE_ONCE(x, val) \ -do { \ - compiletime_assert_rwonce_type(x); \ - __WRITE_ONCE(x, val); \ -} while (0) - -static __no_sanitize_or_inline -unsigned long __read_once_word_nocheck(const void *addr) -{ - return __READ_ONCE(*(unsigned long *)addr); -} - -/* - * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a - * word from memory atomically but without telling KASAN/KCSAN. This is - * usually used by unwinding code when walking the stack of a running process. - */ -#define READ_ONCE_NOCHECK(x) \ -({ \ - unsigned long __x; \ - compiletime_assert(sizeof(x) == sizeof(__x), \ - "Unsupported access size for READ_ONCE_NOCHECK()."); \ - __x = __read_once_word_nocheck(&(x)); \ - smp_read_barrier_depends(); \ - (typeof(x))__x; \ -}) - -static __no_kasan_or_inline -unsigned long read_word_at_a_time(const void *addr) -{ - kasan_check_read(addr, 1); - return *(unsigned long *)addr; -} - #endif /* __KERNEL__ */ /* @@ -395,16 +314,6 @@ static inline void *offset_to_ptr(const int *off) compiletime_assert(__native_word(t), \ "Need native word sized stores/loads for atomicity.") -/* - * Yes, this permits 64-bit accesses on 32-bit architectures. These will - * actually be atomic in some cases (namely Armv7 + LPAE), but for others we - * rely on the access being split into 2x32-bit accesses for a 32-bit quantity - * (e.g. a virtual address) and a strong prevailing wind. - */ -#define compiletime_assert_rwonce_type(t) \ - compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \ - "Unsupported access size for {READ,WRITE}_ONCE().") - /* &a[0] degrades to a pointer: a different type from an array */ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) @@ -414,4 +323,6 @@ static inline void *offset_to_ptr(const int *off) */ #define prevent_tail_call_optimization() mb() +#include + #endif /* __LINUX_COMPILER_H */ -- cgit v1.2.3 From b78b331a3f5c0773171dadd6bbfa2a2242b45604 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 15 Oct 2019 17:30:47 -0700 Subject: asm/rwonce: Allow __READ_ONCE to be overridden by the architecture The meat and potatoes of READ_ONCE() is defined by the __READ_ONCE() macro, which uses a volatile casts in an attempt to avoid tearing of byte, halfword, word and double-word accesses. Allow this to be overridden by the architecture code in the case that things like memory barriers are also required. Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Signed-off-by: Will Deacon --- include/asm-generic/rwonce.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/asm-generic/rwonce.h b/include/asm-generic/rwonce.h index 87584379da43..04586b55a7c2 100644 --- a/include/asm-generic/rwonce.h +++ b/include/asm-generic/rwonce.h @@ -43,7 +43,9 @@ * atomicity or dependency ordering guarantees. Note that this may result * in tears! 
*/ +#ifndef __READ_ONCE #define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x)) +#endif #define __READ_ONCE_SCALAR(x) \ ({ \ -- cgit v1.2.3 From d6462858851549c62d73eaa14b31132b0f32d6b6 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 30 Oct 2019 16:50:10 +0000 Subject: alpha: Override READ_ONCE() with barriered implementation Rather then relying on the core code to use smp_read_barrier_depends() as part of the READ_ONCE() definition, instead override __READ_ONCE() in the Alpha code so that it generates the required mb() and then implement smp_load_acquire() using the new macro to avoid redundant back-to-back barriers from the generic implementation. Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Signed-off-by: Will Deacon --- arch/alpha/include/asm/barrier.h | 59 ++++------------------------------------ arch/alpha/include/asm/rwonce.h | 35 ++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 54 deletions(-) create mode 100644 arch/alpha/include/asm/rwonce.h diff --git a/arch/alpha/include/asm/barrier.h b/arch/alpha/include/asm/barrier.h index 92ec486a4f9e..c56bfffc9918 100644 --- a/arch/alpha/include/asm/barrier.h +++ b/arch/alpha/include/asm/barrier.h @@ -2,64 +2,15 @@ #ifndef __BARRIER_H #define __BARRIER_H -#include - #define mb() __asm__ __volatile__("mb": : :"memory") #define rmb() __asm__ __volatile__("mb": : :"memory") #define wmb() __asm__ __volatile__("wmb": : :"memory") -/** - * read_barrier_depends - Flush all pending reads that subsequents reads - * depend on. - * - * No data-dependent reads from memory-like regions are ever reordered - * over this barrier. All reads preceding this primitive are guaranteed - * to access memory (but not necessarily other CPUs' caches) before any - * reads following this primitive that depend on the data return by - * any of the preceding reads. This primitive is much lighter weight than - * rmb() on most CPUs, and is never heavier weight than is - * rmb(). - * - * These ordering constraints are respected by both the local CPU - * and the compiler. - * - * Ordering is not guaranteed by anything other than these primitives, - * not even by data dependencies. See the documentation for - * memory_barrier() for examples and URLs to more information. - * - * For example, the following code would force ordering (the initial - * value of "a" is zero, "b" is one, and "p" is "&a"): - * - * - * CPU 0 CPU 1 - * - * b = 2; - * memory_barrier(); - * p = &b; q = p; - * read_barrier_depends(); - * d = *q; - * - * - * because the read of "*q" depends on the read of "p" and these - * two reads are separated by a read_barrier_depends(). However, - * the following code, with the same initial values for "a" and "b": - * - * - * CPU 0 CPU 1 - * - * a = 2; - * memory_barrier(); - * b = 3; y = b; - * read_barrier_depends(); - * x = a; - * - * - * does not enforce ordering, since there is no data dependency between - * the read of "a" and the read of "b". Therefore, on some CPUs, such - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() - * in cases like this where there are no data dependencies. 
- */ -#define read_barrier_depends() __asm__ __volatile__("mb": : :"memory") +#define __smp_load_acquire(p) \ +({ \ + compiletime_assert_atomic_type(*p); \ + __READ_ONCE(*p); \ +}) #ifdef CONFIG_SMP #define __ASM_SMP_MB "\tmb\n" diff --git a/arch/alpha/include/asm/rwonce.h b/arch/alpha/include/asm/rwonce.h new file mode 100644 index 000000000000..35542bcf92b3 --- /dev/null +++ b/arch/alpha/include/asm/rwonce.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019 Google LLC. + */ +#ifndef __ASM_RWONCE_H +#define __ASM_RWONCE_H + +#ifdef CONFIG_SMP + +#include + +/* + * Alpha is apparently daft enough to reorder address-dependent loads + * on some CPU implementations. Knock some common sense into it with + * a memory barrier in READ_ONCE(). + * + * For the curious, more information about this unusual reordering is + * available in chapter 15 of the "perfbook": + * + * https://kernel.org/pub/linux/kernel/people/paulmck/perfbook/perfbook.html + * + */ +#define __READ_ONCE(x) \ +({ \ + __unqual_scalar_typeof(x) __x = \ + (*(volatile typeof(__x) *)(&(x))); \ + mb(); \ + (typeof(x))__x; \ +}) + +#endif /* CONFIG_SMP */ + +#include + +#endif /* __ASM_RWONCE_H */ -- cgit v1.2.3 From 3c9184109e78ea2371ca8fa66d7f36986a53af98 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 30 Oct 2019 16:51:07 +0000 Subject: asm/rwonce: Remove smp_read_barrier_depends() invocation Alpha overrides __READ_ONCE() directly, so there's no need to use smp_read_barrier_depends() in the core code. This also means that __READ_ONCE() can be relied upon to provide dependency ordering. Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Signed-off-by: Will Deacon --- include/asm-generic/rwonce.h | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/include/asm-generic/rwonce.h b/include/asm-generic/rwonce.h index 04586b55a7c2..3a7f737c77bd 100644 --- a/include/asm-generic/rwonce.h +++ b/include/asm-generic/rwonce.h @@ -40,24 +40,16 @@ /* * Use __READ_ONCE() instead of READ_ONCE() if you do not require any - * atomicity or dependency ordering guarantees. Note that this may result - * in tears! + * atomicity. Note that this may result in tears! */ #ifndef __READ_ONCE #define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x)) #endif -#define __READ_ONCE_SCALAR(x) \ -({ \ - __unqual_scalar_typeof(x) __x = __READ_ONCE(x); \ - smp_read_barrier_depends(); \ - (typeof(x))__x; \ -}) - #define READ_ONCE(x) \ ({ \ compiletime_assert_rwonce_type(x); \ - __READ_ONCE_SCALAR(x); \ + __READ_ONCE(x); \ }) #define __WRITE_ONCE(x, val) \ @@ -84,12 +76,9 @@ unsigned long __read_once_word_nocheck(const void *addr) */ #define READ_ONCE_NOCHECK(x) \ ({ \ - unsigned long __x; \ - compiletime_assert(sizeof(x) == sizeof(__x), \ + compiletime_assert(sizeof(x) == sizeof(unsigned long), \ "Unsupported access size for READ_ONCE_NOCHECK()."); \ - __x = __read_once_word_nocheck(&(x)); \ - smp_read_barrier_depends(); \ - (typeof(x))__x; \ + (typeof(x))__read_once_word_nocheck(&(x)); \ }) static __no_kasan_or_inline -- cgit v1.2.3 From 002dff36acfba3476b685a09f78ffb7c452f5951 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 10 Jul 2020 14:49:40 +0100 Subject: asm/rwonce: Don't pull into 'asm-generic/rwonce.h' Now that 'smp_read_barrier_depends()' has gone the way of the Norwegian Blue, drop the inclusion of in 'asm-generic/rwonce.h'. 
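As a rough illustration (not taken from this patch; the function and variable names are invented), any header or translation unit that wants both the ONCE accessors and the SMP barriers now has to pull in asm/barrier.h itself instead of getting it transitively through linux/compiler.h:

/* Hypothetical example: explicit includes needed after this change. */
#include <linux/compiler.h>	/* READ_ONCE()/WRITE_ONCE() via asm/rwonce.h */
#include <asm/barrier.h>	/* smp_mb() and friends, now explicit */

static inline void demo_publish(int *p, int v)
{
	WRITE_ONCE(*p, v);
	smp_mb();	/* would no longer resolve without asm/barrier.h */
}
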
This requires fixups to some architecture vdso headers which were previously relying on 'asm/barrier.h' coming in via 'linux/compiler.h'. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Will Deacon --- arch/arm/include/asm/vdso/gettimeofday.h | 1 + arch/arm64/include/asm/vdso/compat_gettimeofday.h | 1 + arch/arm64/include/asm/vdso/gettimeofday.h | 1 + arch/riscv/include/asm/vdso/gettimeofday.h | 1 + include/asm-generic/rwonce.h | 2 -- include/linux/nospec.h | 2 ++ 6 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/vdso/gettimeofday.h b/arch/arm/include/asm/vdso/gettimeofday.h index 36dc18553ed8..1b207cf07697 100644 --- a/arch/arm/include/asm/vdso/gettimeofday.h +++ b/arch/arm/include/asm/vdso/gettimeofday.h @@ -7,6 +7,7 @@ #ifndef __ASSEMBLY__ +#include #include #include #include diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h index b6907ae78e53..bcf7649999a4 100644 --- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h +++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h @@ -7,6 +7,7 @@ #ifndef __ASSEMBLY__ +#include #include #include diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h index afba6ba332f8..127fa63893e2 100644 --- a/arch/arm64/include/asm/vdso/gettimeofday.h +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -7,6 +7,7 @@ #ifndef __ASSEMBLY__ +#include #include #define VDSO_HAS_CLOCK_GETRES 1 diff --git a/arch/riscv/include/asm/vdso/gettimeofday.h b/arch/riscv/include/asm/vdso/gettimeofday.h index c8e818688ec1..3099362d9f26 100644 --- a/arch/riscv/include/asm/vdso/gettimeofday.h +++ b/arch/riscv/include/asm/vdso/gettimeofday.h @@ -4,6 +4,7 @@ #ifndef __ASSEMBLY__ +#include #include #include #include diff --git a/include/asm-generic/rwonce.h b/include/asm-generic/rwonce.h index 3a7f737c77bd..8d0a6280e982 100644 --- a/include/asm-generic/rwonce.h +++ b/include/asm-generic/rwonce.h @@ -26,8 +26,6 @@ #include #include -#include - /* * Yes, this permits 64-bit accesses on 32-bit architectures. These will * actually be atomic in some cases (namely Armv7 + LPAE), but for others we diff --git a/include/linux/nospec.h b/include/linux/nospec.h index 0c5ef54fd416..c1e79f72cd89 100644 --- a/include/linux/nospec.h +++ b/include/linux/nospec.h @@ -5,6 +5,8 @@ #ifndef _LINUX_NOSPEC_H #define _LINUX_NOSPEC_H + +#include #include struct task_struct; -- cgit v1.2.3 From 71c0b9a65cefa8c34eab83d337a1e3ec61fb7cc2 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 30 Oct 2019 16:22:17 +0000 Subject: vhost: Remove redundant use of read_barrier_depends() barrier Since commit 76ebbe78f739 ("locking/barriers: Add implicit smp_read_barrier_depends() to READ_ONCE()"), there is no need to use smp_read_barrier_depends() outside of the Alpha architecture code. Unfortunately, there is precisely _one_ user in the vhost code, and there isn't an obvious READ_ONCE() access making the barrier redundant. However, on closer inspection (thanks, Jason), it appears that vring synchronisation between the producer and consumer occurs via the 'avail_idx' field, which is followed up by an rmb() in vhost_get_vq_desc(), making the read_barrier_depends() redundant on Alpha. 
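To make the ordering concrete, here is a deliberately simplified sketch of the publish/consume pattern being discussed. It is not the vhost code; the structure and function names are invented, and it assumes the kernel's READ_ONCE()/WRITE_ONCE() and smp_wmb()/smp_rmb() primitives:

/*
 * Illustrative only: the producer publishes a payload, then an index;
 * the consumer reads the index, then the payload. The smp_wmb()/smp_rmb()
 * pairing is what makes the extra dependency barrier redundant.
 */
struct demo_ring {
	void		*slot[16];	/* payload written by the producer */
	unsigned short	avail_idx;	/* index the consumer polls */
};

static void demo_produce(struct demo_ring *r, void *data)
{
	r->slot[r->avail_idx & 15] = data;		/* 1) write payload */
	smp_wmb();					/* 2) payload before index */
	WRITE_ONCE(r->avail_idx, r->avail_idx + 1);	/* 3) publish index */
}

static void *demo_consume(struct demo_ring *r, unsigned short last)
{
	if (READ_ONCE(r->avail_idx) == last)		/* 1) read index */
		return NULL;
	smp_rmb();					/* 2) index before payload */
	return r->slot[last & 15];			/* 3) read payload */
}

Jason's analysis below walks through the same three steps for the real indirect descriptor path.
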
Jason says: | I'm also confused about the barrier here, basically in driver side | we did: | | 1) allocate pages | 2) store pages in indirect->addr | 3) smp_wmb() | 4) increase the avail idx (somehow a tail pointer of vring) | | in vhost we did: | | 1) read avail idx | 2) smp_rmb() | 3) read indirect->addr | 4) read from indirect->addr | | It looks to me even the data dependency barrier is not necessary | since we have rmb() which is sufficient for us to the correct | indirect->addr and driver are not expected to do any writing to | indirect->addr after avail idx is increased Remove the redundant barrier invocation. Acked-by: Michael S. Tsirkin Acked-by: Peter Zijlstra (Intel) Suggested-by: Jason Wang Acked-by: Paul E. McKenney Signed-off-by: Will Deacon --- drivers/vhost/vhost.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index d7b8df3edffc..74d135ee7e26 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2092,11 +2092,6 @@ static int get_indirect(struct vhost_virtqueue *vq, return ret; } iov_iter_init(&from, READ, vq->indirect, ret, len); - - /* We will use the result as an address to read from, so most - * architectures only need a compiler barrier here. */ - read_barrier_depends(); - count = len / sizeof desc; /* Buffers are chained via a 16 bit next field, so * we can have at most 2^16 of these. */ -- cgit v1.2.3 From bb7cdd38185a4f9fa32e62db115c2c6dceb2b621 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 30 Oct 2019 17:15:01 +0000 Subject: alpha: Replace smp_read_barrier_depends() usage with smp_[r]mb() In preparation for removing smp_read_barrier_depends() altogether, move the Alpha code over to using smp_rmb() and smp_mb() directly. Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Signed-off-by: Will Deacon --- arch/alpha/include/asm/atomic.h | 16 ++++++++-------- arch/alpha/include/asm/pgtable.h | 10 +++++----- mm/memory.c | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h index 2144530d1428..2f8f7e54792f 100644 --- a/arch/alpha/include/asm/atomic.h +++ b/arch/alpha/include/asm/atomic.h @@ -16,10 +16,10 @@ /* * To ensure dependency ordering is preserved for the _relaxed and - * _release atomics, an smp_read_barrier_depends() is unconditionally - * inserted into the _relaxed variants, which are used to build the - * barriered versions. Avoid redundant back-to-back fences in the - * _acquire and _fence versions. + * _release atomics, an smp_mb() is unconditionally inserted into the + * _relaxed variants, which are used to build the barriered versions. + * Avoid redundant back-to-back fences in the _acquire and _fence + * versions. 
*/ #define __atomic_acquire_fence() #define __atomic_post_full_fence() @@ -70,7 +70,7 @@ static inline int atomic_##op##_return_relaxed(int i, atomic_t *v) \ ".previous" \ :"=&r" (temp), "=m" (v->counter), "=&r" (result) \ :"Ir" (i), "m" (v->counter) : "memory"); \ - smp_read_barrier_depends(); \ + smp_mb(); \ return result; \ } @@ -88,7 +88,7 @@ static inline int atomic_fetch_##op##_relaxed(int i, atomic_t *v) \ ".previous" \ :"=&r" (temp), "=m" (v->counter), "=&r" (result) \ :"Ir" (i), "m" (v->counter) : "memory"); \ - smp_read_barrier_depends(); \ + smp_mb(); \ return result; \ } @@ -123,7 +123,7 @@ static __inline__ s64 atomic64_##op##_return_relaxed(s64 i, atomic64_t * v) \ ".previous" \ :"=&r" (temp), "=m" (v->counter), "=&r" (result) \ :"Ir" (i), "m" (v->counter) : "memory"); \ - smp_read_barrier_depends(); \ + smp_mb(); \ return result; \ } @@ -141,7 +141,7 @@ static __inline__ s64 atomic64_fetch_##op##_relaxed(s64 i, atomic64_t * v) \ ".previous" \ :"=&r" (temp), "=m" (v->counter), "=&r" (result) \ :"Ir" (i), "m" (v->counter) : "memory"); \ - smp_read_barrier_depends(); \ + smp_mb(); \ return result; \ } diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 162c17b2631f..660b14ce1317 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -277,9 +277,9 @@ extern inline pte_t pte_mkdirty(pte_t pte) { pte_val(pte) |= __DIRTY_BITS; retur extern inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= __ACCESS_BITS; return pte; } /* - * The smp_read_barrier_depends() in the following functions are required to - * order the load of *dir (the pointer in the top level page table) with any - * subsequent load of the returned pmd_t *ret (ret is data dependent on *dir). + * The smp_rmb() in the following functions are required to order the load of + * *dir (the pointer in the top level page table) with any subsequent load of + * the returned pmd_t *ret (ret is data dependent on *dir). * * If this ordering is not enforced, the CPU might load an older value of * *ret, which may be uninitialized data. See mm/memory.c:__pte_alloc for @@ -293,7 +293,7 @@ extern inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= __ACCESS_BITS; retu extern inline pmd_t * pmd_offset(pud_t * dir, unsigned long address) { pmd_t *ret = (pmd_t *) pud_page_vaddr(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1)); - smp_read_barrier_depends(); /* see above */ + smp_rmb(); /* see above */ return ret; } #define pmd_offset pmd_offset @@ -303,7 +303,7 @@ extern inline pte_t * pte_offset_kernel(pmd_t * dir, unsigned long address) { pte_t *ret = (pte_t *) pmd_page_vaddr(*dir) + ((address >> PAGE_SHIFT) & (PTRS_PER_PAGE - 1)); - smp_read_barrier_depends(); /* see above */ + smp_rmb(); /* see above */ return ret; } #define pte_offset_kernel pte_offset_kernel diff --git a/mm/memory.c b/mm/memory.c index 87ec87cdc1ff..e1f2c730d8bb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -437,7 +437,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) * of a chain of data-dependent loads, meaning most CPUs (alpha * being the notable exception) will already guarantee loads are * seen in-order. See the alpha page table accessors for the - * smp_read_barrier_depends() barriers in page table walking code. + * smp_rmb() barriers in page table walking code. 
*/ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ -- cgit v1.2.3 From 93fab07c22930c9ac4f01212fd92913c9a812f9f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 30 Oct 2019 17:17:22 +0000 Subject: locking/barriers: Remove definitions for [smp_]read_barrier_depends() There are no remaining users of [smp_]read_barrier_depends(), so remove it from the generic implementation of 'barrier.h'. Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Signed-off-by: Will Deacon --- include/asm-generic/barrier.h | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 8116744bb82c..fec97dc34de7 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -46,10 +46,6 @@ #define dma_wmb() wmb() #endif -#ifndef read_barrier_depends -#define read_barrier_depends() do { } while (0) -#endif - #ifndef __smp_mb #define __smp_mb() mb() #endif @@ -62,10 +58,6 @@ #define __smp_wmb() wmb() #endif -#ifndef __smp_read_barrier_depends -#define __smp_read_barrier_depends() read_barrier_depends() -#endif - #ifdef CONFIG_SMP #ifndef smp_mb @@ -80,10 +72,6 @@ #define smp_wmb() __smp_wmb() #endif -#ifndef smp_read_barrier_depends -#define smp_read_barrier_depends() __smp_read_barrier_depends() -#endif - #else /* !CONFIG_SMP */ #ifndef smp_mb @@ -98,10 +86,6 @@ #define smp_wmb() barrier() #endif -#ifndef smp_read_barrier_depends -#define smp_read_barrier_depends() do { } while (0) -#endif - #endif /* CONFIG_SMP */ #ifndef __smp_store_mb @@ -196,7 +180,6 @@ do { \ #define virt_mb() __smp_mb() #define virt_rmb() __smp_rmb() #define virt_wmb() __smp_wmb() -#define virt_read_barrier_depends() __smp_read_barrier_depends() #define virt_store_mb(var, value) __smp_store_mb(var, value) #define virt_mb__before_atomic() __smp_mb__before_atomic() #define virt_mb__after_atomic() __smp_mb__after_atomic() -- cgit v1.2.3 From 8ca924aeb4f28e5bf552707e8ecbe105c4f17c7b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 7 Nov 2019 14:36:37 +0000 Subject: Documentation/barriers: Remove references to [smp_]read_barrier_depends() The [smp_]read_barrier_depends() barrier macros no longer exist as part of the Linux memory model, so remove all references to them from the Documentation/ directory. Although this is fairly mechanical on the whole, we drop the "CACHE COHERENCY" section entirely from 'memory-barriers.txt' as it doesn't make any sense now that the dependency barriers have been removed. Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Signed-off-by: Will Deacon --- .../RCU/Design/Requirements/Requirements.rst | 2 +- Documentation/memory-barriers.txt | 156 ++------------------- 2 files changed, 9 insertions(+), 149 deletions(-) diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst index 75b8ca007a11..50d5c43c48b0 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.rst +++ b/Documentation/RCU/Design/Requirements/Requirements.rst @@ -463,7 +463,7 @@ again without disrupting RCU readers. This guarantee was only partially premeditated. DYNIX/ptx used an explicit memory barrier for publication, but had nothing resembling ``rcu_dereference()`` for subscription, nor did it have anything -resembling the ``smp_read_barrier_depends()`` that was later subsumed +resembling the dependency-ordering barrier that was later subsumed into ``rcu_dereference()`` and later still into ``READ_ONCE()``. 
The need for these operations made itself known quite suddenly at a late-1990s meeting with the DEC Alpha architects, back in the days when diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index eaabc3134294..4e55aba3eb4a 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -553,12 +553,12 @@ There are certain things that the Linux kernel memory barriers do not guarantee: DATA DEPENDENCY BARRIERS (HISTORICAL) ------------------------------------- -As of v4.15 of the Linux kernel, an smp_read_barrier_depends() was -added to READ_ONCE(), which means that about the only people who -need to pay attention to this section are those working on DEC Alpha -architecture-specific code and those working on READ_ONCE() itself. -For those who need it, and for those who are interested in the history, -here is the story of data-dependency barriers. +As of v4.15 of the Linux kernel, an smp_mb() was added to READ_ONCE() for +DEC Alpha, which means that about the only people who need to pay attention +to this section are those working on DEC Alpha architecture-specific code +and those working on READ_ONCE() itself. For those who need it, and for +those who are interested in the history, here is the story of +data-dependency barriers. The usage requirements of data dependency barriers are a little subtle, and it's not always obvious that they're needed. To illustrate, consider the @@ -2708,144 +2708,6 @@ the properties of the memory window through which devices are accessed and/or the use of any special device communication instructions the CPU may have. -CACHE COHERENCY ---------------- - -Life isn't quite as simple as it may appear above, however: for while the -caches are expected to be coherent, there's no guarantee that that coherency -will be ordered. This means that while changes made on one CPU will -eventually become visible on all CPUs, there's no guarantee that they will -become apparent in the same order on those other CPUs. - - -Consider dealing with a system that has a pair of CPUs (1 & 2), each of which -has a pair of parallel data caches (CPU 1 has A/B, and CPU 2 has C/D): - - : - : +--------+ - : +---------+ | | - +--------+ : +--->| Cache A |<------->| | - | | : | +---------+ | | - | CPU 1 |<---+ | | - | | : | +---------+ | | - +--------+ : +--->| Cache B |<------->| | - : +---------+ | | - : | Memory | - : +---------+ | System | - +--------+ : +--->| Cache C |<------->| | - | | : | +---------+ | | - | CPU 2 |<---+ | | - | | : | +---------+ | | - +--------+ : +--->| Cache D |<------->| | - : +---------+ | | - : +--------+ - : - -Imagine the system has the following properties: - - (*) an odd-numbered cache line may be in cache A, cache C or it may still be - resident in memory; - - (*) an even-numbered cache line may be in cache B, cache D or it may still be - resident in memory; - - (*) while the CPU core is interrogating one cache, the other cache may be - making use of the bus to access the rest of the system - perhaps to - displace a dirty cacheline or to do a speculative load; - - (*) each cache has a queue of operations that need to be applied to that cache - to maintain coherency with the rest of the system; - - (*) the coherency queue is not flushed by normal loads to lines already - present in the cache, even though the contents of the queue may - potentially affect those loads. 
- -Imagine, then, that two writes are made on the first CPU, with a write barrier -between them to guarantee that they will appear to reach that CPU's caches in -the requisite order: - - CPU 1 CPU 2 COMMENT - =============== =============== ======================================= - u == 0, v == 1 and p == &u, q == &u - v = 2; - smp_wmb(); Make sure change to v is visible before - change to p - v is now in cache A exclusively - p = &v; - p is now in cache B exclusively - -The write memory barrier forces the other CPUs in the system to perceive that -the local CPU's caches have apparently been updated in the correct order. But -now imagine that the second CPU wants to read those values: - - CPU 1 CPU 2 COMMENT - =============== =============== ======================================= - ... - q = p; - x = *q; - -The above pair of reads may then fail to happen in the expected order, as the -cacheline holding p may get updated in one of the second CPU's caches while -the update to the cacheline holding v is delayed in the other of the second -CPU's caches by some other cache event: - - CPU 1 CPU 2 COMMENT - =============== =============== ======================================= - u == 0, v == 1 and p == &u, q == &u - v = 2; - smp_wmb(); - - - p = &v; q = p; - - - - x = *q; - Reads from v before v updated in cache - - - -Basically, while both cachelines will be updated on CPU 2 eventually, there's -no guarantee that, without intervention, the order of update will be the same -as that committed on CPU 1. - - -To intervene, we need to interpolate a data dependency barrier or a read -barrier between the loads (which as of v4.15 is supplied unconditionally -by the READ_ONCE() macro). This will force the cache to commit its -coherency queue before processing any further requests: - - CPU 1 CPU 2 COMMENT - =============== =============== ======================================= - u == 0, v == 1 and p == &u, q == &u - v = 2; - smp_wmb(); - - - p = &v; q = p; - - - - smp_read_barrier_depends() - - - x = *q; - Reads from v after v updated in cache - - -This sort of problem can be encountered on DEC Alpha processors as they have a -split cache that improves performance by making better use of the data bus. -While most CPUs do imply a data dependency barrier on the read when a memory -access depends on a read, not all do, so it may not be relied on. - -Other CPUs may also have split caches, but must coordinate between the various -cachelets for normal memory accesses. The semantics of the Alpha removes the -need for hardware coordination in the absence of memory barriers, which -permitted Alpha to sport higher CPU clock rates back in the day. However, -please note that (again, as of v4.15) smp_read_barrier_depends() should not -be used except in Alpha arch-specific code and within the READ_ONCE() macro. - - CACHE COHERENCY VS DMA ---------------------- @@ -3009,10 +2871,8 @@ caches with the memory coherence system, thus making it seem like pointer changes vs new data occur in the right order. The Alpha defines the Linux kernel's memory model, although as of v4.15 -the Linux kernel's addition of smp_read_barrier_depends() to READ_ONCE() -greatly reduced Alpha's impact on the memory model. - -See the subsection on "Cache Coherency" above. +the Linux kernel's addition of smp_mb() to READ_ONCE() on Alpha greatly +reduced its impact on the memory model. 
VIRTUAL MACHINE GUESTS -- cgit v1.2.3 From 9ce1b14e74042a3477f880bee675945044880b01 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 29 Nov 2019 19:08:37 +0100 Subject: Documentation/barriers/kokr: Remove references to [smp_]read_barrier_depends() This commit translates commit ("Documentation/barriers: Remove references to [smp_]read_barrier_depends()") into Korean. Acked-by: Peter Zijlstra (Intel) Reviewed-by: Yunjae Lee Signed-off-by: SeongJae Park Signed-off-by: Will Deacon --- .../translations/ko_KR/memory-barriers.txt | 146 +-------------------- 1 file changed, 3 insertions(+), 143 deletions(-) diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt index 34d041d68f78..a1f772ef622c 100644 --- a/Documentation/translations/ko_KR/memory-barriers.txt +++ b/Documentation/translations/ko_KR/memory-barriers.txt @@ -577,7 +577,7 @@ ACQUIRE 는 해당 오퍼레이션의 로드 부분에만 적용되고 RELEASE 데이터 의존성 배리어 (역사적) ----------------------------- -리눅스 커널 v4.15 기준으로, smp_read_barrier_depends() 가 READ_ONCE() 에 +리눅스 커널 v4.15 기준으로, smp_mb() 가 DEC Alpha 용 READ_ONCE() 코드에 추가되었는데, 이는 이 섹션에 주의를 기울여야 하는 사람들은 DEC Alpha 아키텍쳐 전용 코드를 만드는 사람들과 READ_ONCE() 자체를 만드는 사람들 뿐임을 의미합니다. 그런 분들을 위해, 그리고 역사에 관심 있는 분들을 위해, 여기 데이터 의존성 @@ -2664,144 +2664,6 @@ CPU 코어는 프로그램의 인과성이 유지된다고만 여겨진다면 수도 있습니다. -캐시 일관성 ------------ - -하지만 삶은 앞에서 이야기한 것처럼 단순하지 않습니다: 캐시들은 일관적일 것으로 -기대되지만, 그 일관성이 순서에도 적용될 거라는 보장은 없습니다. 한 CPU 에서 -만들어진 변경 사항은 최종적으로는 시스템의 모든 CPU 에게 보여지게 되지만, 다른 -CPU 들에게도 같은 순서로 보이게 될 거라는 보장은 없다는 뜻입니다. - - -두개의 CPU (1 & 2) 가 달려 있고, 각 CPU 에 두개의 데이터 캐시(CPU 1 은 A/B 를, -CPU 2 는 C/D 를 갖습니다)가 병렬로 연결되어 있는 시스템을 다룬다고 생각해 -봅시다: - - : - : +--------+ - : +---------+ | | - +--------+ : +--->| Cache A |<------->| | - | | : | +---------+ | | - | CPU 1 |<---+ | | - | | : | +---------+ | | - +--------+ : +--->| Cache B |<------->| | - : +---------+ | | - : | Memory | - : +---------+ | System | - +--------+ : +--->| Cache C |<------->| | - | | : | +---------+ | | - | CPU 2 |<---+ | | - | | : | +---------+ | | - +--------+ : +--->| Cache D |<------->| | - : +---------+ | | - : +--------+ - : - -이 시스템이 다음과 같은 특성을 갖는다 생각해 봅시다: - - (*) 홀수번 캐시라인은 캐시 A, 캐시 C 또는 메모리에 위치할 수 있음; - - (*) 짝수번 캐시라인은 캐시 B, 캐시 D 또는 메모리에 위치할 수 있음; - - (*) CPU 코어가 한개의 캐시에 접근하는 동안, 다른 캐시는 - 더티 캐시라인을 - 메모리에 내리거나 추측성 로드를 하거나 하기 위해 - 시스템의 다른 부분에 - 액세스 하기 위해 버스를 사용할 수 있음; - - (*) 각 캐시는 시스템의 나머지 부분들과 일관성을 맞추기 위해 해당 캐시에 - 적용되어야 할 오퍼레이션들의 큐를 가짐; - - (*) 이 일관성 큐는 캐시에 이미 존재하는 라인에 가해지는 평범한 로드에 의해서는 - 비워지지 않는데, 큐의 오퍼레이션들이 이 로드의 결과에 영향을 끼칠 수 있다 - 할지라도 그러함. - -이제, 첫번째 CPU 에서 두개의 쓰기 오퍼레이션을 만드는데, 해당 CPU 의 캐시에 -요청된 순서로 오퍼레이션이 도달됨을 보장하기 위해 두 오퍼레이션 사이에 쓰기 -배리어를 사용하는 상황을 상상해 봅시다: - - CPU 1 CPU 2 COMMENT - =============== =============== ======================================= - u == 0, v == 1 and p == &u, q == &u - v = 2; - smp_wmb(); v 의 변경이 p 의 변경 전에 보일 것을 - 분명히 함 - v 는 이제 캐시 A 에 독점적으로 존재함 - p = &v; - p 는 이제 캐시 B 에 독점적으로 존재함 - -여기서의 쓰기 메모리 배리어는 CPU 1 의 캐시가 올바른 순서로 업데이트 된 것으로 -시스템의 다른 CPU 들이 인지하게 만듭니다. 하지만, 이제 두번째 CPU 가 그 값들을 -읽으려 하는 상황을 생각해 봅시다: - - CPU 1 CPU 2 COMMENT - =============== =============== ======================================= - ... - q = p; - x = *q; - -위의 두개의 읽기 오퍼레이션은 예상된 순서로 일어나지 못할 수 있는데, 두번째 CPU -의 한 캐시에 다른 캐시 이벤트가 발생해 v 를 담고 있는 캐시라인의 해당 캐시에의 -업데이트가 지연되는 사이, p 를 담고 있는 캐시라인은 두번째 CPU 의 다른 캐시에 -업데이트 되어버렸을 수 있기 때문입니다. 
- - CPU 1 CPU 2 COMMENT - =============== =============== ======================================= - u == 0, v == 1 and p == &u, q == &u - v = 2; - smp_wmb(); - - - p = &v; q = p; - - - - x = *q; - 캐시에 업데이트 되기 전의 v 를 읽음 - - - -기본적으로, 두개의 캐시라인 모두 CPU 2 에 최종적으로는 업데이트 될 것이지만, -별도의 개입 없이는, 업데이트의 순서가 CPU 1 에서 만들어진 순서와 동일할 -것이라는 보장이 없습니다. - - -여기에 개입하기 위해선, 데이터 의존성 배리어나 읽기 배리어를 로드 오퍼레이션들 -사이에 넣어야 합니다 (v4.15 부터는 READ_ONCE() 매크로에 의해 무조건적으로 -그렇게 됩니다). 이렇게 함으로써 캐시가 다음 요청을 처리하기 전에 일관성 큐를 -처리하도록 강제하게 됩니다. - - CPU 1 CPU 2 COMMENT - =============== =============== ======================================= - u == 0, v == 1 and p == &u, q == &u - v = 2; - smp_wmb(); - - - p = &v; q = p; - - - - smp_read_barrier_depends() - - - x = *q; - 캐시에 업데이트 된 v 를 읽음 - - -이런 부류의 문제는 DEC Alpha 계열 프로세서들에서 발견될 수 있는데, 이들은 -데이터 버스를 좀 더 잘 사용해 성능을 개선할 수 있는, 분할된 캐시를 가지고 있기 -때문입니다. 대부분의 CPU 는 하나의 읽기 오퍼레이션의 메모리 액세스가 다른 읽기 -오퍼레이션에 의존적이라면 데이터 의존성 배리어를 내포시킵니다만, 모두가 그런건 -아니기 때문에 이점에 의존해선 안됩니다. - -다른 CPU 들도 분할된 캐시를 가지고 있을 수 있지만, 그런 CPU 들은 평범한 메모리 -액세스를 위해서도 이 분할된 캐시들 사이의 조정을 해야만 합니다. Alpha 는 가장 -약한 메모리 순서 시맨틱 (semantic) 을 선택함으로써 메모리 배리어가 명시적으로 -사용되지 않았을 때에는 그런 조정이 필요하지 않게 했으며, 이는 Alpha 가 당시에 -더 높은 CPU 클락 속도를 가질 수 있게 했습니다. 하지만, (다시 말하건대, v4.15 -이후부터는) Alpha 아키텍쳐 전용 코드와 READ_ONCE() 매크로 내부에서를 제외하고는 -smp_read_barrier_depends() 가 사용되지 않아야 함을 알아두시기 바랍니다. - - 캐시 일관성 VS DMA ------------------ @@ -2962,10 +2824,8 @@ Alpha CPU 의 일부 버전은 분할된 데이터 캐시를 가지고 있어서 데이터의 발견을 올바른 순서로 일어나게 하기 때문입니다. 리눅스 커널의 메모리 배리어 모델은 Alpha 에 기초해서 정의되었습니다만, v4.15 -부터는 리눅스 커널이 READ_ONCE() 내에 smp_read_barrier_depends() 를 추가해서 -Alpha 의 메모리 모델로의 영향력이 크게 줄어들긴 했습니다. - -위의 "캐시 일관성" 서브섹션을 참고하세요. +부터는 Alpha 용 READ_ONCE() 코드 내에 smp_mb() 가 추가되어서 메모리 모델로의 +Alpha 의 영향력이 크게 줄어들었습니다. 가상 머신 게스트 -- cgit v1.2.3 From 628fd55671f753a1e4fe8c21b6a0553503cade08 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 7 Nov 2019 14:44:06 +0000 Subject: tools/memory-model: Remove smp_read_barrier_depends() from informal doc smp_read_barrier_depends() has gone the way of mmiowb() and so many esoteric memory barriers before it. Drop the two mentions of this deceased barrier from the LKMM informal explanation document. Acked-by: Peter Zijlstra (Intel) Acked-by: Alan Stern Acked-by: Paul E. McKenney Signed-off-by: Will Deacon --- tools/memory-model/Documentation/explanation.txt | 26 +++++++++++------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt index e91a2eb19592..01adf9e0ebac 100644 --- a/tools/memory-model/Documentation/explanation.txt +++ b/tools/memory-model/Documentation/explanation.txt @@ -1122,12 +1122,10 @@ maintain at least the appearance of FIFO order. In practice, this difficulty is solved by inserting a special fence between P1's two loads when the kernel is compiled for the Alpha architecture. In fact, as of version 4.15, the kernel automatically -adds this fence (called smp_read_barrier_depends() and defined as -nothing at all on non-Alpha builds) after every READ_ONCE() and atomic -load. The effect of the fence is to cause the CPU not to execute any -po-later instructions until after the local cache has finished -processing all the stores it has already received. Thus, if the code -was changed to: +adds this fence after every READ_ONCE() and atomic load on Alpha. The +effect of the fence is to cause the CPU not to execute any po-later +instructions until after the local cache has finished processing all +the stores it has already received. 
Thus, if the code was changed to: P1() { @@ -1146,14 +1144,14 @@ READ_ONCE() or another synchronization primitive rather than accessed directly. The LKMM requires that smp_rmb(), acquire fences, and strong fences -share this property with smp_read_barrier_depends(): They do not allow -the CPU to execute any po-later instructions (or po-later loads in the -case of smp_rmb()) until all outstanding stores have been processed by -the local cache. In the case of a strong fence, the CPU first has to -wait for all of its po-earlier stores to propagate to every other CPU -in the system; then it has to wait for the local cache to process all -the stores received as of that time -- not just the stores received -when the strong fence began. +share this property: They do not allow the CPU to execute any po-later +instructions (or po-later loads in the case of smp_rmb()) until all +outstanding stores have been processed by the local cache. In the +case of a strong fence, the CPU first has to wait for all of its +po-earlier stores to propagate to every other CPU in the system; then +it has to wait for the local cache to process all the stores received +as of that time -- not just the stores received when the strong fence +began. And of course, none of this matters for any architecture other than Alpha. -- cgit v1.2.3 From c6cd2e011655aead2097273a04350f52429a1a8d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 7 Nov 2019 14:46:59 +0000 Subject: include/linux: Remove smp_read_barrier_depends() from comments smp_read_barrier_depends() doesn't exist any more, so reword the two comments that mention it to refer to "dependency ordering" instead. Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Signed-off-by: Will Deacon --- include/linux/percpu-refcount.h | 2 +- include/linux/ptr_ring.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 22d9d183950d..87d8a38bdea1 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -155,7 +155,7 @@ static inline bool __ref_is_percpu(struct percpu_ref *ref, * between contaminating the pointer value, meaning that * READ_ONCE() is required when fetching it. * - * The smp_read_barrier_depends() implied by READ_ONCE() pairs + * The dependency ordering from the READ_ONCE() pairs * with smp_store_release() in __percpu_ref_switch_to_percpu(). */ percpu_ptr = READ_ONCE(ref->percpu_count_ptr); diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 417db0a79a62..808f9d3ee546 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -107,7 +107,7 @@ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr) return -ENOSPC; /* Make sure the pointer we are storing points to a valid data. */ - /* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */ + /* Pairs with the dependency ordering in __ptr_ring_consume. */ smp_wmb(); WRITE_ONCE(r->queue[r->producer++], ptr); -- cgit v1.2.3 From ad83ec6ce13618a8b975ffdd8291742cb5b0005b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 7 Nov 2019 14:49:00 +0000 Subject: checkpatch: Remove checks relating to [smp_]read_barrier_depends() The [smp_]read_barrier_depends() macros no longer exist, so we don't need to deal with them in the checkpatch script. Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. 
McKenney Signed-off-by: Will Deacon --- scripts/checkpatch.pl | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 4c820607540b..8032f80c5bc7 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -5903,8 +5903,7 @@ sub process { my $barriers = qr{ mb| rmb| - wmb| - read_barrier_depends + wmb }x; my $barrier_stems = qr{ mb__before_atomic| @@ -5953,12 +5952,6 @@ sub process { } } -# check for smp_read_barrier_depends and read_barrier_depends - if (!$file && $line =~ /\b(smp_|)read_barrier_depends\s*\(/) { - WARN("READ_BARRIER_DEPENDS", - "$1read_barrier_depends should only be used in READ_ONCE or DEC Alpha code\n" . $herecurr); - } - # check of hardware specific defines if ($line =~ m@^.\s*\#\s*if.*\b(__i386__|__powerpc64__|__sun__|__s390x__)\b@ && $realfile !~ m@include/asm-@) { CHK("ARCH_DEFINES", -- cgit v1.2.3 From eb5c2d4b45e3d2d5d052ea6b8f1463976b1020d5 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 21 Jul 2020 09:54:15 +0100 Subject: compiler.h: Move compiletime_assert() macros into compiler_types.h The kernel test robot reports that moving READ_ONCE() out into its own header breaks a W=1 build for parisc, which is relying on the definition of compiletime_assert() being available: | In file included from ./arch/parisc/include/generated/asm/rwonce.h:1, | from ./include/asm-generic/barrier.h:16, | from ./arch/parisc/include/asm/barrier.h:29, | from ./arch/parisc/include/asm/atomic.h:11, | from ./include/linux/atomic.h:7, | from kernel/locking/percpu-rwsem.c:2: | ./arch/parisc/include/asm/atomic.h: In function 'atomic_read': | ./include/asm-generic/rwonce.h:36:2: error: implicit declaration of function 'compiletime_assert' [-Werror=implicit-function-declaration] | 36 | compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \ | | ^~~~~~~~~~~~~~~~~~ | ./include/asm-generic/rwonce.h:49:2: note: in expansion of macro 'compiletime_assert_rwonce_type' | 49 | compiletime_assert_rwonce_type(x); \ | | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ./arch/parisc/include/asm/atomic.h:73:9: note: in expansion of macro 'READ_ONCE' | 73 | return READ_ONCE((v)->counter); | | ^~~~~~~~~ Move these macros into compiler_types.h, so that they are available to READ_ONCE() and friends. 
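As a hedged illustration of what the relocated macros provide (not part of this patch; the struct and function names are invented), compiletime_assert() turns a constant-false condition into a build error, which is exactly what the READ_ONCE() size check relies on:

/*
 * Illustrative only: compiletime_assert() references a function declared
 * with __compiletime_error() when the condition is false, breaking the
 * build. Names below are made up for the example.
 */
#include <linux/compiler.h>	/* pulls in compiler_types.h */

struct demo_counter { unsigned long val; };

static inline unsigned long demo_counter_read(const struct demo_counter *c)
{
	/* Breaks the build if 'val' ever stops being a native word. */
	compiletime_assert(__native_word(c->val),
			   "demo_counter_read() needs a native word sized field");
	return READ_ONCE(c->val);
}
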
Link: http://lists.infradead.org/pipermail/linux-arm-kernel/2020-July/587094.html Reported-by: kernel test robot Signed-off-by: Will Deacon --- include/linux/compiler.h | 41 ----------------------------------------- include/linux/compiler_types.h | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index f075a3df4fe2..59f7194fdf08 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -273,47 +273,6 @@ static inline void *offset_to_ptr(const int *off) #endif /* __ASSEMBLY__ */ -/* Compile time object size, -1 for unknown */ -#ifndef __compiletime_object_size -# define __compiletime_object_size(obj) -1 -#endif -#ifndef __compiletime_warning -# define __compiletime_warning(message) -#endif -#ifndef __compiletime_error -# define __compiletime_error(message) -#endif - -#ifdef __OPTIMIZE__ -# define __compiletime_assert(condition, msg, prefix, suffix) \ - do { \ - extern void prefix ## suffix(void) __compiletime_error(msg); \ - if (!(condition)) \ - prefix ## suffix(); \ - } while (0) -#else -# define __compiletime_assert(condition, msg, prefix, suffix) do { } while (0) -#endif - -#define _compiletime_assert(condition, msg, prefix, suffix) \ - __compiletime_assert(condition, msg, prefix, suffix) - -/** - * compiletime_assert - break build and emit msg if condition is false - * @condition: a compile-time constant condition to check - * @msg: a message to emit if condition is false - * - * In tradition of POSIX assert, this macro will break the build if the - * supplied condition is *false*, emitting the supplied error message if the - * compiler has support to do so. - */ -#define compiletime_assert(condition, msg) \ - _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) - -#define compiletime_assert_atomic_type(t) \ - compiletime_assert(__native_word(t), \ - "Need native word sized stores/loads for atomicity.") - /* &a[0] degrades to a pointer: a different type from an array */ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index c3bf7710f69a..d9bbb62a3e2a 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -300,6 +300,47 @@ struct ftrace_likely_data { (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \ sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) +/* Compile time object size, -1 for unknown */ +#ifndef __compiletime_object_size +# define __compiletime_object_size(obj) -1 +#endif +#ifndef __compiletime_warning +# define __compiletime_warning(message) +#endif +#ifndef __compiletime_error +# define __compiletime_error(message) +#endif + +#ifdef __OPTIMIZE__ +# define __compiletime_assert(condition, msg, prefix, suffix) \ + do { \ + extern void prefix ## suffix(void) __compiletime_error(msg); \ + if (!(condition)) \ + prefix ## suffix(); \ + } while (0) +#else +# define __compiletime_assert(condition, msg, prefix, suffix) do { } while (0) +#endif + +#define _compiletime_assert(condition, msg, prefix, suffix) \ + __compiletime_assert(condition, msg, prefix, suffix) + +/** + * compiletime_assert - break build and emit msg if condition is false + * @condition: a compile-time constant condition to check + * @msg: a message to emit if condition is false + * + * In tradition of POSIX assert, this macro will break the build if the + * supplied condition is *false*, emitting the supplied error message if the + 
* compiler has support to do so. + */ +#define compiletime_assert(condition, msg) \ + _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) + +#define compiletime_assert_atomic_type(t) \ + compiletime_assert(__native_word(t), \ + "Need native word sized stores/loads for atomicity.") + /* Helpers for emitting diagnostics in pragmas. */ #ifndef __diag #define __diag(string) -- cgit v1.2.3 From 5f1f7f6c205a2e7f1d92229ac358254bd2826c2d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 30 Jun 2020 13:53:07 +0100 Subject: arm64: Reduce the number of header files pulled into vmlinux.lds.S Although vmlinux.lds.S smells like an assembly file and is compiled with __ASSEMBLY__ defined, it's actually just fed to the preprocessor to create our linker script. This means that any assembly macros defined by headers that it includes will result in a helpful link error: | aarch64-linux-gnu-ld:./arch/arm64/kernel/vmlinux.lds:1: syntax error In preparation for an arm64-private asm/rwonce.h implementation, which will end up pulling assembly macros into linux/compiler.h, reduce the number of headers we include directly and transitively in vmlinux.lds.S Acked-by: Peter Zijlstra (Intel) Signed-off-by: Will Deacon --- arch/arm64/include/asm/kernel-pgtable.h | 2 +- arch/arm64/include/asm/memory.h | 11 ++++++----- arch/arm64/include/asm/uaccess.h | 1 + arch/arm64/kernel/entry.S | 1 + arch/arm64/kernel/vmlinux.lds.S | 1 - arch/arm64/kvm/hyp-init.S | 1 + 6 files changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 3bf626f6fe0c..329fb15f6bac 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -8,7 +8,7 @@ #ifndef __ASM_KERNEL_PGTABLE_H #define __ASM_KERNEL_PGTABLE_H -#include +#include #include /* diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index a1871bb32bb1..9d4bf58cf7b3 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -10,11 +10,8 @@ #ifndef __ASM_MEMORY_H #define __ASM_MEMORY_H -#include #include #include -#include -#include #include /* @@ -157,11 +154,15 @@ #endif #ifndef __ASSEMBLY__ -extern u64 vabits_actual; -#define PAGE_END (_PAGE_END(vabits_actual)) #include +#include #include +#include +#include + +extern u64 vabits_actual; +#define PAGE_END (_PAGE_END(vabits_actual)) extern s64 physvirt_offset; extern s64 memstart_addr; diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index bc5c7b091152..8d7c466f809b 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -19,6 +19,7 @@ #include #include +#include #include #include #include diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 5304d193c79d..b668aad3b762 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 6827da7f3aa5..e1e7c0431b4d 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -10,7 +10,6 @@ #include #include #include -#include #include #include diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 6e6ed5581eed..076544393c3c 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -6,6 +6,7 @@ #include +#include #include #include #include -- cgit v1.2.3 From 
55fdc1f44cd6bb1d61c9ca946d8f7cd67ea0bf36 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Tue, 21 Jul 2020 18:49:33 +0800 Subject: arm64: perf: Expose some new events via sysfs Some new PMU events can been detected by PMCEID1_EL0, but it can't be listed, Let's expose these through sysfs. Signed-off-by: Shaokun Zhang Cc: Will Deacon Cc: Mark Rutland Link: https://lore.kernel.org/r/1595328573-12751-2-git-send-email-zhangshaokun@hisilicon.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/perf_event.h | 27 +++++++++++++++++++++++++++ arch/arm64/kernel/perf_event.c | 19 +++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h index e7765b62c712..2c2d7dbe8a02 100644 --- a/arch/arm64/include/asm/perf_event.h +++ b/arch/arm64/include/asm/perf_event.h @@ -72,6 +72,13 @@ #define ARMV8_PMUV3_PERFCTR_LL_CACHE_RD 0x36 #define ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD 0x37 #define ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS_RD 0x38 +#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_LMISS_RD 0x39 +#define ARMV8_PMUV3_PERFCTR_OP_RETIRED 0x3A +#define ARMV8_PMUV3_PERFCTR_OP_SPEC 0x3B +#define ARMV8_PMUV3_PERFCTR_STALL 0x3C +#define ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND 0x3D +#define ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND 0x3E +#define ARMV8_PMUV3_PERFCTR_STALL_SLOT 0x3F /* Statistical profiling extension microarchitectural events */ #define ARMV8_SPE_PERFCTR_SAMPLE_POP 0x4000 @@ -79,6 +86,26 @@ #define ARMV8_SPE_PERFCTR_SAMPLE_FILTRATE 0x4002 #define ARMV8_SPE_PERFCTR_SAMPLE_COLLISION 0x4003 +/* AMUv1 architecture events */ +#define ARMV8_AMU_PERFCTR_CNT_CYCLES 0x4004 +#define ARMV8_AMU_PERFCTR_STALL_BACKEND_MEM 0x4005 + +/* long-latency read miss events */ +#define ARMV8_PMUV3_PERFCTR_L1I_CACHE_LMISS 0x4006 +#define ARMV8_PMUV3_PERFCTR_L2D_CACHE_LMISS_RD 0x4009 +#define ARMV8_PMUV3_PERFCTR_L2I_CACHE_LMISS 0x400A +#define ARMV8_PMUV3_PERFCTR_L3D_CACHE_LMISS_RD 0x400B + +/* additional latency from alignment events */ +#define ARMV8_PMUV3_PERFCTR_LDST_ALIGN_LAT 0x4020 +#define ARMV8_PMUV3_PERFCTR_LD_ALIGN_LAT 0x4021 +#define ARMV8_PMUV3_PERFCTR_ST_ALIGN_LAT 0x4022 + +/* Armv8.5 Memory Tagging Extension events */ +#define ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED 0x4024 +#define ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_RD 0x4025 +#define ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_WR 0x4026 + /* ARMv8 recommended implementation defined event types */ #define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD 0x40 #define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR 0x41 diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index fdb6029c9021..462f9a9cc44b 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -225,10 +225,29 @@ static struct attribute *armv8_pmuv3_event_attrs[] = { ARMV8_EVENT_ATTR(ll_cache_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_RD), ARMV8_EVENT_ATTR(ll_cache_miss_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD), ARMV8_EVENT_ATTR(remote_access_rd, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS_RD), + ARMV8_EVENT_ATTR(l1d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L1D_CACHE_LMISS_RD), + ARMV8_EVENT_ATTR(op_retired, ARMV8_PMUV3_PERFCTR_OP_RETIRED), + ARMV8_EVENT_ATTR(op_spec, ARMV8_PMUV3_PERFCTR_OP_SPEC), + ARMV8_EVENT_ATTR(stall, ARMV8_PMUV3_PERFCTR_STALL), + ARMV8_EVENT_ATTR(stall_slot_backend, ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND), + ARMV8_EVENT_ATTR(stall_slot_frontend, ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND), + ARMV8_EVENT_ATTR(stall_slot, ARMV8_PMUV3_PERFCTR_STALL_SLOT), ARMV8_EVENT_ATTR(sample_pop, ARMV8_SPE_PERFCTR_SAMPLE_POP), 
ARMV8_EVENT_ATTR(sample_feed, ARMV8_SPE_PERFCTR_SAMPLE_FEED), ARMV8_EVENT_ATTR(sample_filtrate, ARMV8_SPE_PERFCTR_SAMPLE_FILTRATE), ARMV8_EVENT_ATTR(sample_collision, ARMV8_SPE_PERFCTR_SAMPLE_COLLISION), + ARMV8_EVENT_ATTR(cnt_cycles, ARMV8_AMU_PERFCTR_CNT_CYCLES), + ARMV8_EVENT_ATTR(stall_backend_mem, ARMV8_AMU_PERFCTR_STALL_BACKEND_MEM), + ARMV8_EVENT_ATTR(l1i_cache_lmiss, ARMV8_PMUV3_PERFCTR_L1I_CACHE_LMISS), + ARMV8_EVENT_ATTR(l2d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L2D_CACHE_LMISS_RD), + ARMV8_EVENT_ATTR(l2i_cache_lmiss, ARMV8_PMUV3_PERFCTR_L2I_CACHE_LMISS), + ARMV8_EVENT_ATTR(l3d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L3D_CACHE_LMISS_RD), + ARMV8_EVENT_ATTR(ldst_align_lat, ARMV8_PMUV3_PERFCTR_LDST_ALIGN_LAT), + ARMV8_EVENT_ATTR(ld_align_lat, ARMV8_PMUV3_PERFCTR_LD_ALIGN_LAT), + ARMV8_EVENT_ATTR(st_align_lat, ARMV8_PMUV3_PERFCTR_ST_ALIGN_LAT), + ARMV8_EVENT_ATTR(mem_access_checked, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED), + ARMV8_EVENT_ATTR(mem_access_checked_rd, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_RD), + ARMV8_EVENT_ATTR(mem_access_checked_wr, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_WR), NULL, }; -- cgit v1.2.3 From 493cf9b723bcc87e9284e5e5971259951a13f22e Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Tue, 21 Jul 2020 10:12:59 +0100 Subject: arm64: s/AMEVTYPE/AMEVTYPER Activity Monitor Event Type Registers are named as AMEVTYPER{0,1} Signed-off-by: Vladimir Murzin Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20200721091259.102756-1-vladimir.murzin@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sysreg.h | 4 +-- arch/arm64/kvm/sys_regs.c | 68 ++++++++++++++++++++--------------------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 463175f80341..273bb1d15d21 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -421,9 +421,9 @@ */ #define SYS_AMEVCNTR0_EL0(n) SYS_AM_EL0(4 + ((n) >> 3), (n) & 7) -#define SYS_AMEVTYPE0_EL0(n) SYS_AM_EL0(6 + ((n) >> 3), (n) & 7) +#define SYS_AMEVTYPER0_EL0(n) SYS_AM_EL0(6 + ((n) >> 3), (n) & 7) #define SYS_AMEVCNTR1_EL0(n) SYS_AM_EL0(12 + ((n) >> 3), (n) & 7) -#define SYS_AMEVTYPE1_EL0(n) SYS_AM_EL0(14 + ((n) >> 3), (n) & 7) +#define SYS_AMEVTYPER1_EL0(n) SYS_AM_EL0(14 + ((n) >> 3), (n) & 7) /* AMU v1: Fixed (architecturally defined) activity monitors */ #define SYS_AMEVCNTR0_CORE_EL0 SYS_AMEVCNTR0_EL0(0) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index baf5ce9225ce..d3196671c590 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1024,9 +1024,9 @@ static bool access_amu(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* Macro to expand the AMU counter and type registers*/ #define AMU_AMEVCNTR0_EL0(n) { SYS_DESC(SYS_AMEVCNTR0_EL0(n)), access_amu } -#define AMU_AMEVTYPE0_EL0(n) { SYS_DESC(SYS_AMEVTYPE0_EL0(n)), access_amu } +#define AMU_AMEVTYPER0_EL0(n) { SYS_DESC(SYS_AMEVTYPER0_EL0(n)), access_amu } #define AMU_AMEVCNTR1_EL0(n) { SYS_DESC(SYS_AMEVCNTR1_EL0(n)), access_amu } -#define AMU_AMEVTYPE1_EL0(n) { SYS_DESC(SYS_AMEVTYPE1_EL0(n)), access_amu } +#define AMU_AMEVTYPER1_EL0(n) { SYS_DESC(SYS_AMEVTYPER1_EL0(n)), access_amu } static bool trap_ptrauth(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -1629,22 +1629,22 @@ static const struct sys_reg_desc sys_reg_descs[] = { AMU_AMEVCNTR0_EL0(13), AMU_AMEVCNTR0_EL0(14), AMU_AMEVCNTR0_EL0(15), - AMU_AMEVTYPE0_EL0(0), - AMU_AMEVTYPE0_EL0(1), - AMU_AMEVTYPE0_EL0(2), - AMU_AMEVTYPE0_EL0(3), - AMU_AMEVTYPE0_EL0(4), - 
AMU_AMEVTYPE0_EL0(5), - AMU_AMEVTYPE0_EL0(6), - AMU_AMEVTYPE0_EL0(7), - AMU_AMEVTYPE0_EL0(8), - AMU_AMEVTYPE0_EL0(9), - AMU_AMEVTYPE0_EL0(10), - AMU_AMEVTYPE0_EL0(11), - AMU_AMEVTYPE0_EL0(12), - AMU_AMEVTYPE0_EL0(13), - AMU_AMEVTYPE0_EL0(14), - AMU_AMEVTYPE0_EL0(15), + AMU_AMEVTYPER0_EL0(0), + AMU_AMEVTYPER0_EL0(1), + AMU_AMEVTYPER0_EL0(2), + AMU_AMEVTYPER0_EL0(3), + AMU_AMEVTYPER0_EL0(4), + AMU_AMEVTYPER0_EL0(5), + AMU_AMEVTYPER0_EL0(6), + AMU_AMEVTYPER0_EL0(7), + AMU_AMEVTYPER0_EL0(8), + AMU_AMEVTYPER0_EL0(9), + AMU_AMEVTYPER0_EL0(10), + AMU_AMEVTYPER0_EL0(11), + AMU_AMEVTYPER0_EL0(12), + AMU_AMEVTYPER0_EL0(13), + AMU_AMEVTYPER0_EL0(14), + AMU_AMEVTYPER0_EL0(15), AMU_AMEVCNTR1_EL0(0), AMU_AMEVCNTR1_EL0(1), AMU_AMEVCNTR1_EL0(2), @@ -1661,22 +1661,22 @@ static const struct sys_reg_desc sys_reg_descs[] = { AMU_AMEVCNTR1_EL0(13), AMU_AMEVCNTR1_EL0(14), AMU_AMEVCNTR1_EL0(15), - AMU_AMEVTYPE1_EL0(0), - AMU_AMEVTYPE1_EL0(1), - AMU_AMEVTYPE1_EL0(2), - AMU_AMEVTYPE1_EL0(3), - AMU_AMEVTYPE1_EL0(4), - AMU_AMEVTYPE1_EL0(5), - AMU_AMEVTYPE1_EL0(6), - AMU_AMEVTYPE1_EL0(7), - AMU_AMEVTYPE1_EL0(8), - AMU_AMEVTYPE1_EL0(9), - AMU_AMEVTYPE1_EL0(10), - AMU_AMEVTYPE1_EL0(11), - AMU_AMEVTYPE1_EL0(12), - AMU_AMEVTYPE1_EL0(13), - AMU_AMEVTYPE1_EL0(14), - AMU_AMEVTYPE1_EL0(15), + AMU_AMEVTYPER1_EL0(0), + AMU_AMEVTYPER1_EL0(1), + AMU_AMEVTYPER1_EL0(2), + AMU_AMEVTYPER1_EL0(3), + AMU_AMEVTYPER1_EL0(4), + AMU_AMEVTYPER1_EL0(5), + AMU_AMEVTYPER1_EL0(6), + AMU_AMEVTYPER1_EL0(7), + AMU_AMEVTYPER1_EL0(8), + AMU_AMEVTYPER1_EL0(9), + AMU_AMEVTYPER1_EL0(10), + AMU_AMEVTYPER1_EL0(11), + AMU_AMEVTYPER1_EL0(12), + AMU_AMEVTYPER1_EL0(13), + AMU_AMEVTYPER1_EL0(14), + AMU_AMEVTYPER1_EL0(15), { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer }, -- cgit v1.2.3 From 0ae3b13aab210e2a8c14371731abddfee228ae24 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 21 Jul 2020 10:33:15 +0200 Subject: arm64/entry: deduplicate SW PAN entry/exit routines Factor the 12 copies of the SW PAN entry and exit code into callable subroutines, and use alternatives patching to either emit a 'bl' instruction to call them, or a NOP if h/w PAN is found to be available at runtime. Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20200721083315.4816-1-ardb@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/kernel/entry.S | 95 +++++++++++++++++++++++------------------------ 1 file changed, 47 insertions(+), 48 deletions(-) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 5304d193c79d..7b9a7c45ef85 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -209,28 +209,9 @@ alternative_cb_end add x29, sp, #S_STACKFRAME #ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Set the TTBR0 PAN bit in SPSR. When the exception is taken from - * EL0, there is no need to check the state of TTBR0_EL1 since - * accesses are always enabled. - * Note that the meaning of this bit differs from the ARMv8.1 PAN - * feature as all TTBR0_EL1 accesses are disabled, not just those to - * user mappings. 
- */ -alternative_if ARM64_HAS_PAN - b 1f // skip TTBR0 PAN +alternative_if_not ARM64_HAS_PAN + bl __swpan_entry_el\el alternative_else_nop_endif - - .if \el != 0 - mrs x21, ttbr0_el1 - tst x21, #TTBR_ASID_MASK // Check for the reserved ASID - orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR - b.eq 1f // TTBR0 access already disabled - and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR - .endif - - __uaccess_ttbr0_disable x21 -1: #endif stp x22, x23, [sp, #S_PC] @@ -284,34 +265,9 @@ alternative_else_nop_endif .endif #ifdef CONFIG_ARM64_SW_TTBR0_PAN - /* - * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR - * PAN bit checking. - */ -alternative_if ARM64_HAS_PAN - b 2f // skip TTBR0 PAN +alternative_if_not ARM64_HAS_PAN + bl __swpan_exit_el\el alternative_else_nop_endif - - .if \el != 0 - tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set - .endif - - __uaccess_ttbr0_enable x0, x1 - - .if \el == 0 - /* - * Enable errata workarounds only if returning to user. The only - * workaround currently required for TTBR0_EL1 changes are for the - * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache - * corruption). - */ - bl post_ttbr_update_workaround - .endif -1: - .if \el != 0 - and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit - .endif -2: #endif .if \el == 0 @@ -391,6 +347,49 @@ alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 sb .endm +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Set the TTBR0 PAN bit in SPSR. When the exception is taken from + * EL0, there is no need to check the state of TTBR0_EL1 since + * accesses are always enabled. + * Note that the meaning of this bit differs from the ARMv8.1 PAN + * feature as all TTBR0_EL1 accesses are disabled, not just those to + * user mappings. + */ +SYM_CODE_START_LOCAL(__swpan_entry_el1) + mrs x21, ttbr0_el1 + tst x21, #TTBR_ASID_MASK // Check for the reserved ASID + orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR + b.eq 1f // TTBR0 access already disabled + and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR +SYM_INNER_LABEL(__swpan_entry_el0, SYM_L_LOCAL) + __uaccess_ttbr0_disable x21 +1: ret +SYM_CODE_END(__swpan_entry_el1) + + /* + * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR + * PAN bit checking. + */ +SYM_CODE_START_LOCAL(__swpan_exit_el1) + tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set + __uaccess_ttbr0_enable x0, x1 +1: and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit + ret +SYM_CODE_END(__swpan_exit_el1) + +SYM_CODE_START_LOCAL(__swpan_exit_el0) + __uaccess_ttbr0_enable x0, x1 + /* + * Enable errata workarounds only if returning to user. The only + * workaround currently required for TTBR0_EL1 changes are for the + * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache + * corruption). + */ + b post_ttbr_update_workaround +SYM_CODE_END(__swpan_exit_el0) +#endif + .macro irq_stack_entry mov x19, sp // preserve the original sp #ifdef CONFIG_SHADOW_CALL_STACK -- cgit v1.2.3 From a46cec12f4a53ee5113f42b327cbb8d4cda074d2 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 24 Jul 2020 10:41:31 +0100 Subject: arm64: Reserve HWCAP2_MTE as (1 << 18) While MTE is not supported in the upstream kernel yet, add a comment that HWCAP2_MTE as (1 << 18) is reserved. Glibc makes use of it for the resolving (ifunc) of the MTE-safe string routines. 
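A minimal userspace sketch of how such a hwcap bit is typically probed once a kernel starts advertising it; the local HWCAP2_MTE define below is an assumption that merely mirrors the reserved value, since the bit is not yet exported through the uapi headers:

  #include <sys/auxv.h>

  #ifndef HWCAP2_MTE
  #define HWCAP2_MTE	(1 << 18)	/* assumed: mirrors the reserved bit above */
  #endif

  /* Returns non-zero once a (future) kernel advertises MTE in AT_HWCAP2. */
  static int cpu_has_mte(void)
  {
  	return !!(getauxval(AT_HWCAP2) & HWCAP2_MTE);
  }

A libc ifunc resolver can use a check like this at load time to select MTE-safe string routines.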
Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/hwcap.h | 1 + arch/arm64/include/uapi/asm/hwcap.h | 1 + arch/arm64/kernel/cpuinfo.c | 1 + 3 files changed, 3 insertions(+) diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index d683bcbf1e7c..22f73fe09030 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -95,6 +95,7 @@ #define KERNEL_HWCAP_DGH __khwcap2_feature(DGH) #define KERNEL_HWCAP_RNG __khwcap2_feature(RNG) #define KERNEL_HWCAP_BTI __khwcap2_feature(BTI) +/* reserved for KERNEL_HWCAP_MTE __khwcap2_feature(MTE) */ /* * This yields a mask that user programs can use to figure out what diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 2d6ba1c2592e..912162f73529 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -74,5 +74,6 @@ #define HWCAP2_DGH (1 << 15) #define HWCAP2_RNG (1 << 16) #define HWCAP2_BTI (1 << 17) +/* reserved for HWCAP2_MTE (1 << 18) */ #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 86637466daa8..393c6fb1f1cb 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -93,6 +93,7 @@ static const char *const hwcap_str[] = { "dgh", "rng", "bti", + /* reserved for "mte" */ NULL }; -- cgit v1.2.3 From ea0eada45632f4807b2f49de951072283e2d781c Mon Sep 17 00:00:00 2001 From: Gregory Herrero Date: Fri, 17 Jul 2020 16:33:38 +0200 Subject: recordmcount: only record relocation of type R_AARCH64_CALL26 on arm64. Currently, if a section has a relocation to '_mcount' symbol, a new __mcount_loc entry will be added whatever the relocation type is. This is problematic when a relocation to '_mcount' is in the middle of a section and is not a call for ftrace use. Such relocation could be generated with below code for example: bool is_mcount(unsigned long addr) { return (target == (unsigned long) &_mcount); } With this snippet of code, ftrace will try to patch the mcount location generated by this code on module load and fail with: Call trace: ftrace_bug+0xa0/0x28c ftrace_process_locs+0x2f4/0x430 ftrace_module_init+0x30/0x38 load_module+0x14f0/0x1e78 __do_sys_finit_module+0x100/0x11c __arm64_sys_finit_module+0x28/0x34 el0_svc_common+0x88/0x194 el0_svc_handler+0x38/0x8c el0_svc+0x8/0xc ---[ end trace d828d06b36ad9d59 ]--- ftrace failed to modify [] 0xffffa2dbf3a3a41c actual: 66:a9:3c:90 Initializing ftrace call sites ftrace record flags: 2000000 (0) expected tramp: ffffa2dc6cf66724 So Limit the relocation type to R_AARCH64_CALL26 as in perl version of recordmcount. Fixes: af64d2aa872a ("ftrace: Add arm64 support to recordmcount") Signed-off-by: Gregory Herrero Acked-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20200717143338.19302-1-gregory.herrero@oracle.com Signed-off-by: Catalin Marinas --- scripts/recordmcount.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 7225107a9aaf..e59022b3f125 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -434,6 +434,11 @@ static int arm_is_fake_mcount(Elf32_Rel const *rp) return 1; } +static int arm64_is_fake_mcount(Elf64_Rel const *rp) +{ + return ELF64_R_TYPE(w(rp->r_info)) != R_AARCH64_CALL26; +} + /* 64-bit EM_MIPS has weird ELF64_Rela.r_info. 
* http://techpubs.sgi.com/library/manuals/4000/007-4658-001/pdf/007-4658-001.pdf * We interpret Table 29 Relocation Operation (Elf64_Rel, Elf64_Rela) [p.40] @@ -547,6 +552,7 @@ static int do_file(char const *const fname) make_nop = make_nop_arm64; rel_type_nop = R_AARCH64_NONE; ideal_nop = ideal_nop4_arm64; + is_fake_mcount64 = arm64_is_fake_mcount; break; case EM_IA_64: reltype = R_IA64_IMM64; break; case EM_MIPS: /* reltype: e_class */ break; -- cgit v1.2.3 From d53b5c013e1e7c3b43a7487171a84ee2acdd9597 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 24 Jun 2020 01:33:16 -0700 Subject: arm64/vdso: use the fault callback to map vvar pages Currently the vdso has no awareness of time namespaces, which may apply distinct offsets to processes in different namespaces. To handle this within the vdso, we'll need to expose a per-namespace data page. As a preparatory step, this patch separates the vdso data page from the code pages, and has it faulted in via its own fault callback. Subsequent patches will extend this to support distinct pages per time namespace. The vvar vma has to be installed with the VM_PFNMAP flag to handle faults via its vma fault callback. Signed-off-by: Andrei Vagin Reviewed-by: Vincenzo Frascino Reviewed-by: Dmitry Safonov Link: https://lore.kernel.org/r/20200624083321.144975-2-avagin@gmail.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/vdso.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index e546df0efefb..eb7798e5eb00 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -107,29 +107,32 @@ static int __vdso_init(enum vdso_abi abi) vdso_info[abi].vdso_code_start) >> PAGE_SHIFT; - /* Allocate the vDSO pagelist, plus a page for the data. */ - vdso_pagelist = kcalloc(vdso_info[abi].vdso_pages + 1, + vdso_pagelist = kcalloc(vdso_info[abi].vdso_pages, sizeof(struct page *), GFP_KERNEL); if (vdso_pagelist == NULL) return -ENOMEM; - /* Grab the vDSO data page. */ - vdso_pagelist[0] = phys_to_page(__pa_symbol(vdso_data)); - - /* Grab the vDSO code pages.
*/ pfn = sym_to_pfn(vdso_info[abi].vdso_code_start); for (i = 0; i < vdso_info[abi].vdso_pages; i++) - vdso_pagelist[i + 1] = pfn_to_page(pfn + i); + vdso_pagelist[i] = pfn_to_page(pfn + i); - vdso_info[abi].dm->pages = &vdso_pagelist[0]; - vdso_info[abi].cm->pages = &vdso_pagelist[1]; + vdso_info[abi].cm->pages = vdso_pagelist; return 0; } +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + if (vmf->pgoff == 0) + return vmf_insert_pfn(vma, vmf->address, + sym_to_pfn(vdso_data)); + return VM_FAULT_SIGBUS; +} + static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, struct linux_binprm *bprm, @@ -150,7 +153,7 @@ static int __setup_additional_pages(enum vdso_abi abi, } ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE, - VM_READ|VM_MAYREAD, + VM_READ|VM_MAYREAD|VM_PFNMAP, vdso_info[abi].dm); if (IS_ERR(ret)) goto up_fail; @@ -206,6 +209,7 @@ static struct vm_special_mapping aarch32_vdso_maps[] = { #ifdef CONFIG_COMPAT_VDSO [AA32_MAP_VVAR] = { .name = "[vvar]", + .fault = vvar_fault, }, [AA32_MAP_VDSO] = { .name = "[vdso]", @@ -371,6 +375,7 @@ enum aarch64_map { static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = { [AA64_MAP_VVAR] = { .name = "[vvar]", + .fault = vvar_fault, }, [AA64_MAP_VDSO] = { .name = "[vdso]", -- cgit v1.2.3 From 1b6867d2916bb91e94ddcc9c709e4779419fe391 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 24 Jun 2020 01:33:17 -0700 Subject: arm64/vdso: Zap vvar pages when switching to a time namespace The order of vvar pages depends on whether a task belongs to the root time namespace or not. In the root time namespace, a task doesn't have a per-namespace page. In a non-root namespace, the VVAR page which contains the system-wide VDSO data is replaced with a namespace specific page that contains clock offsets. Whenever a task changes its namespace, the VVAR page tables are cleared and then they will be re-faulted with a corresponding layout. A task can switch its time namespace only if its ->mm isn't shared with another task. Signed-off-by: Andrei Vagin Reviewed-by: Vincenzo Frascino Reviewed-by: Dmitry Safonov Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20200624083321.144975-3-avagin@gmail.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/vdso.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index eb7798e5eb00..33ac18060bfc 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -124,6 +124,37 @@ static int __vdso_init(enum vdso_abi abi) return 0; } +#ifdef CONFIG_TIME_NS +/* + * The vvar mapping contains data for a specific time namespace, so when a task + * changes namespace we must unmap its vvar data for the old namespace. + * Subsequent faults will map in data for the new namespace. + * + * For more details see timens_setup_vdso_data(). 
+ */ +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + + mmap_read_lock(mm); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long size = vma->vm_end - vma->vm_start; + + if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA64].dm)) + zap_page_range(vma, vma->vm_start, size); +#ifdef CONFIG_COMPAT_VDSO + if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA32].dm)) + zap_page_range(vma, vma->vm_start, size); +#endif + } + + mmap_read_unlock(mm); + return 0; +} +#endif + static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { -- cgit v1.2.3 From 3503d56cc7233ced602e38a4c13caa64f00ab2aa Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 24 Jun 2020 01:33:18 -0700 Subject: arm64/vdso: Add time namespace page Allocate the time namespace page among VVAR pages. Provide __arch_get_timens_vdso_data() helper for VDSO code to get the code-relative position of VVARs on that special page. If a task belongs to a time namespace then the VVAR page which contains the system wide VDSO data is replaced with a namespace specific page which has the same layout as the VVAR page. That page has vdso_data->seq set to 1 to enforce the slow path and vdso_data->clock_mode set to VCLOCK_TIMENS to enforce the time namespace handling path. The extra check in the case that vdso_data->seq is odd, e.g. a concurrent update of the VDSO data is in progress, is not really affecting regular tasks which are not part of a time namespace as the task is spin waiting for the update to finish and vdso_data->seq to become even again. If a time namespace task hits that code path, it invokes the corresponding time getter function which retrieves the real VVAR page, reads host time and then adds the offset for the requested clock which is stored in the special VVAR page. The time-namespace page isn't allocated on !CONFIG_TIME_NAMESPACE, but vma is the same size, which simplifies criu/vdso migration between different kernel configs. Signed-off-by: Andrei Vagin Reviewed-by: Vincenzo Frascino Reviewed-by: Dmitry Safonov Cc: Mark Rutland Link: https://lore.kernel.org/r/20200624083321.144975-4-avagin@gmail.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/vdso.h | 2 ++ arch/arm64/include/asm/vdso/compat_gettimeofday.h | 12 ++++++++++++ arch/arm64/include/asm/vdso/gettimeofday.h | 8 ++++++++ arch/arm64/kernel/vdso.c | 19 ++++++++++++++++--- arch/arm64/kernel/vdso/vdso.lds.S | 5 ++++- arch/arm64/kernel/vdso32/vdso.lds.S | 5 ++++- include/vdso/datapage.h | 1 + 7 files changed, 47 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/vdso.h b/arch/arm64/include/asm/vdso.h index 07468428fd29..f99dcb94b438 100644 --- a/arch/arm64/include/asm/vdso.h +++ b/arch/arm64/include/asm/vdso.h @@ -12,6 +12,8 @@ */ #define VDSO_LBASE 0x0 +#define __VVAR_PAGES 2 + #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h index b6907ae78e53..b7c549d46d18 100644 --- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h +++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h @@ -152,6 +152,18 @@ static __always_inline const struct vdso_data *__arch_get_vdso_data(void) return ret; } +#ifdef CONFIG_TIME_NS +static __always_inline const struct vdso_data *__arch_get_timens_vdso_data(void) +{ + const struct vdso_data *ret; + + /* See __arch_get_vdso_data(). 
*/ + asm volatile("mov %0, %1" : "=r"(ret) : "r"(_timens_data)); + + return ret; +} +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h index afba6ba332f8..cf39eae5eaaf 100644 --- a/arch/arm64/include/asm/vdso/gettimeofday.h +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -96,6 +96,14 @@ const struct vdso_data *__arch_get_vdso_data(void) return _vdso_data; } +#ifdef CONFIG_TIME_NS +static __always_inline +const struct vdso_data *__arch_get_timens_vdso_data(void) +{ + return _timens_data; +} +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 33ac18060bfc..fcb559726920 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -40,6 +40,12 @@ enum vdso_abi { #endif /* CONFIG_COMPAT_VDSO */ }; +enum vvar_pages { + VVAR_DATA_PAGE_OFFSET, + VVAR_TIMENS_PAGE_OFFSET, + VVAR_NR_PAGES, +}; + struct vdso_abi_info { const char *name; const char *vdso_code_start; @@ -125,6 +131,11 @@ static int __vdso_init(enum vdso_abi abi) } #ifdef CONFIG_TIME_NS +struct vdso_data *arch_get_vdso_data(void *vvar_page) +{ + return (struct vdso_data *)(vvar_page); +} + /* * The vvar mapping contains data for a specific time namespace, so when a task * changes namespace we must unmap its vvar data for the old namespace. @@ -173,9 +184,11 @@ static int __setup_additional_pages(enum vdso_abi abi, unsigned long gp_flags = 0; void *ret; + BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); + vdso_text_len = vdso_info[abi].vdso_pages << PAGE_SHIFT; /* Be sure to map the data page */ - vdso_mapping_len = vdso_text_len + PAGE_SIZE; + vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE; vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); if (IS_ERR_VALUE(vdso_base)) { @@ -183,7 +196,7 @@ static int __setup_additional_pages(enum vdso_abi abi, goto up_fail; } - ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE, + ret = _install_special_mapping(mm, vdso_base, VVAR_NR_PAGES * PAGE_SIZE, VM_READ|VM_MAYREAD|VM_PFNMAP, vdso_info[abi].dm); if (IS_ERR(ret)) @@ -192,7 +205,7 @@ static int __setup_additional_pages(enum vdso_abi abi, if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti()) gp_flags = VM_ARM64_BTI; - vdso_base += PAGE_SIZE; + vdso_base += VVAR_NR_PAGES * PAGE_SIZE; mm->context.vdso = (void *)vdso_base; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, VM_READ|VM_EXEC|gp_flags| diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S index 7ad2d3a0cd48..d808ad31e01f 100644 --- a/arch/arm64/kernel/vdso/vdso.lds.S +++ b/arch/arm64/kernel/vdso/vdso.lds.S @@ -17,7 +17,10 @@ OUTPUT_ARCH(aarch64) SECTIONS { - PROVIDE(_vdso_data = . - PAGE_SIZE); + PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); +#ifdef CONFIG_TIME_NS + PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); +#endif . = VDSO_LBASE + SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/arm64/kernel/vdso32/vdso.lds.S b/arch/arm64/kernel/vdso32/vdso.lds.S index 337d03522048..3348ce5ea306 100644 --- a/arch/arm64/kernel/vdso32/vdso.lds.S +++ b/arch/arm64/kernel/vdso32/vdso.lds.S @@ -17,7 +17,10 @@ OUTPUT_ARCH(arm) SECTIONS { - PROVIDE_HIDDEN(_vdso_data = . - PAGE_SIZE); + PROVIDE_HIDDEN(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); +#ifdef CONFIG_TIME_NS + PROVIDE_HIDDEN(_timens_data = _vdso_data + PAGE_SIZE); +#endif . 
= VDSO_LBASE + SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h index 7955c56d6b3c..ee810cae4e1e 100644 --- a/include/vdso/datapage.h +++ b/include/vdso/datapage.h @@ -109,6 +109,7 @@ struct vdso_data { * relocation, and this is what we need. */ extern struct vdso_data _vdso_data[CS_BASES] __attribute__((visibility("hidden"))); +extern struct vdso_data _timens_data[CS_BASES] __attribute__((visibility("hidden"))); /* * The generic vDSO implementation requires that gettimeofday.h -- cgit v1.2.3 From ee3cda8e46060b021087b6ef451e1cd9fa648af6 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 24 Jun 2020 01:33:19 -0700 Subject: arm64/vdso: Handle faults on timens page If a task belongs to a time namespace then the VVAR page which contains the system wide VDSO data is replaced with a namespace specific page which has the same layout as the VVAR page. Signed-off-by: Andrei Vagin Reviewed-by: Vincenzo Frascino Reviewed-by: Dmitry Safonov Link: https://lore.kernel.org/r/20200624083321.144975-5-avagin@gmail.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/vdso.c | 56 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index fcb559726920..c11ee18e3e79 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -164,15 +165,62 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) mmap_read_unlock(mm); return 0; } + +static struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + if (likely(vma->vm_mm == current->mm)) + return current->nsproxy->time_ns->vvar_page; + + /* + * VM_PFNMAP | VM_IO protect .fault() handler from being called + * through interfaces like /proc/$pid/mem or + * process_vm_{readv,writev}() as long as there's no .access() + * in special_mapping_vmops. + * For more details check_vma_flags() and __access_remote_vm() + */ + WARN(1, "vvar_page accessed remotely"); + + return NULL; +} +#else +static struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + return NULL; +} #endif static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { - if (vmf->pgoff == 0) - return vmf_insert_pfn(vma, vmf->address, - sym_to_pfn(vdso_data)); - return VM_FAULT_SIGBUS; + struct page *timens_page = find_timens_vvar_page(vma); + unsigned long pfn; + + switch (vmf->pgoff) { + case VVAR_DATA_PAGE_OFFSET: + if (timens_page) + pfn = page_to_pfn(timens_page); + else + pfn = sym_to_pfn(vdso_data); + break; +#ifdef CONFIG_TIME_NS + case VVAR_TIMENS_PAGE_OFFSET: + /* + * If a task belongs to a time namespace then a namespace + * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and + * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET + * offset. + * See also the comment near timens_setup_vdso_data(). 
+ */ + if (!timens_page) + return VM_FAULT_SIGBUS; + pfn = sym_to_pfn(vdso_data); + break; +#endif /* CONFIG_TIME_NS */ + default: + return VM_FAULT_SIGBUS; + } + + return vmf_insert_pfn(vma, vmf->address, pfn); } static int __setup_additional_pages(enum vdso_abi abi, -- cgit v1.2.3 From bcf996434240c611f0fdab2c18cd75dd59cfa3c2 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 24 Jun 2020 01:33:20 -0700 Subject: arm64/vdso: Restrict splitting VVAR VMA Forbid splitting the VVAR VMA, resulting in a stricter ABI and reducing the number of corner cases to consider while working further on VDSO time namespace support. As the offset from timens to VVAR page is computed at compile time, the pages in VVAR should stay together and not be partially mremap()'ed. Signed-off-by: Andrei Vagin Reviewed-by: Vincenzo Frascino Reviewed-by: Dmitry Safonov Link: https://lore.kernel.org/r/20200624083321.144975-6-avagin@gmail.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/vdso.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index c11ee18e3e79..d4202a32abc9 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -223,6 +223,17 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, return vmf_insert_pfn(vma, vmf->address, pfn); } +static int vvar_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) +{ + unsigned long new_size = new_vma->vm_end - new_vma->vm_start; + + if (new_size != VVAR_NR_PAGES * PAGE_SIZE) + return -EINVAL; + + return 0; +} + static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, struct linux_binprm *bprm, @@ -302,6 +313,7 @@ static struct vm_special_mapping aarch32_vdso_maps[] = { [AA32_MAP_VVAR] = { .name = "[vvar]", .fault = vvar_fault, + .mremap = vvar_mremap, }, [AA32_MAP_VDSO] = { .name = "[vdso]", @@ -468,6 +480,7 @@ static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = { [AA64_MAP_VVAR] = { .name = "[vvar]", .fault = vvar_fault, + .mremap = vvar_mremap, }, [AA64_MAP_VDSO] = { .name = "[vdso]", -- cgit v1.2.3 From 9614cc576d76a7449cd51b60ef81fd0ce19ee694 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Wed, 24 Jun 2020 01:33:21 -0700 Subject: arm64: enable time namespace support CONFIG_TIME_NS depends on GENERIC_VDSO_TIME_NS. Signed-off-by: Andrei Vagin Reviewed-by: Vincenzo Frascino Reviewed-by: Dmitry Safonov Link: https://lore.kernel.org/r/20200624083321.144975-7-avagin@gmail.com Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 66dc41fd49f2..87255f02ec5b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -118,6 +118,7 @@ config ARM64 select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY + select GENERIC_VDSO_TIME_NS select HANDLE_DOMAIN_IRQ select HARDIRQS_SW_RESEND select HAVE_PCI -- cgit v1.2.3 From 07d2e59f27cd728e6982b52441673886a6d04267 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 19 Jun 2020 09:20:02 +0100 Subject: ACPI/IORT: Make iort_match_node_callback walk the ACPI namespace for NC When the iort_match_node_callback is invoked for a named component the match should be executed upon a device with an ACPI companion.
For devices with no ACPI companion set-up the ACPI device tree must be walked in order to find the first parent node with a companion set and check the parent node against the named component entry to check whether there is a match and therefore an IORT node describing the in/out ID translation for the device has been found. Signed-off-by: Lorenzo Pieralisi Cc: Will Deacon Cc: Hanjun Guo Cc: Sudeep Holla Cc: Robin Murphy Cc: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/20200619082013.13661-2-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/iort.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 28a6b387e80e..5eee81758184 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -264,15 +264,31 @@ static acpi_status iort_match_node_callback(struct acpi_iort_node *node, if (node->type == ACPI_IORT_NODE_NAMED_COMPONENT) { struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; - struct acpi_device *adev = to_acpi_device_node(dev->fwnode); + struct acpi_device *adev; struct acpi_iort_named_component *ncomp; + struct device *nc_dev = dev; + + /* + * Walk the device tree to find a device with an + * ACPI companion; there is no point in scanning + * IORT for a device matching a named component if + * the device does not have an ACPI companion to + * start with. + */ + do { + adev = ACPI_COMPANION(nc_dev); + if (adev) + break; + + nc_dev = nc_dev->parent; + } while (nc_dev); if (!adev) goto out; status = acpi_get_name(adev->handle, ACPI_FULL_PATHNAME, &buf); if (ACPI_FAILURE(status)) { - dev_warn(dev, "Can't get device full path name\n"); + dev_warn(nc_dev, "Can't get device full path name\n"); goto out; } -- cgit v1.2.3 From d1718a1b7a86743b9c517bf9521695ba909c734f Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 19 Jun 2020 09:20:03 +0100 Subject: ACPI/IORT: Make iort_get_device_domain IRQ domain agnostic iort_get_device_domain() is PCI specific but it need not be, since it can be used to retrieve IRQ domain nexus of any kind by adding an irq_domain_bus_token input to it. Make it PCI agnostic by also renaming the requestor ID input to a more generic ID name. Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas # pci/msi.c Cc: Will Deacon Cc: Hanjun Guo Cc: Bjorn Helgaas Cc: Sudeep Holla Cc: Robin Murphy Cc: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/20200619082013.13661-3-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/iort.c | 14 +++++++------- drivers/pci/msi.c | 3 ++- include/linux/acpi_iort.h | 7 ++++--- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 5eee81758184..902e2aaca946 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -550,7 +550,6 @@ static struct acpi_iort_node *iort_find_dev_node(struct device *dev) node = iort_get_iort_node(dev->fwnode); if (node) return node; - /* * if not, then it should be a platform device defined in * DSDT/SSDT (with Named Component node in IORT) @@ -641,13 +640,13 @@ static int __maybe_unused iort_find_its_base(u32 its_id, phys_addr_t *base) /** * iort_dev_find_its_id() - Find the ITS identifier for a device * @dev: The device. - * @req_id: Device's requester ID + * @id: Device's ID * @idx: Index of the ITS identifier list. * @its_id: ITS identifier. 
* * Returns: 0 on success, appropriate error value otherwise */ -static int iort_dev_find_its_id(struct device *dev, u32 req_id, +static int iort_dev_find_its_id(struct device *dev, u32 id, unsigned int idx, int *its_id) { struct acpi_iort_its_group *its; @@ -657,7 +656,7 @@ static int iort_dev_find_its_id(struct device *dev, u32 req_id, if (!node) return -ENXIO; - node = iort_node_map_id(node, req_id, NULL, IORT_MSI_TYPE); + node = iort_node_map_id(node, id, NULL, IORT_MSI_TYPE); if (!node) return -ENXIO; @@ -680,19 +679,20 @@ static int iort_dev_find_its_id(struct device *dev, u32 req_id, * * Returns: the MSI domain for this device, NULL otherwise */ -struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id) +struct irq_domain *iort_get_device_domain(struct device *dev, u32 id, + enum irq_domain_bus_token bus_token) { struct fwnode_handle *handle; int its_id; - if (iort_dev_find_its_id(dev, req_id, 0, &its_id)) + if (iort_dev_find_its_id(dev, id, 0, &its_id)) return NULL; handle = iort_find_domain_token(its_id); if (!handle) return NULL; - return irq_find_matching_fwnode(handle, DOMAIN_BUS_PCI_MSI); + return irq_find_matching_fwnode(handle, bus_token); } static void iort_set_device_domain(struct device *dev, diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 6b43a5455c7a..74a91f52ecc0 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1558,7 +1558,8 @@ struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid); dom = of_msi_map_get_device_domain(&pdev->dev, rid); if (!dom) - dom = iort_get_device_domain(&pdev->dev, rid); + dom = iort_get_device_domain(&pdev->dev, rid, + DOMAIN_BUS_PCI_MSI); return dom; } #endif /* CONFIG_PCI_MSI_IRQ_DOMAIN */ diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 8e7e2ec37f1b..08ec6bd2297f 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -29,7 +29,8 @@ struct fwnode_handle *iort_find_domain_token(int trans_id); #ifdef CONFIG_ACPI_IORT void acpi_iort_init(void); u32 iort_msi_map_rid(struct device *dev, u32 req_id); -struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id); +struct irq_domain *iort_get_device_domain(struct device *dev, u32 id, + enum irq_domain_bus_token bus_token); void acpi_configure_pmsi_domain(struct device *dev); int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id); /* IOMMU interface */ @@ -40,8 +41,8 @@ int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head); static inline void acpi_iort_init(void) { } static inline u32 iort_msi_map_rid(struct device *dev, u32 req_id) { return req_id; } -static inline struct irq_domain *iort_get_device_domain(struct device *dev, - u32 req_id) +static inline struct irq_domain *iort_get_device_domain( + struct device *dev, u32 id, enum irq_domain_bus_token bus_token) { return NULL; } static inline void acpi_configure_pmsi_domain(struct device *dev) { } /* IOMMU interface */ -- cgit v1.2.3 From 39c3cf566ceafa7c1ae331a5f26fbb685d670001 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 19 Jun 2020 09:20:04 +0100 Subject: ACPI/IORT: Make iort_msi_map_rid() PCI agnostic There is nothing PCI specific in iort_msi_map_rid(). Rename the function using a bus protocol agnostic name, iort_msi_map_id(), and convert current callers to it. Signed-off-by: Lorenzo Pieralisi Acked-by: Bjorn Helgaas Cc: Will Deacon Cc: Hanjun Guo Cc: Bjorn Helgaas Cc: Sudeep Holla Cc: Robin Murphy Cc: "Rafael J. 
Wysocki" Link: https://lore.kernel.org/r/20200619082013.13661-4-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/iort.c | 12 ++++++------ drivers/pci/msi.c | 2 +- include/linux/acpi_iort.h | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 902e2aaca946..53f9ef515089 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -568,22 +568,22 @@ static struct acpi_iort_node *iort_find_dev_node(struct device *dev) } /** - * iort_msi_map_rid() - Map a MSI requester ID for a device + * iort_msi_map_id() - Map a MSI input ID for a device * @dev: The device for which the mapping is to be done. - * @req_id: The device requester ID. + * @input_id: The device input ID. * - * Returns: mapped MSI RID on success, input requester ID otherwise + * Returns: mapped MSI ID on success, input ID otherwise */ -u32 iort_msi_map_rid(struct device *dev, u32 req_id) +u32 iort_msi_map_id(struct device *dev, u32 input_id) { struct acpi_iort_node *node; u32 dev_id; node = iort_find_dev_node(dev); if (!node) - return req_id; + return input_id; - iort_node_map_id(node, req_id, &dev_id, IORT_MSI_TYPE); + iort_node_map_id(node, input_id, &dev_id, IORT_MSI_TYPE); return dev_id; } diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 74a91f52ecc0..77f48b95e277 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1536,7 +1536,7 @@ u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev) of_node = irq_domain_get_of_node(domain); rid = of_node ? of_msi_map_rid(&pdev->dev, of_node, rid) : - iort_msi_map_rid(&pdev->dev, rid); + iort_msi_map_id(&pdev->dev, rid); return rid; } diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 08ec6bd2297f..e51425e083da 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -28,7 +28,7 @@ void iort_deregister_domain_token(int trans_id); struct fwnode_handle *iort_find_domain_token(int trans_id); #ifdef CONFIG_ACPI_IORT void acpi_iort_init(void); -u32 iort_msi_map_rid(struct device *dev, u32 req_id); +u32 iort_msi_map_id(struct device *dev, u32 id); struct irq_domain *iort_get_device_domain(struct device *dev, u32 id, enum irq_domain_bus_token bus_token); void acpi_configure_pmsi_domain(struct device *dev); @@ -39,8 +39,8 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev); int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head); #else static inline void acpi_iort_init(void) { } -static inline u32 iort_msi_map_rid(struct device *dev, u32 req_id) -{ return req_id; } +static inline u32 iort_msi_map_id(struct device *dev, u32 id) +{ return id; } static inline struct irq_domain *iort_get_device_domain( struct device *dev, u32 id, enum irq_domain_bus_token bus_token) { return NULL; } -- cgit v1.2.3 From 3a3d208beede7ae03f8c80bed01f47d6b98d4ceb Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 19 Jun 2020 09:20:05 +0100 Subject: ACPI/IORT: Remove useless PCI bus walk The PCI bus domain number (used in the iort_match_node_callback() - pci_domain_nr() call) is cascaded through the PCI bus hierarchy at PCI bus enumeration time, therefore there is no need in iort_find_dev_node() to walk the PCI bus upwards to grab the root bus to be passed to iort_scan_node(), the device->bus PCI bus pointer will do. Remove this useless code. Signed-off-by: Lorenzo Pieralisi Cc: Will Deacon Cc: Hanjun Guo Cc: Sudeep Holla Cc: Robin Murphy Cc: "Rafael J. 
Wysocki" Link: https://lore.kernel.org/r/20200619082013.13661-5-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/iort.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 53f9ef515089..421c6976ab81 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -558,10 +558,7 @@ static struct acpi_iort_node *iort_find_dev_node(struct device *dev) iort_match_node_callback, dev); } - /* Find a PCI root bus */ pbus = to_pci_dev(dev)->bus; - while (!pci_is_root_bus(pbus)) - pbus = pbus->parent; return iort_scan_node(ACPI_IORT_NODE_PCI_ROOT_COMPLEX, iort_match_node_callback, &pbus->dev); -- cgit v1.2.3 From b8e069a2a8da02137605ba585837a3a0c45df01a Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 19 Jun 2020 09:20:06 +0100 Subject: ACPI/IORT: Add an input ID to acpi_dma_configure() Some HW devices are created as child devices of proprietary busses, that have a bus specific policy defining how the child devices wires representing the devices ID are translated into IOMMU and IRQ controllers device IDs. Current IORT code provides translations for: - PCI devices, where the device ID is well identified at bus level as the requester ID (RID) - Platform devices that are endpoint devices where the device ID is retrieved from the ACPI object IORT mappings (Named components single mappings). A platform device is represented in IORT as a named component node For devices that are child devices of proprietary busses the IORT firmware represents the bus node as a named component node in IORT and it is up to that named component node to define in/out bus specific ID translations for the bus child devices that are allocated and created in a bus specific manner. In order to make IORT ID translations available for proprietary bus child devices, the current ACPI (and IORT) code must be augmented to provide an additional ID parameter to acpi_dma_configure() representing the child devices input ID. This ID is bus specific and it is retrieved in bus specific code. By adding an ID parameter to acpi_dma_configure(), the IORT code can map the child device ID to an IOMMU stream ID through the IORT named component representing the bus in/out ID mappings. Signed-off-by: Lorenzo Pieralisi Cc: Will Deacon Cc: Hanjun Guo Cc: Sudeep Holla Cc: Robin Murphy Cc: "Rafael J. 
Wysocki" Link: https://lore.kernel.org/r/20200619082013.13661-6-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/acpi/arm64/iort.c | 59 +++++++++++++++++++++++++++++++++++------------ drivers/acpi/scan.c | 8 ++++--- include/acpi/acpi_bus.h | 9 ++++++-- include/linux/acpi.h | 7 ++++++ include/linux/acpi_iort.h | 7 +++--- 5 files changed, 67 insertions(+), 23 deletions(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 421c6976ab81..ec782e4a0fe4 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -978,19 +978,54 @@ static void iort_named_component_init(struct device *dev, nc->node_flags); } +static int iort_nc_iommu_map(struct device *dev, struct acpi_iort_node *node) +{ + struct acpi_iort_node *parent; + int err = -ENODEV, i = 0; + u32 streamid = 0; + + do { + + parent = iort_node_map_platform_id(node, &streamid, + IORT_IOMMU_TYPE, + i++); + + if (parent) + err = iort_iommu_xlate(dev, parent, streamid); + } while (parent && !err); + + return err; +} + +static int iort_nc_iommu_map_id(struct device *dev, + struct acpi_iort_node *node, + const u32 *in_id) +{ + struct acpi_iort_node *parent; + u32 streamid; + + parent = iort_node_map_id(node, *in_id, &streamid, IORT_IOMMU_TYPE); + if (parent) + return iort_iommu_xlate(dev, parent, streamid); + + return -ENODEV; +} + + /** - * iort_iommu_configure - Set-up IOMMU configuration for a device. + * iort_iommu_configure_id - Set-up IOMMU configuration for a device. * * @dev: device to configure + * @id_in: optional input id const value pointer * * Returns: iommu_ops pointer on configuration success * NULL on configuration failure */ -const struct iommu_ops *iort_iommu_configure(struct device *dev) +const struct iommu_ops *iort_iommu_configure_id(struct device *dev, + const u32 *id_in) { - struct acpi_iort_node *node, *parent; + struct acpi_iort_node *node; const struct iommu_ops *ops; - u32 streamid = 0; int err = -ENODEV; /* @@ -1019,21 +1054,13 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev) if (fwspec && iort_pci_rc_supports_ats(node)) fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS; } else { - int i = 0; - node = iort_scan_node(ACPI_IORT_NODE_NAMED_COMPONENT, iort_match_node_callback, dev); if (!node) return NULL; - do { - parent = iort_node_map_platform_id(node, &streamid, - IORT_IOMMU_TYPE, - i++); - - if (parent) - err = iort_iommu_xlate(dev, parent, streamid); - } while (parent && !err); + err = id_in ? iort_nc_iommu_map_id(dev, node, id_in) : + iort_nc_iommu_map(dev, node); if (!err) iort_named_component_init(dev, node); @@ -1058,6 +1085,7 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev) return ops; } + #else static inline const struct iommu_ops *iort_fwspec_iommu_ops(struct device *dev) { return NULL; } @@ -1066,7 +1094,8 @@ static inline int iort_add_device_replay(const struct iommu_ops *ops, { return 0; } int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head) { return 0; } -const struct iommu_ops *iort_iommu_configure(struct device *dev) +const struct iommu_ops *iort_iommu_configure_id(struct device *dev, + const u32 *input_id) { return NULL; } #endif diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 8777faced51a..2142f1554761 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1457,8 +1457,10 @@ int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset, * acpi_dma_configure - Set-up DMA configuration for the device. 
* @dev: The pointer to the device * @attr: device dma attributes + * @input_id: input device id const value pointer */ -int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) +int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, + const u32 *input_id) { const struct iommu_ops *iommu; u64 dma_addr = 0, size = 0; @@ -1470,7 +1472,7 @@ int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) iort_dma_setup(dev, &dma_addr, &size); - iommu = iort_iommu_configure(dev); + iommu = iort_iommu_configure_id(dev, input_id); if (PTR_ERR(iommu) == -EPROBE_DEFER) return -EPROBE_DEFER; @@ -1479,7 +1481,7 @@ int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) return 0; } -EXPORT_SYMBOL_GPL(acpi_dma_configure); +EXPORT_SYMBOL_GPL(acpi_dma_configure_id); static void acpi_init_coherency(struct acpi_device *adev) { diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 5afb6ceb284f..a3abcc4b7d9f 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -588,8 +588,13 @@ bool acpi_dma_supported(struct acpi_device *adev); enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev); int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset, u64 *size); -int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr); - +int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, + const u32 *input_id); +static inline int acpi_dma_configure(struct device *dev, + enum dev_dma_attr attr) +{ + return acpi_dma_configure_id(dev, attr, NULL); +} struct acpi_device *acpi_find_child_device(struct acpi_device *parent, u64 address, bool check_children); int acpi_is_root_bridge(acpi_handle); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index d661cd0ee64d..6d2c47489d90 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -905,6 +905,13 @@ static inline int acpi_dma_configure(struct device *dev, return 0; } +static inline int acpi_dma_configure_id(struct device *dev, + enum dev_dma_attr attr, + const u32 *input_id) +{ + return 0; +} + #define ACPI_PTR(_ptr) (NULL) static inline void acpi_device_set_enumerated(struct acpi_device *adev) diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index e51425e083da..20a32120bb88 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -35,7 +35,8 @@ void acpi_configure_pmsi_domain(struct device *dev); int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id); /* IOMMU interface */ void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *size); -const struct iommu_ops *iort_iommu_configure(struct device *dev); +const struct iommu_ops *iort_iommu_configure_id(struct device *dev, + const u32 *id_in); int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head); #else static inline void acpi_iort_init(void) { } @@ -48,8 +49,8 @@ static inline void acpi_configure_pmsi_domain(struct device *dev) { } /* IOMMU interface */ static inline void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *size) { } -static inline const struct iommu_ops *iort_iommu_configure( - struct device *dev) +static inline const struct iommu_ops *iort_iommu_configure_id( + struct device *dev, const u32 *id_in) { return NULL; } static inline int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head) -- cgit v1.2.3 From 746a71d02b5d15817fcb13c956ba999a87773952 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 19 Jun 2020 09:20:07 +0100 Subject: of/iommu: Make of_map_rid() PCI agnostic There is nothing PCI 
specific (other than the RID - requester ID) in the of_map_rid() implementation, so the same function can be reused for input/output IDs mapping for other busses just as well. Rename the RID instances/names to a generic "id" tag. No functionality change intended. Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Joerg Roedel Cc: Rob Herring Cc: Joerg Roedel Cc: Robin Murphy Cc: Marc Zyngier Link: https://lore.kernel.org/r/20200619082013.13661-7-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/iommu/of_iommu.c | 4 ++-- drivers/of/base.c | 42 +++++++++++++++++++++--------------------- drivers/of/irq.c | 2 +- include/linux/of.h | 4 ++-- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 20738aacac89..016316244737 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -129,7 +129,7 @@ static int of_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data) struct of_phandle_args iommu_spec = { .args_count = 1 }; int err; - err = of_map_rid(info->np, alias, "iommu-map", "iommu-map-mask", + err = of_map_id(info->np, alias, "iommu-map", "iommu-map-mask", &iommu_spec.np, iommu_spec.args); if (err) return err == -ENODEV ? NO_IOMMU : err; @@ -145,7 +145,7 @@ static int of_fsl_mc_iommu_init(struct fsl_mc_device *mc_dev, struct of_phandle_args iommu_spec = { .args_count = 1 }; int err; - err = of_map_rid(master_np, mc_dev->icid, "iommu-map", + err = of_map_id(master_np, mc_dev->icid, "iommu-map", "iommu-map-mask", &iommu_spec.np, iommu_spec.args); if (err) diff --git a/drivers/of/base.c b/drivers/of/base.c index ae03b1218b06..ea44fea99813 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -2201,15 +2201,15 @@ int of_find_last_cache_level(unsigned int cpu) } /** - * of_map_rid - Translate a requester ID through a downstream mapping. + * of_map_id - Translate an ID through a downstream mapping. * @np: root complex device node. - * @rid: device requester ID to map. + * @id: device ID to map. * @map_name: property name of the map to use. * @map_mask_name: optional property name of the mask to use. * @target: optional pointer to a target device node. * @id_out: optional pointer to receive the translated ID. * - * Given a device requester ID, look up the appropriate implementation-defined + * Given a device ID, look up the appropriate implementation-defined * platform ID and/or the target device which receives transactions on that * ID, as per the "iommu-map" and "msi-map" bindings. Either of @target or * @id_out may be NULL if only the other is required. If @target points to @@ -2219,11 +2219,11 @@ int of_find_last_cache_level(unsigned int cpu) * * Return: 0 on success or a standard error code on failure. 
*/ -int of_map_rid(struct device_node *np, u32 rid, +int of_map_id(struct device_node *np, u32 id, const char *map_name, const char *map_mask_name, struct device_node **target, u32 *id_out) { - u32 map_mask, masked_rid; + u32 map_mask, masked_id; int map_len; const __be32 *map = NULL; @@ -2235,7 +2235,7 @@ int of_map_rid(struct device_node *np, u32 rid, if (target) return -ENODEV; /* Otherwise, no map implies no translation */ - *id_out = rid; + *id_out = id; return 0; } @@ -2255,22 +2255,22 @@ int of_map_rid(struct device_node *np, u32 rid, if (map_mask_name) of_property_read_u32(np, map_mask_name, &map_mask); - masked_rid = map_mask & rid; + masked_id = map_mask & id; for ( ; map_len > 0; map_len -= 4 * sizeof(*map), map += 4) { struct device_node *phandle_node; - u32 rid_base = be32_to_cpup(map + 0); + u32 id_base = be32_to_cpup(map + 0); u32 phandle = be32_to_cpup(map + 1); u32 out_base = be32_to_cpup(map + 2); - u32 rid_len = be32_to_cpup(map + 3); + u32 id_len = be32_to_cpup(map + 3); - if (rid_base & ~map_mask) { - pr_err("%pOF: Invalid %s translation - %s-mask (0x%x) ignores rid-base (0x%x)\n", + if (id_base & ~map_mask) { + pr_err("%pOF: Invalid %s translation - %s-mask (0x%x) ignores id-base (0x%x)\n", np, map_name, map_name, - map_mask, rid_base); + map_mask, id_base); return -EFAULT; } - if (masked_rid < rid_base || masked_rid >= rid_base + rid_len) + if (masked_id < id_base || masked_id >= id_base + id_len) continue; phandle_node = of_find_node_by_phandle(phandle); @@ -2288,20 +2288,20 @@ int of_map_rid(struct device_node *np, u32 rid, } if (id_out) - *id_out = masked_rid - rid_base + out_base; + *id_out = masked_id - id_base + out_base; - pr_debug("%pOF: %s, using mask %08x, rid-base: %08x, out-base: %08x, length: %08x, rid: %08x -> %08x\n", - np, map_name, map_mask, rid_base, out_base, - rid_len, rid, masked_rid - rid_base + out_base); + pr_debug("%pOF: %s, using mask %08x, id-base: %08x, out-base: %08x, length: %08x, id: %08x -> %08x\n", + np, map_name, map_mask, id_base, out_base, + id_len, id, masked_id - id_base + out_base); return 0; } - pr_info("%pOF: no %s translation for rid 0x%x on %pOF\n", np, map_name, - rid, target && *target ? *target : NULL); + pr_info("%pOF: no %s translation for id 0x%x on %pOF\n", np, map_name, + id, target && *target ? *target : NULL); /* Bypasses translation */ if (id_out) - *id_out = rid; + *id_out = id; return 0; } -EXPORT_SYMBOL_GPL(of_map_rid); +EXPORT_SYMBOL_GPL(of_map_id); diff --git a/drivers/of/irq.c b/drivers/of/irq.c index a296eaf52a5b..d632bc5b3a2d 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -587,7 +587,7 @@ static u32 __of_msi_map_rid(struct device *dev, struct device_node **np, * "msi-map" property. 
*/ for (parent_dev = dev; parent_dev; parent_dev = parent_dev->parent) - if (!of_map_rid(parent_dev->of_node, rid_in, "msi-map", + if (!of_map_id(parent_dev->of_node, rid_in, "msi-map", "msi-map-mask", np, &rid_out)) break; return rid_out; diff --git a/include/linux/of.h b/include/linux/of.h index c669c0a4732f..60abe3f636ad 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -554,7 +554,7 @@ bool of_console_check(struct device_node *dn, char *name, int index); extern int of_cpu_node_to_id(struct device_node *np); -int of_map_rid(struct device_node *np, u32 rid, +int of_map_id(struct device_node *np, u32 id, const char *map_name, const char *map_mask_name, struct device_node **target, u32 *id_out); @@ -978,7 +978,7 @@ static inline int of_cpu_node_to_id(struct device_node *np) return -ENODEV; } -static inline int of_map_rid(struct device_node *np, u32 rid, +static inline int of_map_id(struct device_node *np, u32 id, const char *map_name, const char *map_mask_name, struct device_node **target, u32 *id_out) { -- cgit v1.2.3 From a081bd4af4ce80d845a0bab355ab5d0822db8058 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 19 Jun 2020 09:20:08 +0100 Subject: of/device: Add input id to of_dma_configure() Devices sitting on proprietary busses have a device ID space that is owned by the respective bus and related firmware bindings. In order to let the generic OF layer handle the input translations to an IOMMU id, for such busses the current of_dma_configure() interface should be extended in order to allow the bus layer to provide the device input id parameter - that is retrieved/assigned in bus specific code and firmware. Augment of_dma_configure() to add an optional input_id parameter, leaving current functionality unchanged. Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Cc: Rob Herring Cc: Robin Murphy Cc: Joerg Roedel Cc: Laurentiu Tudor Link: https://lore.kernel.org/r/20200619082013.13661-8-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/bus/fsl-mc/fsl-mc-bus.c | 4 +- drivers/iommu/of_iommu.c | 81 ++++++++++++++++++++++------------------- drivers/of/device.c | 8 ++-- include/linux/of_device.h | 16 +++++++- include/linux/of_iommu.h | 6 ++- 5 files changed, 70 insertions(+), 45 deletions(-) diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c index 40526da5c6a6..8ead3f0238f2 100644 --- a/drivers/bus/fsl-mc/fsl-mc-bus.c +++ b/drivers/bus/fsl-mc/fsl-mc-bus.c @@ -118,11 +118,13 @@ static int fsl_mc_bus_uevent(struct device *dev, struct kobj_uevent_env *env) static int fsl_mc_dma_configure(struct device *dev) { struct device *dma_dev = dev; + struct fsl_mc_device *mc_dev = to_fsl_mc_device(dev); + u32 input_id = mc_dev->icid; while (dev_is_fsl_mc(dma_dev)) dma_dev = dma_dev->parent; - return of_dma_configure(dev, dma_dev->of_node, 0); + return of_dma_configure_id(dev, dma_dev->of_node, 0, &input_id); } static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 016316244737..e505b9130a1c 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -118,46 +118,66 @@ static int of_iommu_xlate(struct device *dev, return ret; } -struct of_pci_iommu_alias_info { - struct device *dev; - struct device_node *np; -}; - -static int of_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data) +static int of_iommu_configure_dev_id(struct device_node *master_np, + struct device *dev, + const u32 *id) { - struct of_pci_iommu_alias_info 
*info = data; struct of_phandle_args iommu_spec = { .args_count = 1 }; int err; - err = of_map_id(info->np, alias, "iommu-map", "iommu-map-mask", - &iommu_spec.np, iommu_spec.args); + err = of_map_id(master_np, *id, "iommu-map", + "iommu-map-mask", &iommu_spec.np, + iommu_spec.args); if (err) return err == -ENODEV ? NO_IOMMU : err; - err = of_iommu_xlate(info->dev, &iommu_spec); + err = of_iommu_xlate(dev, &iommu_spec); of_node_put(iommu_spec.np); return err; } -static int of_fsl_mc_iommu_init(struct fsl_mc_device *mc_dev, - struct device_node *master_np) +static int of_iommu_configure_dev(struct device_node *master_np, + struct device *dev) { - struct of_phandle_args iommu_spec = { .args_count = 1 }; - int err; - - err = of_map_id(master_np, mc_dev->icid, "iommu-map", - "iommu-map-mask", &iommu_spec.np, - iommu_spec.args); - if (err) - return err == -ENODEV ? NO_IOMMU : err; + struct of_phandle_args iommu_spec; + int err = NO_IOMMU, idx = 0; + + while (!of_parse_phandle_with_args(master_np, "iommus", + "#iommu-cells", + idx, &iommu_spec)) { + err = of_iommu_xlate(dev, &iommu_spec); + of_node_put(iommu_spec.np); + idx++; + if (err) + break; + } - err = of_iommu_xlate(&mc_dev->dev, &iommu_spec); - of_node_put(iommu_spec.np); return err; } +struct of_pci_iommu_alias_info { + struct device *dev; + struct device_node *np; +}; + +static int of_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data) +{ + struct of_pci_iommu_alias_info *info = data; + u32 input_id = alias; + + return of_iommu_configure_dev_id(info->np, info->dev, &input_id); +} + +static int of_iommu_configure_device(struct device_node *master_np, + struct device *dev, const u32 *id) +{ + return (id) ? of_iommu_configure_dev_id(master_np, dev, id) : + of_iommu_configure_dev(master_np, dev); +} + const struct iommu_ops *of_iommu_configure(struct device *dev, - struct device_node *master_np) + struct device_node *master_np, + const u32 *id) { const struct iommu_ops *ops = NULL; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); @@ -188,21 +208,8 @@ const struct iommu_ops *of_iommu_configure(struct device *dev, pci_request_acs(); err = pci_for_each_dma_alias(to_pci_dev(dev), of_pci_iommu_init, &info); - } else if (dev_is_fsl_mc(dev)) { - err = of_fsl_mc_iommu_init(to_fsl_mc_device(dev), master_np); } else { - struct of_phandle_args iommu_spec; - int idx = 0; - - while (!of_parse_phandle_with_args(master_np, "iommus", - "#iommu-cells", - idx, &iommu_spec)) { - err = of_iommu_xlate(dev, &iommu_spec); - of_node_put(iommu_spec.np); - idx++; - if (err) - break; - } + err = of_iommu_configure_device(master_np, dev, id); fwspec = dev_iommu_fwspec_get(dev); if (!err && fwspec) diff --git a/drivers/of/device.c b/drivers/of/device.c index 27203bfd0b22..b439c1e05434 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -78,6 +78,7 @@ int of_device_add(struct platform_device *ofdev) * @np: Pointer to OF node having DMA configuration * @force_dma: Whether device is to be set up by of_dma_configure() even if * DMA capability is not explicitly described by firmware. + * @id: Optional const pointer value input id * * Try to get devices's DMA configuration from DT and update it * accordingly. @@ -86,7 +87,8 @@ int of_device_add(struct platform_device *ofdev) * can use a platform bus notifier and handle BUS_NOTIFY_ADD_DEVICE events * to fix up DMA configuration. 
*/ -int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma) +int of_dma_configure_id(struct device *dev, struct device_node *np, + bool force_dma, const u32 *id) { u64 dma_addr, paddr, size = 0; int ret; @@ -160,7 +162,7 @@ int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma) dev_dbg(dev, "device is%sdma coherent\n", coherent ? " " : " not "); - iommu = of_iommu_configure(dev, np); + iommu = of_iommu_configure(dev, np, id); if (PTR_ERR(iommu) == -EPROBE_DEFER) return -EPROBE_DEFER; @@ -171,7 +173,7 @@ int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma) return 0; } -EXPORT_SYMBOL_GPL(of_dma_configure); +EXPORT_SYMBOL_GPL(of_dma_configure_id); int of_device_register(struct platform_device *pdev) { diff --git a/include/linux/of_device.h b/include/linux/of_device.h index 8d31e39dd564..07ca187fc5e4 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -55,9 +55,15 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) return of_node_get(cpu_dev->of_node); } -int of_dma_configure(struct device *dev, +int of_dma_configure_id(struct device *dev, struct device_node *np, - bool force_dma); + bool force_dma, const u32 *id); +static inline int of_dma_configure(struct device *dev, + struct device_node *np, + bool force_dma) +{ + return of_dma_configure_id(dev, np, force_dma, NULL); +} #else /* CONFIG_OF */ static inline int of_driver_match_device(struct device *dev, @@ -106,6 +112,12 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) return NULL; } +static inline int of_dma_configure_id(struct device *dev, + struct device_node *np, + bool force_dma) +{ + return 0; +} static inline int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma) diff --git a/include/linux/of_iommu.h b/include/linux/of_iommu.h index f3d40dd7bb66..16f4b3e87f20 100644 --- a/include/linux/of_iommu.h +++ b/include/linux/of_iommu.h @@ -13,7 +13,8 @@ extern int of_get_dma_window(struct device_node *dn, const char *prefix, size_t *size); extern const struct iommu_ops *of_iommu_configure(struct device *dev, - struct device_node *master_np); + struct device_node *master_np, + const u32 *id); #else @@ -25,7 +26,8 @@ static inline int of_get_dma_window(struct device_node *dn, const char *prefix, } static inline const struct iommu_ops *of_iommu_configure(struct device *dev, - struct device_node *master_np) + struct device_node *master_np, + const u32 *id) { return NULL; } -- cgit v1.2.3 From 5bda70c6162de9536cc983eacd24261c9c5de596 Mon Sep 17 00:00:00 2001 From: Laurentiu Tudor Date: Fri, 19 Jun 2020 09:20:09 +0100 Subject: dt-bindings: arm: fsl: Add msi-map device-tree binding for fsl-mc bus The existing bindings cannot be used to specify the relationship between fsl-mc devices and GIC ITSes. Add a generic binding for mapping fsl-mc devices to GIC ITSes, using msi-map property. In addition, deprecate msi-parent property which no longer makes sense now that we support translating the MSIs. 
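For illustration only (not part of the patch): the msi-map tuples described above are consumed through the generic of_map_id() helper seen earlier in this series, so translating one (icid-base, gic-its, msi-base, length) entry reduces to the arithmetic sketched below (msi-map-mask handling omitted; the function and parameter names are invented for this sketch).

#include <linux/types.h>

/* Hypothetical sketch of how a single msi-map entry maps an ICID,
 * mirroring the of_map_id() translation logic. */
static u32 example_translate_icid(u32 icid, u32 icid_base, u32 msi_base,
				  u32 length)
{
	if (icid < icid_base || icid >= icid_base + length)
		return icid;		/* entry does not cover this ICID */

	return icid - icid_base + msi_base;
}

With the msi-map example in the binding below, <23 &its 23 41>, an ICID of 24 therefore comes out unchanged as msi-specifier 24.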
Signed-off-by: Laurentiu Tudor Signed-off-by: Diana Craciun Reviewed-by: Rob Herring Cc: Rob Herring Link: https://lore.kernel.org/r/20200619082013.13661-9-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- .../devicetree/bindings/misc/fsl,qoriq-mc.txt | 50 +++++++++++++++++++--- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt b/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt index 9134e9bcca56..ebd329181c14 100644 --- a/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt +++ b/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt @@ -28,6 +28,16 @@ Documentation/devicetree/bindings/iommu/iommu.txt. For arm-smmu binding, see: Documentation/devicetree/bindings/iommu/arm,smmu.yaml. +The MSI writes are accompanied by sideband data which is derived from the ICID. +The msi-map property is used to associate the devices with both the ITS +controller and the sideband data which accompanies the writes. + +For generic MSI bindings, see +Documentation/devicetree/bindings/interrupt-controller/msi.txt. + +For GICv3 and GIC ITS bindings, see: +Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml. + Required properties: - compatible @@ -49,11 +59,6 @@ Required properties: region may not be present in some scenarios, such as in the device tree presented to a virtual machine. - - msi-parent - Value type: - Definition: Must be present and point to the MSI controller node - handling message interrupts for the MC. - - ranges Value type: Definition: A standard property. Defines the mapping between the child @@ -119,6 +124,28 @@ Optional properties: associated with the listed IOMMU, with the iommu-specifier (i - icid-base + iommu-base). +- msi-map: Maps an ICID to a GIC ITS and associated msi-specifier + data. + + The property is an arbitrary number of tuples of + (icid-base,gic-its,msi-base,length). + + Any ICID in the interval [icid-base, icid-base + length) is + associated with the listed GIC ITS, with the msi-specifier + (i - icid-base + msi-base). + +Deprecated properties: + + - msi-parent + Value type: + Definition: Describes the MSI controller node handling message + interrupts for the MC. When there is no translation + between the ICID and deviceID this property can be used + to describe the MSI controller used by the devices on the + mc-bus. + The use of this property for mc-bus is deprecated. Please + use msi-map. + Example: smmu: iommu@5000000 { @@ -128,13 +155,24 @@ Example: ... }; + gic: interrupt-controller@6000000 { + compatible = "arm,gic-v3"; + ... + } + its: gic-its@6020000 { + compatible = "arm,gic-v3-its"; + msi-controller; + ... + }; + fsl_mc: fsl-mc@80c000000 { compatible = "fsl,qoriq-mc"; reg = <0x00000008 0x0c000000 0 0x40>, /* MC portal base */ <0x00000000 0x08340000 0 0x40000>; /* MC control reg */ - msi-parent = <&its>; /* define map for ICIDs 23-64 */ iommu-map = <23 &smmu 23 41>; + /* define msi map for ICIDs 23-64 */ + msi-map = <23 &its 23 41>; #address-cells = <3>; #size-cells = <1>; -- cgit v1.2.3 From 6f881aba01109a01a43e4f135673c19190f61133 Mon Sep 17 00:00:00 2001 From: Diana Craciun Date: Fri, 19 Jun 2020 09:20:10 +0100 Subject: of/irq: make of_msi_map_get_device_domain() bus agnostic of_msi_map_get_device_domain() is PCI specific but it need not be and can be easily changed to be bus agnostic in order to be used by other busses by adding an IRQ domain bus token as an input parameter. 
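For illustration only (not part of the patch), a non-PCI caller can now pass its own bus token; a minimal hypothetical sketch using the fsl-mc token that later patches in this series rely on:

#include <linux/device.h>
#include <linux/irqdomain.h>
#include <linux/of_irq.h>

/* Hypothetical helper: resolve the MSI domain for a device identified
 * by a bus-specific ID (e.g. an fsl-mc ICID). */
static struct irq_domain *example_find_msi_domain(struct device *dev, u32 id)
{
	return of_msi_map_get_device_domain(dev, id, DOMAIN_BUS_FSL_MC_MSI);
}

PCI keeps its existing behaviour by passing DOMAIN_BUS_PCI_MSI explicitly, as the pci/msi.c hunk below shows.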
Signed-off-by: Diana Craciun Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Acked-by: Bjorn Helgaas # pci/msi.c Cc: Bjorn Helgaas Cc: Rob Herring Cc: Marc Zyngier Link: https://lore.kernel.org/r/20200619082013.13661-10-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/of/irq.c | 8 +++++--- drivers/pci/msi.c | 2 +- include/linux/of_irq.h | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/of/irq.c b/drivers/of/irq.c index d632bc5b3a2d..1005e4f349ef 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -613,18 +613,20 @@ u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in) * of_msi_map_get_device_domain - Use msi-map to find the relevant MSI domain * @dev: device for which the mapping is to be done. * @rid: Requester ID for the device. + * @bus_token: Bus token * * Walk up the device hierarchy looking for devices with a "msi-map" * property. * * Returns: the MSI domain for this device (or NULL on failure) */ -struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 rid) +struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 id, + u32 bus_token) { struct device_node *np = NULL; - __of_msi_map_rid(dev, &np, rid); - return irq_find_matching_host(np, DOMAIN_BUS_PCI_MSI); + __of_msi_map_rid(dev, &np, id); + return irq_find_matching_host(np, bus_token); } /** diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 77f48b95e277..b4bfe0b03b2d 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1556,7 +1556,7 @@ struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev) u32 rid = pci_dev_id(pdev); pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid); - dom = of_msi_map_get_device_domain(&pdev->dev, rid); + dom = of_msi_map_get_device_domain(&pdev->dev, rid, DOMAIN_BUS_PCI_MSI); if (!dom) dom = iort_get_device_domain(&pdev->dev, rid, DOMAIN_BUS_PCI_MSI); diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 1214cabb2247..7142a3722758 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -52,7 +52,8 @@ extern struct irq_domain *of_msi_get_domain(struct device *dev, struct device_node *np, enum irq_domain_bus_token token); extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev, - u32 rid); + u32 id, + u32 bus_token); extern void of_msi_configure(struct device *dev, struct device_node *np); u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in); #else @@ -85,7 +86,7 @@ static inline struct irq_domain *of_msi_get_domain(struct device *dev, return NULL; } static inline struct irq_domain *of_msi_map_get_device_domain(struct device *dev, - u32 rid) + u32 id, u32 bus_token) { return NULL; } -- cgit v1.2.3 From 2bcdd8f2c07f1aa1bfd34fa0dab8e06949e34846 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Fri, 19 Jun 2020 09:20:11 +0100 Subject: of/irq: Make of_msi_map_rid() PCI bus agnostic There is nothing PCI bus specific in the of_msi_map_rid() implementation other than the requester ID tag for the input ID space. Rename requester ID to a more generic ID so that the translation code can be used by all busses that require input/output ID translations. No functional change intended. 
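For illustration only (not part of the patch), after the rename the same helper can map any bus-specific ID; a minimal hypothetical sketch for an fsl-mc style ICID:

#include <linux/of_irq.h>

/* Hypothetical wrapper: map a raw device ID (an ICID here) through the
 * "msi-map" of the given MSI controller node. */
static u32 example_map_msi_id(struct device *dev, struct device_node *msi_np,
			      u32 icid)
{
	return of_msi_map_id(dev, msi_np, icid);
}

If no msi-map entry matches, the input ID is returned unchanged, exactly as before the rename.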
Signed-off-by: Lorenzo Pieralisi Reviewed-by: Rob Herring Cc: Bjorn Helgaas Cc: Rob Herring Cc: Marc Zyngier Link: https://lore.kernel.org/r/20200619082013.13661-11-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/of/irq.c | 28 ++++++++++++++-------------- drivers/pci/msi.c | 2 +- include/linux/of_irq.h | 8 ++++---- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 1005e4f349ef..25d17b8a1a1a 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -576,43 +576,43 @@ err: } } -static u32 __of_msi_map_rid(struct device *dev, struct device_node **np, - u32 rid_in) +static u32 __of_msi_map_id(struct device *dev, struct device_node **np, + u32 id_in) { struct device *parent_dev; - u32 rid_out = rid_in; + u32 id_out = id_in; /* * Walk up the device parent links looking for one with a * "msi-map" property. */ for (parent_dev = dev; parent_dev; parent_dev = parent_dev->parent) - if (!of_map_id(parent_dev->of_node, rid_in, "msi-map", - "msi-map-mask", np, &rid_out)) + if (!of_map_id(parent_dev->of_node, id_in, "msi-map", + "msi-map-mask", np, &id_out)) break; - return rid_out; + return id_out; } /** - * of_msi_map_rid - Map a MSI requester ID for a device. + * of_msi_map_id - Map a MSI ID for a device. * @dev: device for which the mapping is to be done. * @msi_np: device node of the expected msi controller. - * @rid_in: unmapped MSI requester ID for the device. + * @id_in: unmapped MSI ID for the device. * * Walk up the device hierarchy looking for devices with a "msi-map" - * property. If found, apply the mapping to @rid_in. + * property. If found, apply the mapping to @id_in. * - * Returns the mapped MSI requester ID. + * Returns the mapped MSI ID. */ -u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in) +u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in) { - return __of_msi_map_rid(dev, &msi_np, rid_in); + return __of_msi_map_id(dev, &msi_np, id_in); } /** * of_msi_map_get_device_domain - Use msi-map to find the relevant MSI domain * @dev: device for which the mapping is to be done. - * @rid: Requester ID for the device. + * @id: Device ID. * @bus_token: Bus token * * Walk up the device hierarchy looking for devices with a "msi-map" @@ -625,7 +625,7 @@ struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 id, { struct device_node *np = NULL; - __of_msi_map_rid(dev, &np, id); + __of_msi_map_id(dev, &np, id); return irq_find_matching_host(np, bus_token); } diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index b4bfe0b03b2d..19aeadb22f11 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -1535,7 +1535,7 @@ u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev) pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid); of_node = irq_domain_get_of_node(domain); - rid = of_node ? of_msi_map_rid(&pdev->dev, of_node, rid) : + rid = of_node ? 
of_msi_map_id(&pdev->dev, of_node, rid) : iort_msi_map_id(&pdev->dev, rid); return rid; diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 7142a3722758..e8b78139f78c 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -55,7 +55,7 @@ extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 id, u32 bus_token); extern void of_msi_configure(struct device *dev, struct device_node *np); -u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in); +u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in); #else static inline int of_irq_count(struct device_node *dev) { @@ -93,10 +93,10 @@ static inline struct irq_domain *of_msi_map_get_device_domain(struct device *dev static inline void of_msi_configure(struct device *dev, struct device_node *np) { } -static inline u32 of_msi_map_rid(struct device *dev, - struct device_node *msi_np, u32 rid_in) +static inline u32 of_msi_map_id(struct device *dev, + struct device_node *msi_np, u32 id_in) { - return rid_in; + return id_in; } #endif -- cgit v1.2.3 From 998fb7badf0362a2057694878098642ef363d899 Mon Sep 17 00:00:00 2001 From: Diana Craciun Date: Fri, 19 Jun 2020 09:20:12 +0100 Subject: bus/fsl-mc: Refactor the MSI domain creation in the DPRC driver The DPRC driver is not taking into account the msi-map property and assumes that the icid is the same as the stream ID. Although this assumption is correct, generalize the code to include a translation between icid and streamID. Furthermore do not just copy the MSI domain from parent (for child containers), but use the information provided by the msi-map property. If the msi-map property is missing from the device tree retain the old behaviour for backward compatibility ie the child DPRC objects inherit the MSI domain from the parent. 
Signed-off-by: Diana Craciun Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20200619082013.13661-12-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/bus/fsl-mc/dprc-driver.c | 31 ++++++++--------------------- drivers/bus/fsl-mc/fsl-mc-bus.c | 4 ++-- drivers/bus/fsl-mc/fsl-mc-msi.c | 31 ++++++++++++++++++----------- drivers/bus/fsl-mc/fsl-mc-private.h | 6 ++++-- drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c | 15 +++++++++++++- 5 files changed, 47 insertions(+), 40 deletions(-) diff --git a/drivers/bus/fsl-mc/dprc-driver.c b/drivers/bus/fsl-mc/dprc-driver.c index c8b1c3842c1a..189bff2115a8 100644 --- a/drivers/bus/fsl-mc/dprc-driver.c +++ b/drivers/bus/fsl-mc/dprc-driver.c @@ -592,6 +592,7 @@ static int dprc_probe(struct fsl_mc_device *mc_dev) bool mc_io_created = false; bool msi_domain_set = false; u16 major_ver, minor_ver; + struct irq_domain *mc_msi_domain; if (!is_fsl_mc_bus_dprc(mc_dev)) return -EINVAL; @@ -621,31 +622,15 @@ static int dprc_probe(struct fsl_mc_device *mc_dev) return error; mc_io_created = true; + } - /* - * Inherit parent MSI domain: - */ - dev_set_msi_domain(&mc_dev->dev, - dev_get_msi_domain(parent_dev)); - msi_domain_set = true; + mc_msi_domain = fsl_mc_find_msi_domain(&mc_dev->dev); + if (!mc_msi_domain) { + dev_warn(&mc_dev->dev, + "WARNING: MC bus without interrupt support\n"); } else { - /* - * This is a root DPRC - */ - struct irq_domain *mc_msi_domain; - - if (dev_is_fsl_mc(parent_dev)) - return -EINVAL; - - error = fsl_mc_find_msi_domain(parent_dev, - &mc_msi_domain); - if (error < 0) { - dev_warn(&mc_dev->dev, - "WARNING: MC bus without interrupt support\n"); - } else { - dev_set_msi_domain(&mc_dev->dev, mc_msi_domain); - msi_domain_set = true; - } + dev_set_msi_domain(&mc_dev->dev, mc_msi_domain); + msi_domain_set = true; } error = dprc_open(mc_dev->mc_io, 0, mc_dev->obj_desc.id, diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c index 8ead3f0238f2..824ff77bbe86 100644 --- a/drivers/bus/fsl-mc/fsl-mc-bus.c +++ b/drivers/bus/fsl-mc/fsl-mc-bus.c @@ -370,8 +370,8 @@ EXPORT_SYMBOL_GPL(fsl_mc_get_version); /** * fsl_mc_get_root_dprc - function to traverse to the root dprc */ -static void fsl_mc_get_root_dprc(struct device *dev, - struct device **root_dprc_dev) +void fsl_mc_get_root_dprc(struct device *dev, + struct device **root_dprc_dev) { if (!dev) { *root_dprc_dev = NULL; diff --git a/drivers/bus/fsl-mc/fsl-mc-msi.c b/drivers/bus/fsl-mc/fsl-mc-msi.c index 8b9c66d7c4ff..e7bbff445a83 100644 --- a/drivers/bus/fsl-mc/fsl-mc-msi.c +++ b/drivers/bus/fsl-mc/fsl-mc-msi.c @@ -177,23 +177,30 @@ struct irq_domain *fsl_mc_msi_create_irq_domain(struct fwnode_handle *fwnode, return domain; } -int fsl_mc_find_msi_domain(struct device *mc_platform_dev, - struct irq_domain **mc_msi_domain) +struct irq_domain *fsl_mc_find_msi_domain(struct device *dev) { - struct irq_domain *msi_domain; - struct device_node *mc_of_node = mc_platform_dev->of_node; + struct irq_domain *msi_domain = NULL; + struct fsl_mc_device *mc_dev = to_fsl_mc_device(dev); - msi_domain = of_msi_get_domain(mc_platform_dev, mc_of_node, - DOMAIN_BUS_FSL_MC_MSI); - if (!msi_domain) { - pr_err("Unable to find fsl-mc MSI domain for %pOF\n", - mc_of_node); + msi_domain = of_msi_map_get_device_domain(dev, mc_dev->icid, + DOMAIN_BUS_FSL_MC_MSI); - return -ENOENT; + /* + * if the msi-map property is missing assume that all the + * child containers inherit the domain from the parent + */ + if (!msi_domain) { + struct device *root_dprc_dev; + struct device *bus_dev; + 
+ fsl_mc_get_root_dprc(dev, &root_dprc_dev); + bus_dev = root_dprc_dev->parent; + msi_domain = of_msi_get_domain(bus_dev, + bus_dev->of_node, + DOMAIN_BUS_FSL_MC_MSI); } - *mc_msi_domain = msi_domain; - return 0; + return msi_domain; } static void fsl_mc_msi_free_descs(struct device *dev) diff --git a/drivers/bus/fsl-mc/fsl-mc-private.h b/drivers/bus/fsl-mc/fsl-mc-private.h index 21ca8c756ee7..7a46a12eb747 100644 --- a/drivers/bus/fsl-mc/fsl-mc-private.h +++ b/drivers/bus/fsl-mc/fsl-mc-private.h @@ -595,8 +595,7 @@ int fsl_mc_msi_domain_alloc_irqs(struct device *dev, void fsl_mc_msi_domain_free_irqs(struct device *dev); -int fsl_mc_find_msi_domain(struct device *mc_platform_dev, - struct irq_domain **mc_msi_domain); +struct irq_domain *fsl_mc_find_msi_domain(struct device *dev); int fsl_mc_populate_irq_pool(struct fsl_mc_bus *mc_bus, unsigned int irq_count); @@ -613,6 +612,9 @@ void fsl_destroy_mc_io(struct fsl_mc_io *mc_io); bool fsl_mc_is_root_dprc(struct device *dev); +void fsl_mc_get_root_dprc(struct device *dev, + struct device **root_dprc_dev); + struct fsl_mc_device *fsl_mc_device_lookup(struct fsl_mc_obj_desc *obj_desc, struct fsl_mc_device *mc_bus_dev); diff --git a/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c b/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c index 606efa64adff..a5c8d577e424 100644 --- a/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c +++ b/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c @@ -23,6 +23,18 @@ static struct irq_chip its_msi_irq_chip = { .irq_set_affinity = msi_domain_set_affinity }; +static u32 fsl_mc_msi_domain_get_msi_id(struct irq_domain *domain, + struct fsl_mc_device *mc_dev) +{ + struct device_node *of_node; + u32 out_id; + + of_node = irq_domain_get_of_node(domain); + out_id = of_msi_map_id(&mc_dev->dev, of_node, mc_dev->icid); + + return out_id; +} + static int its_fsl_mc_msi_prepare(struct irq_domain *msi_domain, struct device *dev, int nvec, msi_alloc_info_t *info) @@ -43,7 +55,8 @@ static int its_fsl_mc_msi_prepare(struct irq_domain *msi_domain, * NOTE: This device id corresponds to the IOMMU stream ID * associated with the DPRC object (ICID). */ - info->scratchpad[0].ul = mc_bus_dev->icid; + info->scratchpad[0].ul = fsl_mc_msi_domain_get_msi_id(msi_domain, + mc_bus_dev); msi_info = msi_get_domain_info(msi_domain->parent); /* Allocate at least 32 MSIs, and always as a power of 2 */ -- cgit v1.2.3 From 6305166c8771c33a8d5992fb53f93cfecedc14fd Mon Sep 17 00:00:00 2001 From: Makarand Pawagi Date: Fri, 19 Jun 2020 09:20:13 +0100 Subject: bus: fsl-mc: Add ACPI support for fsl-mc Add ACPI support in the fsl-mc driver. Driver parses MC DSDT table to extract memory and other resources. Interrupt (GIC ITS) information is extracted from the MADT table by drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c. IORT table is parsed to configure DMA. 
Signed-off-by: Makarand Pawagi Signed-off-by: Diana Craciun Signed-off-by: Laurentiu Tudor Link: https://lore.kernel.org/r/20200619082013.13661-13-lorenzo.pieralisi@arm.com Signed-off-by: Catalin Marinas --- drivers/bus/fsl-mc/fsl-mc-bus.c | 73 +++++++++++++++++------ drivers/bus/fsl-mc/fsl-mc-msi.c | 37 +++++++----- drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c | 92 +++++++++++++++++++++++------ 3 files changed, 150 insertions(+), 52 deletions(-) diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c index 824ff77bbe86..324d49d6df89 100644 --- a/drivers/bus/fsl-mc/fsl-mc-bus.c +++ b/drivers/bus/fsl-mc/fsl-mc-bus.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include "fsl-mc-private.h" @@ -38,6 +40,7 @@ struct fsl_mc { struct fsl_mc_device *root_mc_bus_dev; u8 num_translation_ranges; struct fsl_mc_addr_translation_range *translation_ranges; + void *fsl_mc_regs; }; /** @@ -56,6 +59,10 @@ struct fsl_mc_addr_translation_range { phys_addr_t start_phys_addr; }; +#define FSL_MC_FAPR 0x28 +#define MC_FAPR_PL BIT(18) +#define MC_FAPR_BMT BIT(17) + /** * fsl_mc_bus_match - device to driver matching callback * @dev: the fsl-mc device to match against @@ -124,7 +131,10 @@ static int fsl_mc_dma_configure(struct device *dev) while (dev_is_fsl_mc(dma_dev)) dma_dev = dma_dev->parent; - return of_dma_configure_id(dev, dma_dev->of_node, 0, &input_id); + if (dev_of_node(dma_dev)) + return of_dma_configure_id(dev, dma_dev->of_node, 0, &input_id); + + return acpi_dma_configure_id(dev, DEV_DMA_COHERENT, &input_id); } static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, @@ -865,8 +875,11 @@ static int fsl_mc_bus_probe(struct platform_device *pdev) struct fsl_mc_io *mc_io = NULL; int container_id; phys_addr_t mc_portal_phys_addr; - u32 mc_portal_size; - struct resource res; + u32 mc_portal_size, mc_stream_id; + struct resource *plat_res; + + if (!iommu_present(&fsl_mc_bus_type)) + return -EPROBE_DEFER; mc = devm_kzalloc(&pdev->dev, sizeof(*mc), GFP_KERNEL); if (!mc) @@ -874,19 +887,33 @@ static int fsl_mc_bus_probe(struct platform_device *pdev) platform_set_drvdata(pdev, mc); + plat_res = platform_get_resource(pdev, IORESOURCE_MEM, 1); + mc->fsl_mc_regs = devm_ioremap_resource(&pdev->dev, plat_res); + if (IS_ERR(mc->fsl_mc_regs)) + return PTR_ERR(mc->fsl_mc_regs); + + if (IS_ENABLED(CONFIG_ACPI) && !dev_of_node(&pdev->dev)) { + mc_stream_id = readl(mc->fsl_mc_regs + FSL_MC_FAPR); + /* + * HW ORs the PL and BMT bit, places the result in bit 15 of + * the StreamID and ORs in the ICID. Calculate it accordingly. + */ + mc_stream_id = (mc_stream_id & 0xffff) | + ((mc_stream_id & (MC_FAPR_PL | MC_FAPR_BMT)) ? 
+ 0x4000 : 0); + error = acpi_dma_configure_id(&pdev->dev, DEV_DMA_COHERENT, + &mc_stream_id); + if (error) + dev_warn(&pdev->dev, "failed to configure dma: %d.\n", + error); + } + /* * Get physical address of MC portal for the root DPRC: */ - error = of_address_to_resource(pdev->dev.of_node, 0, &res); - if (error < 0) { - dev_err(&pdev->dev, - "of_address_to_resource() failed for %pOF\n", - pdev->dev.of_node); - return error; - } - - mc_portal_phys_addr = res.start; - mc_portal_size = resource_size(&res); + plat_res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + mc_portal_phys_addr = plat_res->start; + mc_portal_size = resource_size(plat_res); error = fsl_create_mc_io(&pdev->dev, mc_portal_phys_addr, mc_portal_size, NULL, FSL_MC_IO_ATOMIC_CONTEXT_PORTAL, &mc_io); @@ -903,11 +930,13 @@ static int fsl_mc_bus_probe(struct platform_device *pdev) dev_info(&pdev->dev, "MC firmware version: %u.%u.%u\n", mc_version.major, mc_version.minor, mc_version.revision); - error = get_mc_addr_translation_ranges(&pdev->dev, - &mc->translation_ranges, - &mc->num_translation_ranges); - if (error < 0) - goto error_cleanup_mc_io; + if (dev_of_node(&pdev->dev)) { + error = get_mc_addr_translation_ranges(&pdev->dev, + &mc->translation_ranges, + &mc->num_translation_ranges); + if (error < 0) + goto error_cleanup_mc_io; + } error = dprc_get_container_id(mc_io, 0, &container_id); if (error < 0) { @@ -934,6 +963,7 @@ static int fsl_mc_bus_probe(struct platform_device *pdev) goto error_cleanup_mc_io; mc->root_mc_bus_dev = mc_bus_dev; + mc_bus_dev->dev.fwnode = pdev->dev.fwnode; return 0; error_cleanup_mc_io: @@ -967,11 +997,18 @@ static const struct of_device_id fsl_mc_bus_match_table[] = { MODULE_DEVICE_TABLE(of, fsl_mc_bus_match_table); +static const struct acpi_device_id fsl_mc_bus_acpi_match_table[] = { + {"NXP0008", 0 }, + { } +}; +MODULE_DEVICE_TABLE(acpi, fsl_mc_bus_acpi_match_table); + static struct platform_driver fsl_mc_bus_driver = { .driver = { .name = "fsl_mc_bus", .pm = NULL, .of_match_table = fsl_mc_bus_match_table, + .acpi_match_table = fsl_mc_bus_acpi_match_table, }, .probe = fsl_mc_bus_probe, .remove = fsl_mc_bus_remove, diff --git a/drivers/bus/fsl-mc/fsl-mc-msi.c b/drivers/bus/fsl-mc/fsl-mc-msi.c index e7bbff445a83..8edadf05cbb7 100644 --- a/drivers/bus/fsl-mc/fsl-mc-msi.c +++ b/drivers/bus/fsl-mc/fsl-mc-msi.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "fsl-mc-private.h" @@ -179,25 +180,31 @@ struct irq_domain *fsl_mc_msi_create_irq_domain(struct fwnode_handle *fwnode, struct irq_domain *fsl_mc_find_msi_domain(struct device *dev) { - struct irq_domain *msi_domain = NULL; + struct device *root_dprc_dev; + struct device *bus_dev; + struct irq_domain *msi_domain; struct fsl_mc_device *mc_dev = to_fsl_mc_device(dev); - msi_domain = of_msi_map_get_device_domain(dev, mc_dev->icid, + fsl_mc_get_root_dprc(dev, &root_dprc_dev); + bus_dev = root_dprc_dev->parent; + + if (bus_dev->of_node) { + msi_domain = of_msi_map_get_device_domain(dev, + mc_dev->icid, DOMAIN_BUS_FSL_MC_MSI); - /* - * if the msi-map property is missing assume that all the - * child containers inherit the domain from the parent - */ - if (!msi_domain) { - struct device *root_dprc_dev; - struct device *bus_dev; - - fsl_mc_get_root_dprc(dev, &root_dprc_dev); - bus_dev = root_dprc_dev->parent; - msi_domain = of_msi_get_domain(bus_dev, - bus_dev->of_node, - DOMAIN_BUS_FSL_MC_MSI); + /* + * if the msi-map property is missing assume that all the + * child containers inherit the domain from the parent + */ + if 
(!msi_domain) + + msi_domain = of_msi_get_domain(bus_dev, + bus_dev->of_node, + DOMAIN_BUS_FSL_MC_MSI); + } else { + msi_domain = iort_get_device_domain(dev, mc_dev->icid, + DOMAIN_BUS_FSL_MC_MSI); } return msi_domain; diff --git a/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c b/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c index a5c8d577e424..634263dfd7b5 100644 --- a/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c +++ b/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c @@ -7,6 +7,8 @@ * */ +#include +#include #include #include #include @@ -30,7 +32,8 @@ static u32 fsl_mc_msi_domain_get_msi_id(struct irq_domain *domain, u32 out_id; of_node = irq_domain_get_of_node(domain); - out_id = of_msi_map_id(&mc_dev->dev, of_node, mc_dev->icid); + out_id = of_node ? of_msi_map_id(&mc_dev->dev, of_node, mc_dev->icid) : + iort_msi_map_id(&mc_dev->dev, mc_dev->icid); return out_id; } @@ -79,12 +82,71 @@ static const struct of_device_id its_device_id[] = { {}, }; -static int __init its_fsl_mc_msi_init(void) +static void __init its_fsl_mc_msi_init_one(struct fwnode_handle *handle, + const char *name) { - struct device_node *np; struct irq_domain *parent; struct irq_domain *mc_msi_domain; + parent = irq_find_matching_fwnode(handle, DOMAIN_BUS_NEXUS); + if (!parent || !msi_get_domain_info(parent)) { + pr_err("%s: unable to locate ITS domain\n", name); + return; + } + + mc_msi_domain = fsl_mc_msi_create_irq_domain(handle, + &its_fsl_mc_msi_domain_info, + parent); + if (!mc_msi_domain) { + pr_err("%s: unable to create fsl-mc domain\n", name); + return; + } + + pr_info("fsl-mc MSI: %s domain created\n", name); +} + +#ifdef CONFIG_ACPI +static int __init +its_fsl_mc_msi_parse_madt(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_madt_generic_translator *its_entry; + struct fwnode_handle *dom_handle; + const char *node_name; + int err = 0; + + its_entry = (struct acpi_madt_generic_translator *)header; + node_name = kasprintf(GFP_KERNEL, "ITS@0x%lx", + (long)its_entry->base_address); + + dom_handle = iort_find_domain_token(its_entry->translation_id); + if (!dom_handle) { + pr_err("%s: Unable to locate ITS domain handle\n", node_name); + err = -ENXIO; + goto out; + } + + its_fsl_mc_msi_init_one(dom_handle, node_name); + +out: + kfree(node_name); + return err; +} + + +static void __init its_fsl_mc_acpi_msi_init(void) +{ + acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_TRANSLATOR, + its_fsl_mc_msi_parse_madt, 0); +} +#else +static inline void its_fsl_mc_acpi_msi_init(void) { } +#endif + +static void __init its_fsl_mc_of_msi_init(void) +{ + struct device_node *np; + for (np = of_find_matching_node(NULL, its_device_id); np; np = of_find_matching_node(np, its_device_id)) { if (!of_device_is_available(np)) @@ -92,23 +154,15 @@ static int __init its_fsl_mc_msi_init(void) if (!of_property_read_bool(np, "msi-controller")) continue; - parent = irq_find_matching_host(np, DOMAIN_BUS_NEXUS); - if (!parent || !msi_get_domain_info(parent)) { - pr_err("%pOF: unable to locate ITS domain\n", np); - continue; - } - - mc_msi_domain = fsl_mc_msi_create_irq_domain( - of_node_to_fwnode(np), - &its_fsl_mc_msi_domain_info, - parent); - if (!mc_msi_domain) { - pr_err("%pOF: unable to create fsl-mc domain\n", np); - continue; - } - - pr_info("fsl-mc MSI: %pOF domain created\n", np); + its_fsl_mc_msi_init_one(of_node_to_fwnode(np), + np->full_name); } +} + +static int __init its_fsl_mc_msi_init(void) +{ + its_fsl_mc_of_msi_init(); + its_fsl_mc_acpi_msi_init(); return 0; } -- cgit v1.2.3 From 
c4334d576cf420a7d0f4349ce0b0a8ed0de3938f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 25 Jul 2020 17:32:05 -0700 Subject: arm64: pgtable-hwdef.h: delete duplicated words Drop the repeated words "at" and "the". Signed-off-by: Randy Dunlap Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20200726003207.20253-2-rdunlap@infradead.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable-hwdef.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 9c91a8f93a0e..b18ba4452873 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -29,7 +29,7 @@ * Size mapped by an entry at level n ( 0 <= n <= 3) * We map (PAGE_SHIFT - 3) at all translation levels and PAGE_SHIFT bits * in the final page. The maximum number of translation levels supported by - * the architecture is 4. Hence, starting at at level n, we have further + * the architecture is 4. Hence, starting at level n, we have further * ((4 - n) - 1) levels of translation excluding the offset within the page. * So, the total number of bits mapped by an entry at level n is : * @@ -98,7 +98,7 @@ #define CONT_PMDS (1 << CONT_PMD_SHIFT) #define CONT_PMD_SIZE (CONT_PMDS * PMD_SIZE) #define CONT_PMD_MASK (~(CONT_PMD_SIZE - 1)) -/* the the numerical offset of the PTE within a range of CONT_PTES */ +/* the numerical offset of the PTE within a range of CONT_PTES */ #define CONT_RANGE_OFFSET(addr) (((addr)>>PAGE_SHIFT)&(CONT_PTES-1)) /* -- cgit v1.2.3 From c4b5abba008399dc4450ab6f62b2deb5acd3697e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 25 Jul 2020 17:32:06 -0700 Subject: arm64: ptrace.h: delete duplicated word Drop the repeated word "the". Signed-off-by: Randy Dunlap Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20200726003207.20253-3-rdunlap@infradead.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/ptrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 953b6a1ce549..966ed30ed5f7 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -27,7 +27,7 @@ * * Some code sections either automatically switch back to PSR.I or explicitly * require to not use priority masking. If bit GIC_PRIO_PSR_I_SET is included - * in the the priority mask, it indicates that PSR.I should be set and + * in the priority mask, it indicates that PSR.I should be set and * interrupt disabling temporarily does not rely on IRQ priorities. */ #define GIC_PRIO_IRQON 0xe0 -- cgit v1.2.3 From 1a9ea25d1874ca457a596738b40fa4f3bec6fc8f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 25 Jul 2020 17:32:07 -0700 Subject: arm64: sigcontext.h: delete duplicated word Drop the repeated word "the". 
Signed-off-by: Randy Dunlap Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20200726003207.20253-4-rdunlap@infradead.org Signed-off-by: Catalin Marinas --- arch/arm64/include/uapi/asm/sigcontext.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index 8b0ebce92427..0c796c795dbe 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h @@ -179,7 +179,7 @@ struct sve_context { * The same convention applies when returning from a signal: a caller * will need to remove or resize the sve_context block if it wants to * make the SVE registers live when they were previously non-live or - * vice-versa. This may require the the caller to allocate fresh + * vice-versa. This may require the caller to allocate fresh * memory and/or move other context blocks in the signal frame. * * Changing the vector length during signal return is not permitted: -- cgit v1.2.3 From c4885bbb3afee80f41d39a33e49881a18e500f47 Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Fri, 10 Jul 2020 22:04:12 +0800 Subject: arm64/mm: save memory access in check_and_switch_context() fast switch path On arm64, smp_processor_id() reads a per-cpu `cpu_number` variable, using the per-cpu offset stored in the tpidr_el1 system register. In some cases we generate a per-cpu address with a sequence like: cpu_ptr = &per_cpu(ptr, smp_processor_id()); Which potentially incurs a cache miss for both `cpu_number` and the in-memory `__per_cpu_offset` array. This can be written more optimally as: cpu_ptr = this_cpu_ptr(ptr); Which only needs the offset from tpidr_el1, and does not need to load from memory. The following two test cases show a small performance improvement measured on a 46-CPU Qualcomm machine with a 5.8.0-rc4 kernel. Test 1: (about 0.3% improvement) #cat b.sh make clean && make all -j138 #perf stat --repeat 10 --null --sync sh b.sh - before this patch Performance counter stats for 'sh b.sh' (10 runs): 298.62 +- 1.86 seconds time elapsed ( +- 0.62% ) - after this patch Performance counter stats for 'sh b.sh' (10 runs): 297.734 +- 0.954 seconds time elapsed ( +- 0.32% ) Test 2: (about 1.69% improvement) 'perf stat -r 10 perf bench sched messaging' Then sum the total time of 'sched/messaging' manually. - before this patch total 0.707 sec for 10 times - after this patch total 0.695 sec for 10 times Signed-off-by: Pingfan Liu Acked-by: Mark Rutland Cc: Will Deacon Cc: Steve Capper Cc: Mark Rutland Cc: Vladimir Murzin Cc: Jean-Philippe Brucker Link: https://lore.kernel.org/r/1594389852-19949-1-git-send-email-kernelfans@gmail.com Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mmu_context.h | 6 ++---- arch/arm64/mm/context.c | 10 ++++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index b0bd9b55594c..f2d7537d6f83 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -175,7 +175,7 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp) * take CPU migration into account. 
*/ #define destroy_context(mm) do { } while(0) -void check_and_switch_context(struct mm_struct *mm, unsigned int cpu); +void check_and_switch_context(struct mm_struct *mm); #define init_new_context(tsk,mm) ({ atomic64_set(&(mm)->context.id, 0); 0; }) @@ -214,8 +214,6 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) static inline void __switch_mm(struct mm_struct *next) { - unsigned int cpu = smp_processor_id(); - /* * init_mm.pgd does not contain any user mappings and it is always * active for kernel addresses in TTBR1. Just set the reserved TTBR0. @@ -225,7 +223,7 @@ static inline void __switch_mm(struct mm_struct *next) return; } - check_and_switch_context(next, cpu); + check_and_switch_context(next); } static inline void diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index d702d60e64da..a206655a39a5 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -198,9 +198,10 @@ set_asid: return idx2asid(asid) | generation; } -void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) +void check_and_switch_context(struct mm_struct *mm) { unsigned long flags; + unsigned int cpu; u64 asid, old_active_asid; if (system_supports_cnp()) @@ -222,9 +223,9 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) * relaxed xchg in flush_context will treat us as reserved * because atomic RmWs are totally ordered for a given location. */ - old_active_asid = atomic64_read(&per_cpu(active_asids, cpu)); + old_active_asid = atomic64_read(this_cpu_ptr(&active_asids)); if (old_active_asid && asid_gen_match(asid) && - atomic64_cmpxchg_relaxed(&per_cpu(active_asids, cpu), + atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_asids), old_active_asid, asid)) goto switch_mm_fastpath; @@ -236,10 +237,11 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) atomic64_set(&mm->context.id, asid); } + cpu = smp_processor_id(); if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending)) local_flush_tlb_all(); - atomic64_set(&per_cpu(active_asids, cpu), asid); + atomic64_set(this_cpu_ptr(&active_asids), asid); raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); switch_mm_fastpath: -- cgit v1.2.3 From 338c11e94e160f80d8352bf9b5da82dd1a910d2f Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Fri, 31 Jul 2020 17:19:50 +0530 Subject: arm64: use IRQ_STACK_SIZE instead of THREAD_SIZE for irq stack IRQ_STACK_SIZE can be made different from THREAD_SIZE, and as IRQ_STACK_SIZE is used while irq stack allocation, same define should be used while printing information of irq stack. Signed-off-by: Maninder Singh Acked-by: Mark Rutland Link: https://lore.kernel.org/r/1596196190-14141-1-git-send-email-maninder1.s@samsung.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 47f651df781c..13ebd5ca2070 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -855,7 +855,7 @@ asmlinkage void handle_bad_stack(struct pt_regs *regs) pr_emerg("Task stack: [0x%016lx..0x%016lx]\n", tsk_stk, tsk_stk + THREAD_SIZE); pr_emerg("IRQ stack: [0x%016lx..0x%016lx]\n", - irq_stk, irq_stk + THREAD_SIZE); + irq_stk, irq_stk + IRQ_STACK_SIZE); pr_emerg("Overflow stack: [0x%016lx..0x%016lx]\n", ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE); -- cgit v1.2.3