-rw-r--r--Documentation/arch/arm64/index.rst1
-rw-r--r--Documentation/arch/arm64/mpam.rst72
-rw-r--r--Documentation/arch/arm64/silicon-errata.rst9
-rw-r--r--arch/arm64/Kconfig54
-rw-r--r--arch/arm64/include/asm/asm-uaccess.h2
-rw-r--r--arch/arm64/include/asm/cpucaps.h2
-rw-r--r--arch/arm64/include/asm/el2_setup.h3
-rw-r--r--arch/arm64/include/asm/futex.h311
-rw-r--r--arch/arm64/include/asm/hugetlb.h12
-rw-r--r--arch/arm64/include/asm/hwcap.h120
-rw-r--r--arch/arm64/include/asm/lsui.h27
-rw-r--r--arch/arm64/include/asm/mmu.h10
-rw-r--r--arch/arm64/include/asm/mmu_context.h3
-rw-r--r--arch/arm64/include/asm/mpam.h96
-rw-r--r--arch/arm64/include/asm/mte.h6
-rw-r--r--arch/arm64/include/asm/pgtable-hwdef.h9
-rw-r--r--arch/arm64/include/asm/pgtable-prot.h2
-rw-r--r--arch/arm64/include/asm/pgtable.h60
-rw-r--r--arch/arm64/include/asm/resctrl.h2
-rw-r--r--arch/arm64/include/asm/rwonce.h24
-rw-r--r--arch/arm64/include/asm/scs.h8
-rw-r--r--arch/arm64/include/asm/thread_info.h3
-rw-r--r--arch/arm64/include/asm/tlb.h6
-rw-r--r--arch/arm64/include/asm/tlbflush.h461
-rw-r--r--arch/arm64/include/asm/uaccess.h6
-rw-r--r--arch/arm64/kernel/Makefile1
-rw-r--r--arch/arm64/kernel/armv8_deprecated.c14
-rw-r--r--arch/arm64/kernel/cpufeature.c31
-rw-r--r--arch/arm64/kernel/entry-common.c52
-rw-r--r--arch/arm64/kernel/entry.S6
-rw-r--r--arch/arm64/kernel/machine_kexec.c3
-rw-r--r--arch/arm64/kernel/mpam.c62
-rw-r--r--arch/arm64/kernel/mte.c10
-rw-r--r--arch/arm64/kernel/process.c32
-rw-r--r--arch/arm64/kernel/rsi.c2
-rw-r--r--arch/arm64/kernel/sys_compat.c2
-rw-r--r--arch/arm64/kvm/at.c34
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/switch.h12
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-init.S4
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mm.c2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/tlb.c2
-rw-r--r--arch/arm64/kvm/hyp/pgtable.c4
-rw-r--r--arch/arm64/kvm/hyp/vhe/sysreg-sr.c16
-rw-r--r--arch/arm64/kvm/hyp/vhe/tlb.c2
-rw-r--r--arch/arm64/kvm/sys_regs.c5
-rw-r--r--arch/arm64/mm/context.c8
-rw-r--r--arch/arm64/mm/contpte.c12
-rw-r--r--arch/arm64/mm/fault.c30
-rw-r--r--arch/arm64/mm/hugetlbpage.c10
-rw-r--r--arch/arm64/mm/init.c9
-rw-r--r--arch/arm64/mm/mmu.c286
-rw-r--r--arch/arm64/mm/pageattr.c50
-rw-r--r--arch/arm64/mm/trans_pgd.c42
-rw-r--r--arch/arm64/tools/Makefile8
-rw-r--r--arch/arm64/tools/cpucaps1
-rw-r--r--arch/arm64/tools/gen-kernel-hwcaps.sh23
-rw-r--r--arch/arm64/tools/sysreg36
-rw-r--r--drivers/acpi/arm64/agdi.c2
-rw-r--r--drivers/resctrl/Kconfig9
-rw-r--r--drivers/resctrl/Makefile1
-rw-r--r--drivers/resctrl/mpam_devices.c303
-rw-r--r--drivers/resctrl/mpam_internal.h108
-rw-r--r--drivers/resctrl/mpam_resctrl.c1704
-rw-r--r--drivers/resctrl/test_mpam_resctrl.c315
-rw-r--r--include/linux/arm_mpam.h32
-rw-r--r--include/linux/entry-common.h2
-rw-r--r--include/linux/irq-entry-common.h256
-rw-r--r--kernel/entry/common.c107
-rw-r--r--tools/testing/selftests/arm64/abi/hwcap.c3
-rw-r--r--tools/testing/selftests/kvm/arm64/set_id_regs.c1
70 files changed, 4088 insertions, 875 deletions
diff --git a/Documentation/arch/arm64/index.rst b/Documentation/arch/arm64/index.rst
index af52edc8c0ac..98052b4ef4a1 100644
--- a/Documentation/arch/arm64/index.rst
+++ b/Documentation/arch/arm64/index.rst
@@ -23,6 +23,7 @@ ARM64 Architecture
memory
memory-tagging-extension
mops
+ mpam
perf
pointer-authentication
ptdump
diff --git a/Documentation/arch/arm64/mpam.rst b/Documentation/arch/arm64/mpam.rst
new file mode 100644
index 000000000000..570f51a8d4eb
--- /dev/null
+++ b/Documentation/arch/arm64/mpam.rst
@@ -0,0 +1,72 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====
+MPAM
+====
+
+What is MPAM
+============
+MPAM (Memory Partitioning and Monitoring) is a feature in the CPUs and memory
+system components such as the caches or memory controllers that allow memory
+traffic to be labelled, partitioned and monitored.
+
+Traffic is labelled by the CPU, based on the control or monitor group the
+current task is assigned to using resctrl. Partitioning policy can be set
+using the schemata file in resctrl, and monitor values read via resctrl.
+See Documentation/filesystems/resctrl.rst for more details.
+
+This allows tasks that share memory system resources, such as caches, to be
+isolated from each other according to the partitioning policy, addressing the
+so-called 'noisy neighbour' problem.
+
+Supported Platforms
+===================
+Use of this feature requires CPU support, support in the memory system
+components, and a description from firmware of where the MPAM device controls
+are in the MMIO address space (e.g. the 'MPAM' ACPI table).
+
+The MMIO device that provides the MPAM controls/monitors for a memory system
+component is called a memory system component (MSC).
+
+Because the user interface to MPAM is via resctrl, only MPAM features that are
+compatible with resctrl can be exposed to user-space.
+
+MSC are considered as a group based on the topology. MSC that correspond with
+the L3 cache are considered together; it is not possible to mix MSC between L2
+and L3 to 'cover' a resctrl schema.
+
+The supported features are:
+
+* Cache portion bitmap controls (CPOR) on the L2 or L3 caches. To expose
+ CPOR at L2 or L3, every CPU must have a corresponding CPU cache at this
+ level that also supports the feature. Mismatched big/little platforms are
+ not supported as resctrl's controls would then also depend on task
+ placement.
+
+* Memory bandwidth maximum controls (MBW_MAX) on or after the L3 cache.
+ resctrl uses the L3 cache-id to identify where the memory bandwidth
+ control is applied. For this reason the platform must have an L3 cache
+ with cache-id's supplied by firmware. (It doesn't need to support MPAM.)
+
+ To be exported as the 'MB' schema, the topology of the group of MSC chosen
+ must match the topology of the L3 cache so that the cache-id's can be
+ repainted. For example: Platforms with Memory bandwidth maximum controls
+ on CPU-less NUMA nodes cannot expose the 'MB' schema to resctrl as these
+ nodes do not have a corresponding L3 cache. If the memory bandwidth
+ control is on the memory rather than the L3 then there must be a single
+ global L3 as otherwise it is unknown which L3 the traffic came from. There
+ must be no caches between the L3 and the memory so that the two ends of
+ the path have equivalent traffic.
+
+ When the MPAM driver finds multiple groups of MSC it can use for the 'MB'
+ schema, it prefers the group closest to the L3 cache.
+
+* Cache Storage Usage (CSU) counters can expose the 'llc_occupancy' provided
+ there is at least one CSU monitor on each MSC that makes up the L3 group.
+ Exposing CSU counters from other caches or devices is not supported.
+
+Reporting Bugs
+==============
+If you are not seeing the counters or controls you expect, please share the
+debug messages produced when enabling dynamic debug and booting with:
+dyndbg="file mpam_resctrl.c +pl"
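
For context, the resctrl interface that the new document refers to can be
driven from user-space roughly as follows. This is an illustrative sketch
only and not part of the patch: the group name, the mount point under
/sys/fs/resctrl and the example L3 bitmask are assumptions, and the generic
interface is described in Documentation/filesystems/resctrl.rst:

    #include <errno.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(void)
    {
        FILE *f;

        /* Create a control group; resctrl must already be mounted. */
        if (mkdir("/sys/fs/resctrl/isolated", 0755) && errno != EEXIST)
            return 1;

        /* Move the calling task into the group. */
        f = fopen("/sys/fs/resctrl/isolated/tasks", "w");
        if (!f)
            return 1;
        fprintf(f, "%d\n", getpid());
        fclose(f);

        /* Restrict the group to part of the L3 portion bitmap on cache-id 0. */
        f = fopen("/sys/fs/resctrl/isolated/schemata", "w");
        if (!f)
            return 1;
        fprintf(f, "L3:0=00ff\n");
        fclose(f);
        return 0;
    }
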
diff --git a/Documentation/arch/arm64/silicon-errata.rst b/Documentation/arch/arm64/silicon-errata.rst
index 4c300caad901..65ed6ea33751 100644
--- a/Documentation/arch/arm64/silicon-errata.rst
+++ b/Documentation/arch/arm64/silicon-errata.rst
@@ -214,6 +214,9 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| ARM | SI L1 | #4311569 | ARM64_ERRATUM_4311569 |
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | CMN-650 | #3642720 | N/A |
++----------------+-----------------+-----------------+-----------------------------+
++----------------+-----------------+-----------------+-----------------------------+
| Broadcom | Brahma-B53 | N/A | ARM64_ERRATUM_845719 |
+----------------+-----------------+-----------------+-----------------------------+
| Broadcom | Brahma-B53 | N/A | ARM64_ERRATUM_843419 |
@@ -247,6 +250,12 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| NVIDIA | T241 GICv3/4.x | T241-FABRIC-4 | N/A |
+----------------+-----------------+-----------------+-----------------------------+
+| NVIDIA | T241 MPAM | T241-MPAM-1 | N/A |
++----------------+-----------------+-----------------+-----------------------------+
+| NVIDIA | T241 MPAM | T241-MPAM-4 | N/A |
++----------------+-----------------+-----------------+-----------------------------+
+| NVIDIA | T241 MPAM | T241-MPAM-6 | N/A |
++----------------+-----------------+-----------------+-----------------------------+
+----------------+-----------------+-----------------+-----------------------------+
| Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 |
+----------------+-----------------+-----------------+-----------------------------+
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 38dba5f7e4d2..a60038520cee 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -61,32 +61,6 @@ config ARM64
select ARCH_HAVE_ELF_PROT
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_HAVE_TRACE_MMIO_ACCESS
- select ARCH_INLINE_READ_LOCK if !PREEMPTION
- select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION
- select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPTION
- select ARCH_INLINE_READ_LOCK_IRQSAVE if !PREEMPTION
- select ARCH_INLINE_READ_UNLOCK if !PREEMPTION
- select ARCH_INLINE_READ_UNLOCK_BH if !PREEMPTION
- select ARCH_INLINE_READ_UNLOCK_IRQ if !PREEMPTION
- select ARCH_INLINE_READ_UNLOCK_IRQRESTORE if !PREEMPTION
- select ARCH_INLINE_WRITE_LOCK if !PREEMPTION
- select ARCH_INLINE_WRITE_LOCK_BH if !PREEMPTION
- select ARCH_INLINE_WRITE_LOCK_IRQ if !PREEMPTION
- select ARCH_INLINE_WRITE_LOCK_IRQSAVE if !PREEMPTION
- select ARCH_INLINE_WRITE_UNLOCK if !PREEMPTION
- select ARCH_INLINE_WRITE_UNLOCK_BH if !PREEMPTION
- select ARCH_INLINE_WRITE_UNLOCK_IRQ if !PREEMPTION
- select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE if !PREEMPTION
- select ARCH_INLINE_SPIN_TRYLOCK if !PREEMPTION
- select ARCH_INLINE_SPIN_TRYLOCK_BH if !PREEMPTION
- select ARCH_INLINE_SPIN_LOCK if !PREEMPTION
- select ARCH_INLINE_SPIN_LOCK_BH if !PREEMPTION
- select ARCH_INLINE_SPIN_LOCK_IRQ if !PREEMPTION
- select ARCH_INLINE_SPIN_LOCK_IRQSAVE if !PREEMPTION
- select ARCH_INLINE_SPIN_UNLOCK if !PREEMPTION
- select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPTION
- select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION
- select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION
select ARCH_KEEP_MEMBLOCK
select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
select ARCH_USE_CMPXCHG_LOCKREF
@@ -2016,8 +1990,8 @@ config ARM64_TLB_RANGE
config ARM64_MPAM
bool "Enable support for MPAM"
- select ARM64_MPAM_DRIVER if EXPERT # does nothing yet
- select ACPI_MPAM if ACPI
+ select ARM64_MPAM_DRIVER
+ select ARCH_HAS_CPU_RESCTRL
help
Memory System Resource Partitioning and Monitoring (MPAM) is an
optional extension to the Arm architecture that allows each
@@ -2039,6 +2013,8 @@ config ARM64_MPAM
MPAM is exposed to user-space via the resctrl pseudo filesystem.
+ This option enables the extra context switch code.
+
endmenu # "ARMv8.4 architectural features"
menu "ARMv8.5 architectural features"
@@ -2215,6 +2191,26 @@ config ARM64_GCS
endmenu # "ARMv9.4 architectural features"
+config AS_HAS_LSUI
+ def_bool $(as-instr,.arch_extension lsui)
+ help
+ Supported by LLVM 20+ and binutils 2.45+.
+
+menu "ARMv9.6 architectural features"
+
+config ARM64_LSUI
+ bool "Support Unprivileged Load Store Instructions (LSUI)"
+ default y
+ depends on AS_HAS_LSUI && !CPU_BIG_ENDIAN
+ help
+	  The Unprivileged Load Store Instructions (LSUI) provide variants
+	  of the load/store instructions that access user-space memory from
+	  the kernel without clearing the PSTATE.PAN bit.
+
+ This feature is supported by LLVM 20+ and binutils 2.45+.
+
+endmenu # "ARMv9.6 architectural features"
+
config ARM64_SVE
bool "ARM Scalable Vector Extension support"
default y
@@ -2372,7 +2368,7 @@ config CMDLINE
default ""
help
Provide a set of default command-line options at build time by
- entering them here. As a minimum, you should specify the the
+ entering them here. As a minimum, you should specify the
root device (e.g. root=/dev/nfs).
choice
diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 9148f5a31968..12aa6a283249 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -15,7 +15,7 @@
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
.macro __uaccess_ttbr0_disable, tmp1
mrs \tmp1, ttbr1_el1 // swapper_pg_dir
- bic \tmp1, \tmp1, #TTBR_ASID_MASK
+ bic \tmp1, \tmp1, #TTBRx_EL1_ASID_MASK
sub \tmp1, \tmp1, #RESERVED_SWAPPER_OFFSET // reserved_pg_dir
msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1
add \tmp1, \tmp1, #RESERVED_SWAPPER_OFFSET
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 177c691914f8..6e3da333442e 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -71,6 +71,8 @@ cpucap_is_possible(const unsigned int cap)
return true;
case ARM64_HAS_PMUV3:
return IS_ENABLED(CONFIG_HW_PERF_EVENTS);
+ case ARM64_HAS_LSUI:
+ return IS_ENABLED(CONFIG_ARM64_LSUI);
}
return true;
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index 85f4c1615472..4d15071a4f3f 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -513,7 +513,8 @@
check_override id_aa64pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT, .Linit_mpam_\@, .Lskip_mpam_\@, x1, x2
.Linit_mpam_\@:
- msr_s SYS_MPAM2_EL2, xzr // use the default partition
+ mov x0, #MPAM2_EL2_EnMPAMSM_MASK
+ msr_s SYS_MPAM2_EL2, x0 // use the default partition,
// and disable lower traps
mrs_s x0, SYS_MPAMIDR_EL1
tbz x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@ // skip if no MPAMHCR reg
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index bc06691d2062..d1d2ff9d323a 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -9,71 +9,292 @@
#include <linux/uaccess.h>
#include <asm/errno.h>
+#include <asm/lsui.h>
#define FUTEX_MAX_LOOPS 128 /* What's the largest number you can think of? */
-#define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \
-do { \
+#define LLSC_FUTEX_ATOMIC_OP(op, insn) \
+static __always_inline int \
+__llsc_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval) \
+{ \
unsigned int loops = FUTEX_MAX_LOOPS; \
+ int ret, oldval, newval; \
\
uaccess_enable_privileged(); \
- asm volatile( \
-" prfm pstl1strm, %2\n" \
-"1: ldxr %w1, %2\n" \
+ asm volatile("// __llsc_futex_atomic_" #op "\n" \
+" prfm pstl1strm, %[uaddr]\n" \
+"1: ldxr %w[oldval], %[uaddr]\n" \
insn "\n" \
-"2: stlxr %w0, %w3, %2\n" \
-" cbz %w0, 3f\n" \
-" sub %w4, %w4, %w0\n" \
-" cbnz %w4, 1b\n" \
-" mov %w0, %w6\n" \
+"2: stlxr %w[ret], %w[newval], %[uaddr]\n" \
+" cbz %w[ret], 3f\n" \
+" sub %w[loops], %w[loops], %w[ret]\n" \
+" cbnz %w[loops], 1b\n" \
+" mov %w[ret], %w[err]\n" \
"3:\n" \
" dmb ish\n" \
- _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w0) \
- _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w0) \
- : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp), \
- "+r" (loops) \
- : "r" (oparg), "Ir" (-EAGAIN) \
+ _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret]) \
+ _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret]) \
+ : [ret] "=&r" (ret), [oldval] "=&r" (oldval), \
+ [uaddr] "+Q" (*uaddr), [newval] "=&r" (newval), \
+ [loops] "+r" (loops) \
+ : [oparg] "r" (oparg), [err] "Ir" (-EAGAIN) \
: "memory"); \
uaccess_disable_privileged(); \
-} while (0)
+ \
+ if (!ret) \
+ *oval = oldval; \
+ \
+ return ret; \
+}
+
+LLSC_FUTEX_ATOMIC_OP(add, "add %w[newval], %w[oldval], %w[oparg]")
+LLSC_FUTEX_ATOMIC_OP(or, "orr %w[newval], %w[oldval], %w[oparg]")
+LLSC_FUTEX_ATOMIC_OP(and, "and %w[newval], %w[oldval], %w[oparg]")
+LLSC_FUTEX_ATOMIC_OP(eor, "eor %w[newval], %w[oldval], %w[oparg]")
+LLSC_FUTEX_ATOMIC_OP(set, "mov %w[newval], %w[oparg]")
+
+static __always_inline int
+__llsc_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
+{
+ int ret = 0;
+ unsigned int loops = FUTEX_MAX_LOOPS;
+ u32 val, tmp;
+
+ uaccess_enable_privileged();
+ asm volatile("//__llsc_futex_cmpxchg\n"
+" prfm pstl1strm, %[uaddr]\n"
+"1: ldxr %w[curval], %[uaddr]\n"
+" eor %w[tmp], %w[curval], %w[oldval]\n"
+" cbnz %w[tmp], 4f\n"
+"2: stlxr %w[tmp], %w[newval], %[uaddr]\n"
+" cbz %w[tmp], 3f\n"
+" sub %w[loops], %w[loops], %w[tmp]\n"
+" cbnz %w[loops], 1b\n"
+" mov %w[ret], %w[err]\n"
+"3:\n"
+" dmb ish\n"
+"4:\n"
+ _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w[ret])
+ _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w[ret])
+ : [ret] "+r" (ret), [curval] "=&r" (val),
+ [uaddr] "+Q" (*uaddr), [tmp] "=&r" (tmp),
+ [loops] "+r" (loops)
+ : [oldval] "r" (oldval), [newval] "r" (newval),
+ [err] "Ir" (-EAGAIN)
+ : "memory");
+ uaccess_disable_privileged();
+
+ if (!ret)
+ *oval = val;
+
+ return ret;
+}
+
+#ifdef CONFIG_ARM64_LSUI
+
+/*
+ * Wrap LSUI instructions with uaccess_ttbr0_enable()/disable(), as
+ * PAN toggling is not required.
+ */
+
+#define LSUI_FUTEX_ATOMIC_OP(op, asm_op) \
+static __always_inline int \
+__lsui_futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval) \
+{ \
+ int ret = 0; \
+ int oldval; \
+ \
+ uaccess_ttbr0_enable(); \
+ \
+ asm volatile("// __lsui_futex_atomic_" #op "\n" \
+ __LSUI_PREAMBLE \
+"1: " #asm_op "al %w[oparg], %w[oldval], %[uaddr]\n" \
+"2:\n" \
+ _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret]) \
+ : [ret] "+r" (ret), [uaddr] "+Q" (*uaddr), \
+ [oldval] "=r" (oldval) \
+ : [oparg] "r" (oparg) \
+ : "memory"); \
+ \
+ uaccess_ttbr0_disable(); \
+ \
+ if (!ret) \
+ *oval = oldval; \
+ return ret; \
+}
+
+LSUI_FUTEX_ATOMIC_OP(add, ldtadd)
+LSUI_FUTEX_ATOMIC_OP(or, ldtset)
+LSUI_FUTEX_ATOMIC_OP(andnot, ldtclr)
+LSUI_FUTEX_ATOMIC_OP(set, swpt)
+
+static __always_inline int
+__lsui_cmpxchg64(u64 __user *uaddr, u64 *oldval, u64 newval)
+{
+ int ret = 0;
+
+ uaccess_ttbr0_enable();
+
+ asm volatile("// __lsui_cmpxchg64\n"
+ __LSUI_PREAMBLE
+"1: casalt %[oldval], %[newval], %[uaddr]\n"
+"2:\n"
+ _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret])
+ : [ret] "+r" (ret), [uaddr] "+Q" (*uaddr),
+ [oldval] "+r" (*oldval)
+ : [newval] "r" (newval)
+ : "memory");
+
+ uaccess_ttbr0_disable();
+
+ return ret;
+}
+
+static __always_inline int
+__lsui_cmpxchg32(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
+{
+ u64 __user *uaddr64;
+ bool futex_pos, other_pos;
+ u32 other, orig_other;
+ union {
+ u32 futex[2];
+ u64 raw;
+ } oval64, orig64, nval64;
+
+ uaddr64 = (u64 __user *)PTR_ALIGN_DOWN(uaddr, sizeof(u64));
+ futex_pos = !IS_ALIGNED((unsigned long)uaddr, sizeof(u64));
+ other_pos = !futex_pos;
+
+ oval64.futex[futex_pos] = oldval;
+ if (get_user(oval64.futex[other_pos], (u32 __user *)uaddr64 + other_pos))
+ return -EFAULT;
+
+ orig64.raw = oval64.raw;
+
+ nval64.futex[futex_pos] = newval;
+ nval64.futex[other_pos] = oval64.futex[other_pos];
+
+ if (__lsui_cmpxchg64(uaddr64, &oval64.raw, nval64.raw))
+ return -EFAULT;
+
+ oldval = oval64.futex[futex_pos];
+ other = oval64.futex[other_pos];
+ orig_other = orig64.futex[other_pos];
+
+ if (other != orig_other)
+ return -EAGAIN;
+
+ *oval = oldval;
+
+ return 0;
+}
+
+static __always_inline int
+__lsui_futex_atomic_and(int oparg, u32 __user *uaddr, int *oval)
+{
+ /*
+ * Undo the bitwise negation applied to the oparg passed from
+ * arch_futex_atomic_op_inuser() with FUTEX_OP_ANDN.
+ */
+ return __lsui_futex_atomic_andnot(~oparg, uaddr, oval);
+}
+
+static __always_inline int
+__lsui_futex_atomic_eor(int oparg, u32 __user *uaddr, int *oval)
+{
+ u32 oldval, newval, val;
+ int ret, i;
+
+ if (get_user(oldval, uaddr))
+ return -EFAULT;
+
+ /*
+ * there are no ldteor/stteor instructions...
+ */
+ for (i = 0; i < FUTEX_MAX_LOOPS; i++) {
+ newval = oldval ^ oparg;
+
+ ret = __lsui_cmpxchg32(uaddr, oldval, newval, &val);
+ switch (ret) {
+ case -EFAULT:
+ return ret;
+ case -EAGAIN:
+ continue;
+ }
+
+ if (val == oldval) {
+ *oval = val;
+ return 0;
+ }
+
+ oldval = val;
+ }
+
+ return -EAGAIN;
+}
+
+static __always_inline int
+__lsui_futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
+{
+ /*
+ * Callers of futex_atomic_cmpxchg_inatomic() already retry on
+ * -EAGAIN, no need for another loop of max retries.
+ */
+ return __lsui_cmpxchg32(uaddr, oldval, newval, oval);
+}
+#endif /* CONFIG_ARM64_LSUI */
+
+
+#define FUTEX_ATOMIC_OP(op) \
+static __always_inline int \
+__futex_atomic_##op(int oparg, u32 __user *uaddr, int *oval) \
+{ \
+ return __lsui_llsc_body(futex_atomic_##op, oparg, uaddr, oval); \
+}
+
+FUTEX_ATOMIC_OP(add)
+FUTEX_ATOMIC_OP(or)
+FUTEX_ATOMIC_OP(and)
+FUTEX_ATOMIC_OP(eor)
+FUTEX_ATOMIC_OP(set)
+
+static __always_inline int
+__futex_cmpxchg(u32 __user *uaddr, u32 oldval, u32 newval, u32 *oval)
+{
+ return __lsui_llsc_body(futex_cmpxchg, uaddr, oldval, newval, oval);
+}
static inline int
arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *_uaddr)
{
- int oldval = 0, ret, tmp;
- u32 __user *uaddr = __uaccess_mask_ptr(_uaddr);
+ int ret;
+ u32 __user *uaddr;
if (!access_ok(_uaddr, sizeof(u32)))
return -EFAULT;
+ uaddr = __uaccess_mask_ptr(_uaddr);
+
switch (op) {
case FUTEX_OP_SET:
- __futex_atomic_op("mov %w3, %w5",
- ret, oldval, uaddr, tmp, oparg);
+ ret = __futex_atomic_set(oparg, uaddr, oval);
break;
case FUTEX_OP_ADD:
- __futex_atomic_op("add %w3, %w1, %w5",
- ret, oldval, uaddr, tmp, oparg);
+ ret = __futex_atomic_add(oparg, uaddr, oval);
break;
case FUTEX_OP_OR:
- __futex_atomic_op("orr %w3, %w1, %w5",
- ret, oldval, uaddr, tmp, oparg);
+ ret = __futex_atomic_or(oparg, uaddr, oval);
break;
case FUTEX_OP_ANDN:
- __futex_atomic_op("and %w3, %w1, %w5",
- ret, oldval, uaddr, tmp, ~oparg);
+ ret = __futex_atomic_and(~oparg, uaddr, oval);
break;
case FUTEX_OP_XOR:
- __futex_atomic_op("eor %w3, %w1, %w5",
- ret, oldval, uaddr, tmp, oparg);
+ ret = __futex_atomic_eor(oparg, uaddr, oval);
break;
default:
ret = -ENOSYS;
}
- if (!ret)
- *oval = oldval;
-
return ret;
}
@@ -81,40 +302,14 @@ static inline int
futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr,
u32 oldval, u32 newval)
{
- int ret = 0;
- unsigned int loops = FUTEX_MAX_LOOPS;
- u32 val, tmp;
u32 __user *uaddr;
if (!access_ok(_uaddr, sizeof(u32)))
return -EFAULT;
uaddr = __uaccess_mask_ptr(_uaddr);
- uaccess_enable_privileged();
- asm volatile("// futex_atomic_cmpxchg_inatomic\n"
-" prfm pstl1strm, %2\n"
-"1: ldxr %w1, %2\n"
-" sub %w3, %w1, %w5\n"
-" cbnz %w3, 4f\n"
-"2: stlxr %w3, %w6, %2\n"
-" cbz %w3, 3f\n"
-" sub %w4, %w4, %w3\n"
-" cbnz %w4, 1b\n"
-" mov %w0, %w7\n"
-"3:\n"
-" dmb ish\n"
-"4:\n"
- _ASM_EXTABLE_UACCESS_ERR(1b, 4b, %w0)
- _ASM_EXTABLE_UACCESS_ERR(2b, 4b, %w0)
- : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops)
- : "r" (oldval), "r" (newval), "Ir" (-EAGAIN)
- : "memory");
- uaccess_disable_privileged();
- if (!ret)
- *uval = val;
-
- return ret;
+ return __futex_cmpxchg(uaddr, oldval, newval, uval);
}
#endif /* __ASM_FUTEX_H */
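
The 32-bit-in-64-bit emulation performed by __lsui_cmpxchg32() above is easier
to follow outside the uaccess machinery. The following stand-alone sketch
mirrors the index arithmetic for little-endian only (matching the
!CPU_BIG_ENDIAN dependency); cas64() is a plain, non-atomic stand-in for the
CASALT instruction, the -11 return is a hard-coded -EAGAIN, and strict-aliasing
concerns are ignored for brevity, so treat it as an illustration rather than
the kernel implementation:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static bool cas64(uint64_t *p, uint64_t *expected, uint64_t desired)
    {
        if (*p != *expected) {
            *expected = *p;        /* report what was observed */
            return false;
        }
        *p = desired;
        return true;
    }

    static int cmpxchg32_via_64(uint32_t *uaddr, uint32_t oldval,
                                uint32_t newval, uint32_t *oval)
    {
        uint64_t *u64p = (uint64_t *)((uintptr_t)uaddr & ~7UL);
        int futex_pos = ((uintptr_t)uaddr & 7) ? 1 : 0;  /* which half holds the futex */
        int other_pos = !futex_pos;
        union { uint32_t w[2]; uint64_t raw; } old, new, seen;

        old.raw = *u64p;                /* snapshot the neighbouring word */
        old.w[futex_pos] = oldval;      /* expected futex value */
        new.raw = old.raw;
        new.w[futex_pos] = newval;

        seen.raw = old.raw;
        cas64(u64p, &seen.raw, new.raw);

        if (seen.w[other_pos] != old.w[other_pos])
            return -11;                 /* neighbour changed: retry (-EAGAIN) */

        *oval = seen.w[futex_pos];      /* caller compares against oldval */
        return 0;
    }

    int main(void)
    {
        uint32_t pair[2] __attribute__((aligned(8))) = { 5, 6 };
        uint32_t seen;

        printf("ret=%d seen=%u new=%u\n",
               cmpxchg32_via_64(&pair[1], 6, 7, &seen), seen, pair[1]);
        return 0;
    }

If the neighbouring word changed under the 64-bit CAS, the operation cannot
tell whether the futex word itself was stable, so it backs off with -EAGAIN
and leaves the retry to the caller, as the kernel code does.
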
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index e6f8ff3cc630..d038ff14d16c 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -71,23 +71,23 @@ static inline void __flush_hugetlb_tlb_range(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
unsigned long stride,
- bool last_level)
+ tlbf_t flags)
{
switch (stride) {
#ifndef __PAGETABLE_PMD_FOLDED
case PUD_SIZE:
- __flush_tlb_range(vma, start, end, PUD_SIZE, last_level, 1);
+ __flush_tlb_range(vma, start, end, PUD_SIZE, 1, flags);
break;
#endif
case CONT_PMD_SIZE:
case PMD_SIZE:
- __flush_tlb_range(vma, start, end, PMD_SIZE, last_level, 2);
+ __flush_tlb_range(vma, start, end, PMD_SIZE, 2, flags);
break;
case CONT_PTE_SIZE:
- __flush_tlb_range(vma, start, end, PAGE_SIZE, last_level, 3);
+ __flush_tlb_range(vma, start, end, PAGE_SIZE, 3, flags);
break;
default:
- __flush_tlb_range(vma, start, end, PAGE_SIZE, last_level, TLBI_TTL_UNKNOWN);
+ __flush_tlb_range(vma, start, end, PAGE_SIZE, TLBI_TTL_UNKNOWN, flags);
}
}
@@ -98,7 +98,7 @@ static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma,
{
unsigned long stride = huge_page_size(hstate_vma(vma));
- __flush_hugetlb_tlb_range(vma, start, end, stride, false);
+ __flush_hugetlb_tlb_range(vma, start, end, stride, TLBF_NONE);
}
#endif /* __ASM_HUGETLB_H */
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index 72ea4bda79f3..abe8218b2325 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -60,126 +60,10 @@
* of KERNEL_HWCAP_{feature}.
*/
#define __khwcap_feature(x) const_ilog2(HWCAP_ ## x)
-#define KERNEL_HWCAP_FP __khwcap_feature(FP)
-#define KERNEL_HWCAP_ASIMD __khwcap_feature(ASIMD)
-#define KERNEL_HWCAP_EVTSTRM __khwcap_feature(EVTSTRM)
-#define KERNEL_HWCAP_AES __khwcap_feature(AES)
-#define KERNEL_HWCAP_PMULL __khwcap_feature(PMULL)
-#define KERNEL_HWCAP_SHA1 __khwcap_feature(SHA1)
-#define KERNEL_HWCAP_SHA2 __khwcap_feature(SHA2)
-#define KERNEL_HWCAP_CRC32 __khwcap_feature(CRC32)
-#define KERNEL_HWCAP_ATOMICS __khwcap_feature(ATOMICS)
-#define KERNEL_HWCAP_FPHP __khwcap_feature(FPHP)
-#define KERNEL_HWCAP_ASIMDHP __khwcap_feature(ASIMDHP)
-#define KERNEL_HWCAP_CPUID __khwcap_feature(CPUID)
-#define KERNEL_HWCAP_ASIMDRDM __khwcap_feature(ASIMDRDM)
-#define KERNEL_HWCAP_JSCVT __khwcap_feature(JSCVT)
-#define KERNEL_HWCAP_FCMA __khwcap_feature(FCMA)
-#define KERNEL_HWCAP_LRCPC __khwcap_feature(LRCPC)
-#define KERNEL_HWCAP_DCPOP __khwcap_feature(DCPOP)
-#define KERNEL_HWCAP_SHA3 __khwcap_feature(SHA3)
-#define KERNEL_HWCAP_SM3 __khwcap_feature(SM3)
-#define KERNEL_HWCAP_SM4 __khwcap_feature(SM4)
-#define KERNEL_HWCAP_ASIMDDP __khwcap_feature(ASIMDDP)
-#define KERNEL_HWCAP_SHA512 __khwcap_feature(SHA512)
-#define KERNEL_HWCAP_SVE __khwcap_feature(SVE)
-#define KERNEL_HWCAP_ASIMDFHM __khwcap_feature(ASIMDFHM)
-#define KERNEL_HWCAP_DIT __khwcap_feature(DIT)
-#define KERNEL_HWCAP_USCAT __khwcap_feature(USCAT)
-#define KERNEL_HWCAP_ILRCPC __khwcap_feature(ILRCPC)
-#define KERNEL_HWCAP_FLAGM __khwcap_feature(FLAGM)
-#define KERNEL_HWCAP_SSBS __khwcap_feature(SSBS)
-#define KERNEL_HWCAP_SB __khwcap_feature(SB)
-#define KERNEL_HWCAP_PACA __khwcap_feature(PACA)
-#define KERNEL_HWCAP_PACG __khwcap_feature(PACG)
-#define KERNEL_HWCAP_GCS __khwcap_feature(GCS)
-#define KERNEL_HWCAP_CMPBR __khwcap_feature(CMPBR)
-#define KERNEL_HWCAP_FPRCVT __khwcap_feature(FPRCVT)
-#define KERNEL_HWCAP_F8MM8 __khwcap_feature(F8MM8)
-#define KERNEL_HWCAP_F8MM4 __khwcap_feature(F8MM4)
-#define KERNEL_HWCAP_SVE_F16MM __khwcap_feature(SVE_F16MM)
-#define KERNEL_HWCAP_SVE_ELTPERM __khwcap_feature(SVE_ELTPERM)
-#define KERNEL_HWCAP_SVE_AES2 __khwcap_feature(SVE_AES2)
-#define KERNEL_HWCAP_SVE_BFSCALE __khwcap_feature(SVE_BFSCALE)
-#define KERNEL_HWCAP_SVE2P2 __khwcap_feature(SVE2P2)
-#define KERNEL_HWCAP_SME2P2 __khwcap_feature(SME2P2)
-#define KERNEL_HWCAP_SME_SBITPERM __khwcap_feature(SME_SBITPERM)
-#define KERNEL_HWCAP_SME_AES __khwcap_feature(SME_AES)
-#define KERNEL_HWCAP_SME_SFEXPA __khwcap_feature(SME_SFEXPA)
-#define KERNEL_HWCAP_SME_STMOP __khwcap_feature(SME_STMOP)
-#define KERNEL_HWCAP_SME_SMOP4 __khwcap_feature(SME_SMOP4)
-
#define __khwcap2_feature(x) (const_ilog2(HWCAP2_ ## x) + 64)
-#define KERNEL_HWCAP_DCPODP __khwcap2_feature(DCPODP)
-#define KERNEL_HWCAP_SVE2 __khwcap2_feature(SVE2)
-#define KERNEL_HWCAP_SVEAES __khwcap2_feature(SVEAES)
-#define KERNEL_HWCAP_SVEPMULL __khwcap2_feature(SVEPMULL)
-#define KERNEL_HWCAP_SVEBITPERM __khwcap2_feature(SVEBITPERM)
-#define KERNEL_HWCAP_SVESHA3 __khwcap2_feature(SVESHA3)
-#define KERNEL_HWCAP_SVESM4 __khwcap2_feature(SVESM4)
-#define KERNEL_HWCAP_FLAGM2 __khwcap2_feature(FLAGM2)
-#define KERNEL_HWCAP_FRINT __khwcap2_feature(FRINT)
-#define KERNEL_HWCAP_SVEI8MM __khwcap2_feature(SVEI8MM)
-#define KERNEL_HWCAP_SVEF32MM __khwcap2_feature(SVEF32MM)
-#define KERNEL_HWCAP_SVEF64MM __khwcap2_feature(SVEF64MM)
-#define KERNEL_HWCAP_SVEBF16 __khwcap2_feature(SVEBF16)
-#define KERNEL_HWCAP_I8MM __khwcap2_feature(I8MM)
-#define KERNEL_HWCAP_BF16 __khwcap2_feature(BF16)
-#define KERNEL_HWCAP_DGH __khwcap2_feature(DGH)
-#define KERNEL_HWCAP_RNG __khwcap2_feature(RNG)
-#define KERNEL_HWCAP_BTI __khwcap2_feature(BTI)
-#define KERNEL_HWCAP_MTE __khwcap2_feature(MTE)
-#define KERNEL_HWCAP_ECV __khwcap2_feature(ECV)
-#define KERNEL_HWCAP_AFP __khwcap2_feature(AFP)
-#define KERNEL_HWCAP_RPRES __khwcap2_feature(RPRES)
-#define KERNEL_HWCAP_MTE3 __khwcap2_feature(MTE3)
-#define KERNEL_HWCAP_SME __khwcap2_feature(SME)
-#define KERNEL_HWCAP_SME_I16I64 __khwcap2_feature(SME_I16I64)
-#define KERNEL_HWCAP_SME_F64F64 __khwcap2_feature(SME_F64F64)
-#define KERNEL_HWCAP_SME_I8I32 __khwcap2_feature(SME_I8I32)
-#define KERNEL_HWCAP_SME_F16F32 __khwcap2_feature(SME_F16F32)
-#define KERNEL_HWCAP_SME_B16F32 __khwcap2_feature(SME_B16F32)
-#define KERNEL_HWCAP_SME_F32F32 __khwcap2_feature(SME_F32F32)
-#define KERNEL_HWCAP_SME_FA64 __khwcap2_feature(SME_FA64)
-#define KERNEL_HWCAP_WFXT __khwcap2_feature(WFXT)
-#define KERNEL_HWCAP_EBF16 __khwcap2_feature(EBF16)
-#define KERNEL_HWCAP_SVE_EBF16 __khwcap2_feature(SVE_EBF16)
-#define KERNEL_HWCAP_CSSC __khwcap2_feature(CSSC)
-#define KERNEL_HWCAP_RPRFM __khwcap2_feature(RPRFM)
-#define KERNEL_HWCAP_SVE2P1 __khwcap2_feature(SVE2P1)
-#define KERNEL_HWCAP_SME2 __khwcap2_feature(SME2)
-#define KERNEL_HWCAP_SME2P1 __khwcap2_feature(SME2P1)
-#define KERNEL_HWCAP_SME_I16I32 __khwcap2_feature(SME_I16I32)
-#define KERNEL_HWCAP_SME_BI32I32 __khwcap2_feature(SME_BI32I32)
-#define KERNEL_HWCAP_SME_B16B16 __khwcap2_feature(SME_B16B16)
-#define KERNEL_HWCAP_SME_F16F16 __khwcap2_feature(SME_F16F16)
-#define KERNEL_HWCAP_MOPS __khwcap2_feature(MOPS)
-#define KERNEL_HWCAP_HBC __khwcap2_feature(HBC)
-#define KERNEL_HWCAP_SVE_B16B16 __khwcap2_feature(SVE_B16B16)
-#define KERNEL_HWCAP_LRCPC3 __khwcap2_feature(LRCPC3)
-#define KERNEL_HWCAP_LSE128 __khwcap2_feature(LSE128)
-#define KERNEL_HWCAP_FPMR __khwcap2_feature(FPMR)
-#define KERNEL_HWCAP_LUT __khwcap2_feature(LUT)
-#define KERNEL_HWCAP_FAMINMAX __khwcap2_feature(FAMINMAX)
-#define KERNEL_HWCAP_F8CVT __khwcap2_feature(F8CVT)
-#define KERNEL_HWCAP_F8FMA __khwcap2_feature(F8FMA)
-#define KERNEL_HWCAP_F8DP4 __khwcap2_feature(F8DP4)
-#define KERNEL_HWCAP_F8DP2 __khwcap2_feature(F8DP2)
-#define KERNEL_HWCAP_F8E4M3 __khwcap2_feature(F8E4M3)
-#define KERNEL_HWCAP_F8E5M2 __khwcap2_feature(F8E5M2)
-#define KERNEL_HWCAP_SME_LUTV2 __khwcap2_feature(SME_LUTV2)
-#define KERNEL_HWCAP_SME_F8F16 __khwcap2_feature(SME_F8F16)
-#define KERNEL_HWCAP_SME_F8F32 __khwcap2_feature(SME_F8F32)
-#define KERNEL_HWCAP_SME_SF8FMA __khwcap2_feature(SME_SF8FMA)
-#define KERNEL_HWCAP_SME_SF8DP4 __khwcap2_feature(SME_SF8DP4)
-#define KERNEL_HWCAP_SME_SF8DP2 __khwcap2_feature(SME_SF8DP2)
-#define KERNEL_HWCAP_POE __khwcap2_feature(POE)
-
#define __khwcap3_feature(x) (const_ilog2(HWCAP3_ ## x) + 128)
-#define KERNEL_HWCAP_MTE_FAR __khwcap3_feature(MTE_FAR)
-#define KERNEL_HWCAP_MTE_STORE_ONLY __khwcap3_feature(MTE_STORE_ONLY)
-#define KERNEL_HWCAP_LSFE __khwcap3_feature(LSFE)
-#define KERNEL_HWCAP_LS64 __khwcap3_feature(LS64)
+
+#include "asm/kernel-hwcap.h"
/*
* This yields a mask that user programs can use to figure out what
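
The generated asm/kernel-hwcap.h now provides the KERNEL_HWCAP_* definitions
that were previously listed by hand; the numbering scheme itself is unchanged.
A toy illustration of that scheme, using a made-up HWCAP2_FOO bit and a
simplified stand-in for const_ilog2():

    #include <stdio.h>

    #define HWCAP2_FOO              (1UL << 5)                  /* made-up example bit */
    #define const_ilog2(x)          (63 - __builtin_clzll(x))   /* simplified stand-in */
    #define __khwcap2_feature(x)    (const_ilog2(HWCAP2_ ## x) + 64)

    int main(void)
    {
        /* Bit 5 of the second ELF hwcap word becomes kernel hwcap number 69. */
        printf("KERNEL_HWCAP_FOO = %d\n", (int)__khwcap2_feature(FOO));
        return 0;
    }
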
diff --git a/arch/arm64/include/asm/lsui.h b/arch/arm64/include/asm/lsui.h
new file mode 100644
index 000000000000..8f0d81953eb6
--- /dev/null
+++ b/arch/arm64/include/asm/lsui.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_LSUI_H
+#define __ASM_LSUI_H
+
+#include <linux/compiler_types.h>
+#include <linux/stringify.h>
+#include <asm/alternative.h>
+#include <asm/alternative-macros.h>
+#include <asm/cpucaps.h>
+
+#define __LSUI_PREAMBLE ".arch_extension lsui\n"
+
+#ifdef CONFIG_ARM64_LSUI
+
+#define __lsui_llsc_body(op, ...) \
+({ \
+ alternative_has_cap_unlikely(ARM64_HAS_LSUI) ? \
+ __lsui_##op(__VA_ARGS__) : __llsc_##op(__VA_ARGS__); \
+})
+
+#else /* CONFIG_ARM64_LSUI */
+
+#define __lsui_llsc_body(op, ...) __llsc_##op(__VA_ARGS__)
+
+#endif /* CONFIG_ARM64_LSUI */
+
+#endif /* __ASM_LSUI_H */
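
A toy model of the token-pasting dispatch above, outside the kernel:
has_lsui() stands in for the ARM64_HAS_LSUI capability check and the two
implementations just add, so this only illustrates how a single call site
selects the LSUI or the LL/SC variant:

    #include <stdbool.h>
    #include <stdio.h>

    static bool has_lsui(void) { return false; }   /* pretend the CPU lacks LSUI */

    static int __llsc_add(int a, int b) { puts("LL/SC path"); return a + b; }
    static int __lsui_add(int a, int b) { puts("LSUI path");  return a + b; }

    #define __lsui_llsc_body(op, ...) \
        (has_lsui() ? __lsui_##op(__VA_ARGS__) : __llsc_##op(__VA_ARGS__))

    int main(void)
    {
        printf("%d\n", __lsui_llsc_body(add, 2, 3));
        return 0;
    }
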
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 137a173df1ff..5e1211c540ab 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -10,20 +10,12 @@
#define MMCF_AARCH32 0x1 /* mm context flag for AArch32 executables */
#define USER_ASID_BIT 48
#define USER_ASID_FLAG (UL(1) << USER_ASID_BIT)
-#define TTBR_ASID_MASK (UL(0xffff) << 48)
#ifndef __ASSEMBLER__
#include <linux/refcount.h>
#include <asm/cpufeature.h>
-enum pgtable_type {
- TABLE_PTE,
- TABLE_PMD,
- TABLE_PUD,
- TABLE_P4D,
-};
-
typedef struct {
atomic64_t id;
#ifdef CONFIG_COMPAT
@@ -112,5 +104,7 @@ void kpti_install_ng_mappings(void);
static inline void kpti_install_ng_mappings(void) {}
#endif
+extern bool page_alloc_available;
+
#endif /* !__ASSEMBLER__ */
#endif
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index cc80af59c69e..803b68758152 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -210,7 +210,8 @@ static inline void update_saved_ttbr0(struct task_struct *tsk,
if (mm == &init_mm)
ttbr = phys_to_ttbr(__pa_symbol(reserved_pg_dir));
else
- ttbr = phys_to_ttbr(virt_to_phys(mm->pgd)) | ASID(mm) << 48;
+ ttbr = phys_to_ttbr(virt_to_phys(mm->pgd)) |
+ FIELD_PREP(TTBRx_EL1_ASID_MASK, ASID(mm));
WRITE_ONCE(task_thread_info(tsk)->ttbr0, ttbr);
}
diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h
new file mode 100644
index 000000000000..70d396e7b6da
--- /dev/null
+++ b/arch/arm64/include/asm/mpam.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2025 Arm Ltd. */
+
+#ifndef __ASM__MPAM_H
+#define __ASM__MPAM_H
+
+#include <linux/arm_mpam.h>
+#include <linux/bitfield.h>
+#include <linux/jump_label.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+
+#include <asm/sysreg.h>
+
+DECLARE_STATIC_KEY_FALSE(mpam_enabled);
+DECLARE_PER_CPU(u64, arm64_mpam_default);
+DECLARE_PER_CPU(u64, arm64_mpam_current);
+
+/*
+ * The value of the MPAM0_EL1 sysreg when a task is in resctrl's default group.
+ * This is used by the context switch code to use the resctrl CPU property
+ * instead. The value is modified when CDP is enabled/disabled by mounting
+ * the resctrl filesystem.
+ */
+extern u64 arm64_mpam_global_default;
+
+#ifdef CONFIG_ARM64_MPAM
+static inline u64 __mpam_regval(u16 partid_d, u16 partid_i, u8 pmg_d, u8 pmg_i)
+{
+ return FIELD_PREP(MPAM0_EL1_PARTID_D, partid_d) |
+ FIELD_PREP(MPAM0_EL1_PARTID_I, partid_i) |
+ FIELD_PREP(MPAM0_EL1_PMG_D, pmg_d) |
+ FIELD_PREP(MPAM0_EL1_PMG_I, pmg_i);
+}
+
+static inline void mpam_set_cpu_defaults(int cpu, u16 partid_d, u16 partid_i,
+ u8 pmg_d, u8 pmg_i)
+{
+ u64 default_val = __mpam_regval(partid_d, partid_i, pmg_d, pmg_i);
+
+ WRITE_ONCE(per_cpu(arm64_mpam_default, cpu), default_val);
+}
+
+/*
+ * The resctrl filesystem writes to the partid/pmg values for threads and CPUs,
+ * which may race with reads in mpam_thread_switch(). Ensure only one of the old
+ * or new values is used. Particular care should be taken with the pmg field as
+ * mpam_thread_switch() may read a partid and pmg that don't match, causing this
+ * value to be stored with cache allocations, despite being considered 'free' by
+ * resctrl.
+ */
+static inline u64 mpam_get_regval(struct task_struct *tsk)
+{
+ return READ_ONCE(task_thread_info(tsk)->mpam_partid_pmg);
+}
+
+static inline void mpam_set_task_partid_pmg(struct task_struct *tsk,
+ u16 partid_d, u16 partid_i,
+ u8 pmg_d, u8 pmg_i)
+{
+ u64 regval = __mpam_regval(partid_d, partid_i, pmg_d, pmg_i);
+
+ WRITE_ONCE(task_thread_info(tsk)->mpam_partid_pmg, regval);
+}
+
+static inline void mpam_thread_switch(struct task_struct *tsk)
+{
+ u64 oldregval;
+ int cpu = smp_processor_id();
+ u64 regval = mpam_get_regval(tsk);
+
+ if (!static_branch_likely(&mpam_enabled))
+ return;
+
+ if (regval == READ_ONCE(arm64_mpam_global_default))
+ regval = READ_ONCE(per_cpu(arm64_mpam_default, cpu));
+
+ oldregval = READ_ONCE(per_cpu(arm64_mpam_current, cpu));
+ if (oldregval == regval)
+ return;
+
+ write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1);
+ if (system_supports_sme())
+ write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), SYS_MPAMSM_EL1);
+ isb();
+
+ /* Synchronising the EL0 write is left until the ERET to EL0 */
+ write_sysreg_s(regval, SYS_MPAM0_EL1);
+
+ WRITE_ONCE(per_cpu(arm64_mpam_current, cpu), regval);
+}
+#else
+static inline void mpam_thread_switch(struct task_struct *tsk) {}
+#endif /* CONFIG_ARM64_MPAM */
+
+#endif /* __ASM__MPAM_H */
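
The MPAM0_EL1 field packing done by __mpam_regval() can be reproduced
stand-alone. The shifts below assume the layout from the Arm MPAM
specification (PARTID_I [15:0], PARTID_D [31:16], PMG_I [39:32],
PMG_D [47:40]); the kernel instead uses FIELD_PREP() with the MPAM0_EL1_*
masks generated from arch/arm64/tools/sysreg, so this only illustrates the
encoding:

    #include <stdint.h>
    #include <stdio.h>

    #define PARTID_I_SHIFT  0
    #define PARTID_D_SHIFT  16
    #define PMG_I_SHIFT     32
    #define PMG_D_SHIFT     40

    static uint64_t mpam_regval(uint16_t partid_d, uint16_t partid_i,
                                uint8_t pmg_d, uint8_t pmg_i)
    {
        return ((uint64_t)partid_d << PARTID_D_SHIFT) |
               ((uint64_t)partid_i << PARTID_I_SHIFT) |
               ((uint64_t)pmg_d << PMG_D_SHIFT) |
               ((uint64_t)pmg_i << PMG_I_SHIFT);
    }

    int main(void)
    {
        /* A task placed in PARTID 3 with PMG 1 for both I and D accesses. */
        printf("MPAM0_EL1 = %#llx\n",
               (unsigned long long)mpam_regval(3, 3, 1, 1));
        return 0;
    }
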
diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
index 6d4a78b9dc3e..7f7b97e09996 100644
--- a/arch/arm64/include/asm/mte.h
+++ b/arch/arm64/include/asm/mte.h
@@ -252,6 +252,9 @@ static inline void mte_check_tfsr_entry(void)
if (!kasan_hw_tags_enabled())
return;
+ if (!system_uses_mte_async_or_asymm_mode())
+ return;
+
mte_check_tfsr_el1();
}
@@ -260,6 +263,9 @@ static inline void mte_check_tfsr_exit(void)
if (!kasan_hw_tags_enabled())
return;
+ if (!system_uses_mte_async_or_asymm_mode())
+ return;
+
/*
* The asynchronous faults are sync'ed automatically with
* TFSR_EL1 on kernel entry but for exit an explicit dsb()
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index d49180bb7cb3..72f31800c703 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -223,8 +223,6 @@
*/
#define S1_TABLE_AP (_AT(pmdval_t, 3) << 61)
-#define TTBR_CNP_BIT (UL(1) << 0)
-
/*
* TCR flags.
*/
@@ -287,9 +285,12 @@
#endif
#ifdef CONFIG_ARM64_VA_BITS_52
+#define PTRS_PER_PGD_52_VA (UL(1) << (52 - PGDIR_SHIFT))
+#define PTRS_PER_PGD_48_VA (UL(1) << (48 - PGDIR_SHIFT))
+#define PTRS_PER_PGD_EXTRA (PTRS_PER_PGD_52_VA - PTRS_PER_PGD_48_VA)
+
/* Must be at least 64-byte aligned to prevent corruption of the TTBR */
-#define TTBR1_BADDR_4852_OFFSET (((UL(1) << (52 - PGDIR_SHIFT)) - \
- (UL(1) << (48 - PGDIR_SHIFT))) * 8)
+#define TTBR1_BADDR_4852_OFFSET (PTRS_PER_PGD_EXTRA << PTDESC_ORDER)
#endif
#endif
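
A worked example of the rewritten TTBR1_BADDR_4852_OFFSET, assuming the
64K-page, 52-bit VA configuration where PGDIR_SHIFT is 42 and PTDESC_ORDER
is 3 (8-byte descriptors); other configurations give different numbers, and
both forms of the expression yield the same offset:

    #include <stdio.h>

    #define PGDIR_SHIFT     42      /* assumed: 64K pages, 52-bit VA */
    #define PTDESC_ORDER    3       /* log2 of an 8-byte page-table descriptor */

    int main(void)
    {
        unsigned long ptrs_52 = 1UL << (52 - PGDIR_SHIFT);  /* 1024 PGD entries */
        unsigned long ptrs_48 = 1UL << (48 - PGDIR_SHIFT);  /*   64 PGD entries */
        unsigned long old_expr = (ptrs_52 - ptrs_48) * 8;
        unsigned long new_expr = (ptrs_52 - ptrs_48) << PTDESC_ORDER;

        printf("offset: %lu == %lu bytes\n", old_expr, new_expr);   /* 7680 */
        return 0;
    }
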
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index f560e6420267..212ce1b02e15 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -25,6 +25,8 @@
*/
#define PTE_PRESENT_INVALID (PTE_NG) /* only when !PTE_VALID */
+#define PTE_PRESENT_VALID_KERNEL (PTE_VALID | PTE_MAYBE_NG)
+
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
#define PTE_UFFD_WP (_AT(pteval_t, 1) << 58) /* uffd-wp tracking */
#define PTE_SWP_UFFD_WP (_AT(pteval_t, 1) << 3) /* only for swp ptes */
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index b3e58735c49b..308e29e829b8 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -89,9 +89,9 @@ static inline void arch_leave_lazy_mmu_mode(void)
/* Set stride and tlb_level in flush_*_tlb_range */
#define flush_pmd_tlb_range(vma, addr, end) \
- __flush_tlb_range(vma, addr, end, PMD_SIZE, false, 2)
+ __flush_tlb_range(vma, addr, end, PMD_SIZE, 2, TLBF_NONE)
#define flush_pud_tlb_range(vma, addr, end) \
- __flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1)
+ __flush_tlb_range(vma, addr, end, PUD_SIZE, 1, TLBF_NONE)
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
@@ -101,10 +101,11 @@ static inline void arch_leave_lazy_mmu_mode(void)
* entries exist.
*/
#define flush_tlb_fix_spurious_fault(vma, address, ptep) \
- local_flush_tlb_page_nonotify(vma, address)
+ __flush_tlb_page(vma, address, TLBF_NOBROADCAST | TLBF_NONOTIFY)
-#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) \
- local_flush_tlb_page_nonotify(vma, address)
+#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) \
+ __flush_tlb_range(vma, address, address + PMD_SIZE, PMD_SIZE, 2, \
+ TLBF_NOBROADCAST | TLBF_NONOTIFY | TLBF_NOWALKCACHE)
/*
* ZERO_PAGE is a global shared page that is always zero: used
@@ -322,9 +323,11 @@ static inline pte_t pte_mknoncont(pte_t pte)
return clear_pte_bit(pte, __pgprot(PTE_CONT));
}
-static inline pte_t pte_mkvalid(pte_t pte)
+static inline pte_t pte_mkvalid_k(pte_t pte)
{
- return set_pte_bit(pte, __pgprot(PTE_VALID));
+ pte = clear_pte_bit(pte, __pgprot(PTE_PRESENT_INVALID));
+ pte = set_pte_bit(pte, __pgprot(PTE_PRESENT_VALID_KERNEL));
+ return pte;
}
static inline pte_t pte_mkinvalid(pte_t pte)
@@ -594,6 +597,7 @@ static inline int pmd_protnone(pmd_t pmd)
#define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd)))
+#define pmd_mkvalid_k(pmd) pte_pmd(pte_mkvalid_k(pmd_pte(pmd)))
#define pmd_mkinvalid(pmd) pte_pmd(pte_mkinvalid(pmd_pte(pmd)))
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
#define pmd_uffd_wp(pmd) pte_uffd_wp(pmd_pte(pmd))
@@ -635,6 +639,8 @@ static inline pmd_t pmd_mkspecial(pmd_t pmd)
#define pud_young(pud) pte_young(pud_pte(pud))
#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud)))
+#define pud_mkwrite_novma(pud) pte_pud(pte_mkwrite_novma(pud_pte(pud)))
+#define pud_mkvalid_k(pud) pte_pud(pte_mkvalid_k(pud_pte(pud)))
#define pud_write(pud) pte_write(pud_pte(pud))
static inline pud_t pud_mkhuge(pud_t pud)
@@ -779,9 +785,13 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
#define pmd_table(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \
PMD_TYPE_TABLE)
-#define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \
- PMD_TYPE_SECT)
-#define pmd_leaf(pmd) (pmd_present(pmd) && !pmd_table(pmd))
+
+#define pmd_leaf pmd_leaf
+static inline bool pmd_leaf(pmd_t pmd)
+{
+ return pmd_present(pmd) && !pmd_table(pmd);
+}
+
#define pmd_bad(pmd) (!pmd_table(pmd))
#define pmd_leaf_size(pmd) (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE)
@@ -799,11 +809,8 @@ static inline int pmd_trans_huge(pmd_t pmd)
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
-static inline bool pud_sect(pud_t pud) { return false; }
static inline bool pud_table(pud_t pud) { return true; }
#else
-#define pud_sect(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \
- PUD_TYPE_SECT)
#define pud_table(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \
PUD_TYPE_TABLE)
#endif
@@ -873,7 +880,11 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
PUD_TYPE_TABLE)
#define pud_present(pud) pte_present(pud_pte(pud))
#ifndef __PAGETABLE_PMD_FOLDED
-#define pud_leaf(pud) (pud_present(pud) && !pud_table(pud))
+#define pud_leaf pud_leaf
+static inline bool pud_leaf(pud_t pud)
+{
+ return pud_present(pud) && !pud_table(pud);
+}
#else
#define pud_leaf(pud) false
#endif
@@ -1247,9 +1258,18 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
return pte_pmd(pte_modify(pmd_pte(pmd), newprot));
}
-extern int __ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep,
- pte_t entry, int dirty);
+extern int __ptep_set_access_flags_anysz(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty,
+ unsigned long pgsize);
+
+static inline int __ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ return __ptep_set_access_flags_anysz(vma, address, ptep, entry, dirty,
+ PAGE_SIZE);
+}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
@@ -1257,8 +1277,8 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
pmd_t entry, int dirty)
{
- return __ptep_set_access_flags(vma, address, (pte_t *)pmdp,
- pmd_pte(entry), dirty);
+ return __ptep_set_access_flags_anysz(vma, address, (pte_t *)pmdp,
+ pmd_pte(entry), dirty, PMD_SIZE);
}
#endif
@@ -1320,7 +1340,7 @@ static inline int __ptep_clear_flush_young(struct vm_area_struct *vma,
* context-switch, which provides a DSB to complete the TLB
* invalidation.
*/
- flush_tlb_page_nosync(vma, address);
+ __flush_tlb_page(vma, address, TLBF_NOSYNC);
}
return young;
diff --git a/arch/arm64/include/asm/resctrl.h b/arch/arm64/include/asm/resctrl.h
new file mode 100644
index 000000000000..b506e95cf6e3
--- /dev/null
+++ b/arch/arm64/include/asm/resctrl.h
@@ -0,0 +1,2 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/arm_mpam.h>
diff --git a/arch/arm64/include/asm/rwonce.h b/arch/arm64/include/asm/rwonce.h
index fc0fb42b0b64..0f3a01d30f66 100644
--- a/arch/arm64/include/asm/rwonce.h
+++ b/arch/arm64/include/asm/rwonce.h
@@ -20,6 +20,17 @@
ARM64_HAS_LDAPR)
/*
+ * Replace this with typeof_unqual() when minimum compiler versions are
+ * increased to GCC 14 and Clang 19. For the time being, we need this
+ * workaround, which relies on function return values dropping qualifiers.
+ */
+#define __rwonce_typeof_unqual(x) typeof(({ \
+ __diag_push() \
+ __diag_ignore_all("-Wignored-qualifiers", "") \
+ ((typeof(x)(*)(void))0)(); \
+ __diag_pop() }))
+
+/*
* When building with LTO, there is an increased risk of the compiler
* converting an address dependency headed by a READ_ONCE() invocation
* into a control dependency and consequently allowing for harmful
@@ -31,9 +42,12 @@
*/
#define __READ_ONCE(x) \
({ \
- typeof(&(x)) __x = &(x); \
- int atomic = 1; \
- union { __unqual_scalar_typeof(*__x) __val; char __c[1]; } __u; \
+ auto __x = &(x); \
+ auto __ret = (__rwonce_typeof_unqual(*__x) *)__x; \
+ /* Hides alias reassignment from Clang's -Wthread-safety. */ \
+ auto __retp = &__ret; \
+ union { typeof(*__ret) __val; char __c[1]; } __u; \
+ *__retp = &__u.__val; \
switch (sizeof(x)) { \
case 1: \
asm volatile(__LOAD_RCPC(b, %w0, %1) \
@@ -56,9 +70,9 @@
: "Q" (*__x) : "memory"); \
break; \
default: \
- atomic = 0; \
+ __u.__val = *(volatile typeof(*__x) *)__x; \
} \
- atomic ? (typeof(*__x))__u.__val : (*(volatile typeof(*__x) *)__x);\
+ *__ret; \
})
#endif /* !BUILD_VDSO */
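
The qualifier-dropping trick behind __rwonce_typeof_unqual() can be shown in
isolation: per the comment above, the type of a function-call expression loses
qualifiers such as volatile, so applying typeof() to a call through a (never
evaluated) function-pointer cast yields the unqualified type. This is a sketch
of that observation only; building with -Wextra may still emit the
-Wignored-qualifiers warning that the kernel suppresses with
__diag_ignore_all():

    #include <stdio.h>

    #define typeof_unqual_ish(x)    typeof(((typeof(x)(*)(void))0)())

    int main(void)
    {
        volatile int v = 42;
        typeof_unqual_ish(v) copy = v;  /* 'copy' is a plain int */

        copy++;                         /* no volatile access for this increment */
        printf("%d\n", copy);
        return 0;
    }
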
diff --git a/arch/arm64/include/asm/scs.h b/arch/arm64/include/asm/scs.h
index 0fbc2e7867d3..a15a2968e7b6 100644
--- a/arch/arm64/include/asm/scs.h
+++ b/arch/arm64/include/asm/scs.h
@@ -10,6 +10,11 @@
#ifdef CONFIG_SHADOW_CALL_STACK
scs_sp .req x18
+ .macro scs_load_current_base
+ get_current_task scs_sp
+ ldr scs_sp, [scs_sp, #TSK_TI_SCS_BASE]
+ .endm
+
.macro scs_load_current
get_current_task scs_sp
ldr scs_sp, [scs_sp, #TSK_TI_SCS_SP]
@@ -19,6 +24,9 @@
str scs_sp, [\tsk, #TSK_TI_SCS_SP]
.endm
#else
+ .macro scs_load_current_base
+ .endm
+
.macro scs_load_current
.endm
diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h
index 7942478e4065..5d7fe3e153c8 100644
--- a/arch/arm64/include/asm/thread_info.h
+++ b/arch/arm64/include/asm/thread_info.h
@@ -42,6 +42,9 @@ struct thread_info {
void *scs_base;
void *scs_sp;
#endif
+#ifdef CONFIG_ARM64_MPAM
+ u64 mpam_partid_pmg;
+#endif
u32 cpu;
};
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index 8d762607285c..10869d7731b8 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -53,7 +53,7 @@ static inline int tlb_get_level(struct mmu_gather *tlb)
static inline void tlb_flush(struct mmu_gather *tlb)
{
struct vm_area_struct vma = TLB_FLUSH_VMA(tlb->mm, 0);
- bool last_level = !tlb->freed_tables;
+ tlbf_t flags = tlb->freed_tables ? TLBF_NONE : TLBF_NOWALKCACHE;
unsigned long stride = tlb_get_unmap_size(tlb);
int tlb_level = tlb_get_level(tlb);
@@ -63,13 +63,13 @@ static inline void tlb_flush(struct mmu_gather *tlb)
* reallocate our ASID without invalidating the entire TLB.
*/
if (tlb->fullmm) {
- if (!last_level)
+ if (tlb->freed_tables)
flush_tlb_mm(tlb->mm);
return;
}
__flush_tlb_range(&vma, tlb->start, tlb->end, stride,
- last_level, tlb_level);
+ tlb_level, flags);
}
static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
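
A toy restatement of the flag translation in tlb_flush() above: when no
page-table pages were freed, only leaf entries changed and the walk cache can
be left alone. The TLBF_* values here are made up purely for illustration;
the real definitions live in asm/tlbflush.h:

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned int tlbf_t;
    #define TLBF_NONE           0U
    #define TLBF_NOWALKCACHE    (1U << 0)   /* made-up value for illustration */

    static tlbf_t tlb_flush_flags(bool freed_tables)
    {
        return freed_tables ? TLBF_NONE : TLBF_NOWALKCACHE;
    }

    int main(void)
    {
        printf("tables freed:    %#x\n", tlb_flush_flags(true));
        printf("leaf-only unmap: %#x\n", tlb_flush_flags(false));
        return 0;
    }
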
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 1416e652612b..47fa4d39a461 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -97,24 +97,69 @@ static inline unsigned long get_trans_granule(void)
#define TLBI_TTL_UNKNOWN INT_MAX
-#define __tlbi_level(op, addr, level) do { \
- u64 arg = addr; \
- \
- if (alternative_has_cap_unlikely(ARM64_HAS_ARMv8_4_TTL) && \
- level >= 0 && level <= 3) { \
- u64 ttl = level & 3; \
- ttl |= get_trans_granule() << 2; \
- arg &= ~TLBI_TTL_MASK; \
- arg |= FIELD_PREP(TLBI_TTL_MASK, ttl); \
- } \
- \
- __tlbi(op, arg); \
-} while(0)
-
-#define __tlbi_user_level(op, arg, level) do { \
- if (arm64_kernel_unmapped_at_el0()) \
- __tlbi_level(op, (arg | USER_ASID_FLAG), level); \
-} while (0)
+typedef void (*tlbi_op)(u64 arg);
+
+static __always_inline void vae1is(u64 arg)
+{
+ __tlbi(vae1is, arg);
+ __tlbi_user(vae1is, arg);
+}
+
+static __always_inline void vae2is(u64 arg)
+{
+ __tlbi(vae2is, arg);
+}
+
+static __always_inline void vale1(u64 arg)
+{
+ __tlbi(vale1, arg);
+ __tlbi_user(vale1, arg);
+}
+
+static __always_inline void vale1is(u64 arg)
+{
+ __tlbi(vale1is, arg);
+ __tlbi_user(vale1is, arg);
+}
+
+static __always_inline void vale2is(u64 arg)
+{
+ __tlbi(vale2is, arg);
+}
+
+static __always_inline void vaale1is(u64 arg)
+{
+ __tlbi(vaale1is, arg);
+}
+
+static __always_inline void ipas2e1(u64 arg)
+{
+ __tlbi(ipas2e1, arg);
+}
+
+static __always_inline void ipas2e1is(u64 arg)
+{
+ __tlbi(ipas2e1is, arg);
+}
+
+static __always_inline void __tlbi_level_asid(tlbi_op op, u64 addr, u32 level,
+ u16 asid)
+{
+ u64 arg = __TLBI_VADDR(addr, asid);
+
+ if (alternative_has_cap_unlikely(ARM64_HAS_ARMv8_4_TTL) && level <= 3) {
+ u64 ttl = level | (get_trans_granule() << 2);
+
+ FIELD_MODIFY(TLBI_TTL_MASK, &arg, ttl);
+ }
+
+ op(arg);
+}
+
+static inline void __tlbi_level(tlbi_op op, u64 addr, u32 level)
+{
+ __tlbi_level_asid(op, addr, level, 0);
+}
/*
* This macro creates a properly formatted VA operand for the TLB RANGE. The
@@ -141,19 +186,6 @@ static inline unsigned long get_trans_granule(void)
#define TLBIR_TTL_MASK GENMASK_ULL(38, 37)
#define TLBIR_BADDR_MASK GENMASK_ULL(36, 0)
-#define __TLBI_VADDR_RANGE(baddr, asid, scale, num, ttl) \
- ({ \
- unsigned long __ta = 0; \
- unsigned long __ttl = (ttl >= 1 && ttl <= 3) ? ttl : 0; \
- __ta |= FIELD_PREP(TLBIR_BADDR_MASK, baddr); \
- __ta |= FIELD_PREP(TLBIR_TTL_MASK, __ttl); \
- __ta |= FIELD_PREP(TLBIR_NUM_MASK, num); \
- __ta |= FIELD_PREP(TLBIR_SCALE_MASK, scale); \
- __ta |= FIELD_PREP(TLBIR_TG_MASK, get_trans_granule()); \
- __ta |= FIELD_PREP(TLBIR_ASID_MASK, asid); \
- __ta; \
- })
-
/* These macros are used by the TLBI RANGE feature. */
#define __TLBI_RANGE_PAGES(num, scale) \
((unsigned long)((num) + 1) << (5 * (scale) + 1))
@@ -167,11 +199,7 @@ static inline unsigned long get_trans_granule(void)
* range.
*/
#define __TLBI_RANGE_NUM(pages, scale) \
- ({ \
- int __pages = min((pages), \
- __TLBI_RANGE_PAGES(31, (scale))); \
- (__pages >> (5 * (scale) + 1)) - 1; \
- })
+ (((pages) >> (5 * (scale) + 1)) - 1)
#define __repeat_tlbi_sync(op, arg...) \
do { \
@@ -241,10 +269,7 @@ static inline void __tlbi_sync_s1ish_hyp(void)
* unmapping pages from vmalloc/io space.
*
* flush_tlb_page(vma, addr)
- * Invalidate a single user mapping for address 'addr' in the
- * address space corresponding to 'vma->mm'. Note that this
- * operation only invalidates a single, last-level page-table
- * entry and therefore does not affect any walk-caches.
+ * Equivalent to __flush_tlb_page(..., flags=TLBF_NONE)
*
*
* Next, we have some undocumented invalidation routines that you probably
@@ -258,30 +283,28 @@ static inline void __tlbi_sync_s1ish_hyp(void)
* CPUs, ensuring that any walk-cache entries associated with the
* translation are also invalidated.
*
- * __flush_tlb_range(vma, start, end, stride, last_level, tlb_level)
+ * __flush_tlb_range(vma, start, end, stride, tlb_level, flags)
* Invalidate the virtual-address range '[start, end)' on all
* CPUs for the user address space corresponding to 'vma->mm'.
* The invalidation operations are issued at a granularity
- * determined by 'stride' and only affect any walk-cache entries
- * if 'last_level' is equal to false. tlb_level is the level at
+ * determined by 'stride'. tlb_level is the level at
* which the invalidation must take place. If the level is wrong,
* no invalidation may take place. In the case where the level
* cannot be easily determined, the value TLBI_TTL_UNKNOWN will
- * perform a non-hinted invalidation.
+ * perform a non-hinted invalidation. flags may be TLBF_NONE (0) or
+ * any combination of TLBF_NOWALKCACHE (elide eviction of walk
+ * cache entries), TLBF_NONOTIFY (don't call mmu notifiers),
+ * TLBF_NOSYNC (don't issue trailing dsb) and TLBF_NOBROADCAST
+ * (only perform the invalidation for the local cpu).
*
- * local_flush_tlb_page(vma, addr)
- * Local variant of flush_tlb_page(). Stale TLB entries may
- * remain in remote CPUs.
- *
- * local_flush_tlb_page_nonotify(vma, addr)
- * Same as local_flush_tlb_page() except MMU notifier will not be
- * called.
- *
- * local_flush_tlb_contpte(vma, addr)
- * Invalidate the virtual-address range
- * '[addr, addr+CONT_PTE_SIZE)' mapped with contpte on local CPU
- * for the user address space corresponding to 'vma->mm'. Stale
- * TLB entries may remain in remote CPUs.
+ * __flush_tlb_page(vma, addr, flags)
+ * Invalidate a single user mapping for address 'addr' in the
+ * address space corresponding to 'vma->mm'. Note that this
+ * operation only invalidates a single level 3 page-table entry
+ * and therefore does not affect any walk-caches. flags may contain
+ * any combination of TLBF_NONOTIFY (don't call mmu notifiers),
+ * TLBF_NOSYNC (don't issue trailing dsb) and TLBF_NOBROADCAST
+ * (only perform the invalidation for the local cpu).
*
* Finally, take a look at asm/tlb.h to see how tlb_flush() is implemented
* on top of these routines, since that is our interface to the mmu_gather
@@ -315,59 +338,6 @@ static inline void flush_tlb_mm(struct mm_struct *mm)
mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}
-static inline void __local_flush_tlb_page_nonotify_nosync(struct mm_struct *mm,
- unsigned long uaddr)
-{
- unsigned long addr;
-
- dsb(nshst);
- addr = __TLBI_VADDR(uaddr, ASID(mm));
- __tlbi(vale1, addr);
- __tlbi_user(vale1, addr);
-}
-
-static inline void local_flush_tlb_page_nonotify(struct vm_area_struct *vma,
- unsigned long uaddr)
-{
- __local_flush_tlb_page_nonotify_nosync(vma->vm_mm, uaddr);
- dsb(nsh);
-}
-
-static inline void local_flush_tlb_page(struct vm_area_struct *vma,
- unsigned long uaddr)
-{
- __local_flush_tlb_page_nonotify_nosync(vma->vm_mm, uaddr);
- mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, uaddr & PAGE_MASK,
- (uaddr & PAGE_MASK) + PAGE_SIZE);
- dsb(nsh);
-}
-
-static inline void __flush_tlb_page_nosync(struct mm_struct *mm,
- unsigned long uaddr)
-{
- unsigned long addr;
-
- dsb(ishst);
- addr = __TLBI_VADDR(uaddr, ASID(mm));
- __tlbi(vale1is, addr);
- __tlbi_user(vale1is, addr);
- mmu_notifier_arch_invalidate_secondary_tlbs(mm, uaddr & PAGE_MASK,
- (uaddr & PAGE_MASK) + PAGE_SIZE);
-}
-
-static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
- unsigned long uaddr)
-{
- return __flush_tlb_page_nosync(vma->vm_mm, uaddr);
-}
-
-static inline void flush_tlb_page(struct vm_area_struct *vma,
- unsigned long uaddr)
-{
- flush_tlb_page_nosync(vma, uaddr);
- __tlbi_sync_s1ish();
-}
-
static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
{
return true;
@@ -397,14 +367,13 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
/*
* __flush_tlb_range_op - Perform TLBI operation upon a range
*
- * @op: TLBI instruction that operates on a range (has 'r' prefix)
+ * @lop: TLBI level operation to perform
+ * @rop: TLBI range operation to perform
* @start: The start address of the range
* @pages: Range as the number of pages from 'start'
* @stride: Flush granularity
* @asid: The ASID of the task (0 for IPA instructions)
- * @tlb_level: Translation Table level hint, if known
- * @tlbi_user: If 'true', call an additional __tlbi_user()
- * (typically for user ASIDs). 'flase' for IPA instructions
+ * @level: Translation Table level hint, if known
* @lpa2: If 'true', the lpa2 scheme is used as set out below
*
* When the CPU does not support TLB range operations, flush the TLB
@@ -427,116 +396,181 @@ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
* operations can only span an even number of pages. We save this for last to
* ensure 64KB start alignment is maintained for the LPA2 case.
*/
-#define __flush_tlb_range_op(op, start, pages, stride, \
- asid, tlb_level, tlbi_user, lpa2) \
-do { \
- typeof(start) __flush_start = start; \
- typeof(pages) __flush_pages = pages; \
- int num = 0; \
- int scale = 3; \
- int shift = lpa2 ? 16 : PAGE_SHIFT; \
- unsigned long addr; \
- \
- while (__flush_pages > 0) { \
- if (!system_supports_tlb_range() || \
- __flush_pages == 1 || \
- (lpa2 && __flush_start != ALIGN(__flush_start, SZ_64K))) { \
- addr = __TLBI_VADDR(__flush_start, asid); \
- __tlbi_level(op, addr, tlb_level); \
- if (tlbi_user) \
- __tlbi_user_level(op, addr, tlb_level); \
- __flush_start += stride; \
- __flush_pages -= stride >> PAGE_SHIFT; \
- continue; \
- } \
- \
- num = __TLBI_RANGE_NUM(__flush_pages, scale); \
- if (num >= 0) { \
- addr = __TLBI_VADDR_RANGE(__flush_start >> shift, asid, \
- scale, num, tlb_level); \
- __tlbi(r##op, addr); \
- if (tlbi_user) \
- __tlbi_user(r##op, addr); \
- __flush_start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \
- __flush_pages -= __TLBI_RANGE_PAGES(num, scale);\
- } \
- scale--; \
- } \
-} while (0)
+static __always_inline void rvae1is(u64 arg)
+{
+ __tlbi(rvae1is, arg);
+ __tlbi_user(rvae1is, arg);
+}
+
+static __always_inline void rvale1(u64 arg)
+{
+ __tlbi(rvale1, arg);
+ __tlbi_user(rvale1, arg);
+}
+
+static __always_inline void rvale1is(u64 arg)
+{
+ __tlbi(rvale1is, arg);
+ __tlbi_user(rvale1is, arg);
+}
+
+static __always_inline void rvaale1is(u64 arg)
+{
+ __tlbi(rvaale1is, arg);
+}
+
+static __always_inline void ripas2e1is(u64 arg)
+{
+ __tlbi(ripas2e1is, arg);
+}
+
+static __always_inline void __tlbi_range(tlbi_op op, u64 addr,
+ u16 asid, int scale, int num,
+ u32 level, bool lpa2)
+{
+ u64 arg = 0;
+
+ arg |= FIELD_PREP(TLBIR_BADDR_MASK, addr >> (lpa2 ? 16 : PAGE_SHIFT));
+ arg |= FIELD_PREP(TLBIR_TTL_MASK, level > 3 ? 0 : level);
+ arg |= FIELD_PREP(TLBIR_NUM_MASK, num);
+ arg |= FIELD_PREP(TLBIR_SCALE_MASK, scale);
+ arg |= FIELD_PREP(TLBIR_TG_MASK, get_trans_granule());
+ arg |= FIELD_PREP(TLBIR_ASID_MASK, asid);
+
+ op(arg);
+}
+
+static __always_inline void __flush_tlb_range_op(tlbi_op lop, tlbi_op rop,
+ u64 start, size_t pages,
+ u64 stride, u16 asid,
+ u32 level, bool lpa2)
+{
+ u64 addr = start, end = start + pages * PAGE_SIZE;
+ int scale = 3;
+
+ while (addr != end) {
+ int num;
+
+ pages = (end - addr) >> PAGE_SHIFT;
+
+ if (!system_supports_tlb_range() || pages == 1)
+ goto invalidate_one;
+
+ if (lpa2 && !IS_ALIGNED(addr, SZ_64K))
+ goto invalidate_one;
+
+ num = __TLBI_RANGE_NUM(pages, scale);
+ if (num >= 0) {
+ __tlbi_range(rop, addr, asid, scale, num, level, lpa2);
+ addr += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT;
+ }
+
+ scale--;
+ continue;
+invalidate_one:
+ __tlbi_level_asid(lop, addr, level, asid);
+ addr += stride;
+ }
+}
+
+#define __flush_s1_tlb_range_op(op, start, pages, stride, asid, tlb_level) \
+ __flush_tlb_range_op(op, r##op, start, pages, stride, asid, tlb_level, lpa2_is_enabled())
#define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
- __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled());
+ __flush_tlb_range_op(op, r##op, start, pages, stride, 0, tlb_level, kvm_lpa2_is_enabled())
-static inline bool __flush_tlb_range_limit_excess(unsigned long start,
- unsigned long end, unsigned long pages, unsigned long stride)
+static inline bool __flush_tlb_range_limit_excess(unsigned long pages,
+ unsigned long stride)
{
/*
- * When the system does not support TLB range based flush
- * operation, (MAX_DVM_OPS - 1) pages can be handled. But
- * with TLB range based operation, MAX_TLBI_RANGE_PAGES
- * pages can be handled.
+ * Assume that the worst case number of DVM ops required to flush a
+ * given range on a system that supports tlb-range is 20 (4 scales, 1
+ * final page, 15 for alignment on LPA2 systems), which is much smaller
+ * than MAX_DVM_OPS.
*/
- if ((!system_supports_tlb_range() &&
- (end - start) >= (MAX_DVM_OPS * stride)) ||
- pages > MAX_TLBI_RANGE_PAGES)
- return true;
+ if (system_supports_tlb_range())
+ return pages > MAX_TLBI_RANGE_PAGES;
- return false;
+ return pages >= (MAX_DVM_OPS * stride) >> PAGE_SHIFT;
}
-static inline void __flush_tlb_range_nosync(struct mm_struct *mm,
- unsigned long start, unsigned long end,
- unsigned long stride, bool last_level,
- int tlb_level)
+typedef unsigned __bitwise tlbf_t;
+
+/* No special behaviour. */
+#define TLBF_NONE ((__force tlbf_t)0)
+
+/* Invalidate tlb entries only, leaving the page table walk cache intact. */
+#define TLBF_NOWALKCACHE ((__force tlbf_t)BIT(0))
+
+/* Skip the trailing dsb after issuing tlbi. */
+#define TLBF_NOSYNC ((__force tlbf_t)BIT(1))
+
+/* Suppress tlb notifier callbacks for this flush operation. */
+#define TLBF_NONOTIFY ((__force tlbf_t)BIT(2))
+
+/* Perform the tlbi locally without broadcasting to other CPUs. */
+#define TLBF_NOBROADCAST ((__force tlbf_t)BIT(3))
+
+static __always_inline void __do_flush_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long stride, int tlb_level,
+ tlbf_t flags)
{
+ struct mm_struct *mm = vma->vm_mm;
unsigned long asid, pages;
- start = round_down(start, stride);
- end = round_up(end, stride);
pages = (end - start) >> PAGE_SHIFT;
- if (__flush_tlb_range_limit_excess(start, end, pages, stride)) {
+ if (__flush_tlb_range_limit_excess(pages, stride)) {
flush_tlb_mm(mm);
return;
}
- dsb(ishst);
+ if (!(flags & TLBF_NOBROADCAST))
+ dsb(ishst);
+ else
+ dsb(nshst);
+
asid = ASID(mm);
- if (last_level)
- __flush_tlb_range_op(vale1is, start, pages, stride, asid,
- tlb_level, true, lpa2_is_enabled());
- else
- __flush_tlb_range_op(vae1is, start, pages, stride, asid,
- tlb_level, true, lpa2_is_enabled());
+ switch (flags & (TLBF_NOWALKCACHE | TLBF_NOBROADCAST)) {
+ case TLBF_NONE:
+ __flush_s1_tlb_range_op(vae1is, start, pages, stride,
+ asid, tlb_level);
+ break;
+ case TLBF_NOWALKCACHE:
+ __flush_s1_tlb_range_op(vale1is, start, pages, stride,
+ asid, tlb_level);
+ break;
+ case TLBF_NOBROADCAST:
+ /* Combination unused */
+ BUG();
+ break;
+ case TLBF_NOWALKCACHE | TLBF_NOBROADCAST:
+ __flush_s1_tlb_range_op(vale1, start, pages, stride,
+ asid, tlb_level);
+ break;
+ }
- mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
+ if (!(flags & TLBF_NONOTIFY))
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
+
+ if (!(flags & TLBF_NOSYNC)) {
+ if (!(flags & TLBF_NOBROADCAST))
+ __tlbi_sync_s1ish();
+ else
+ dsb(nsh);
+ }
}
static inline void __flush_tlb_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- unsigned long stride, bool last_level,
- int tlb_level)
-{
- __flush_tlb_range_nosync(vma->vm_mm, start, end, stride,
- last_level, tlb_level);
- __tlbi_sync_s1ish();
-}
-
-static inline void local_flush_tlb_contpte(struct vm_area_struct *vma,
- unsigned long addr)
+ unsigned long stride, int tlb_level,
+ tlbf_t flags)
{
- unsigned long asid;
-
- addr = round_down(addr, CONT_PTE_SIZE);
-
- dsb(nshst);
- asid = ASID(vma->vm_mm);
- __flush_tlb_range_op(vale1, addr, CONT_PTES, PAGE_SIZE, asid,
- 3, true, lpa2_is_enabled());
- mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, addr,
- addr + CONT_PTE_SIZE);
- dsb(nsh);
+ start = round_down(start, stride);
+ end = round_up(end, stride);
+ __do_flush_tlb_range(vma, start, end, stride, tlb_level, flags);
}
static inline void flush_tlb_range(struct vm_area_struct *vma,
@@ -548,7 +582,23 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
* Set the tlb_level to TLBI_TTL_UNKNOWN because we can not get enough
* information here.
*/
- __flush_tlb_range(vma, start, end, PAGE_SIZE, false, TLBI_TTL_UNKNOWN);
+ __flush_tlb_range(vma, start, end, PAGE_SIZE, TLBI_TTL_UNKNOWN, TLBF_NONE);
+}
+
+static inline void __flush_tlb_page(struct vm_area_struct *vma,
+ unsigned long uaddr, tlbf_t flags)
+{
+ unsigned long start = round_down(uaddr, PAGE_SIZE);
+ unsigned long end = start + PAGE_SIZE;
+
+ __do_flush_tlb_range(vma, start, end, PAGE_SIZE, 3,
+ TLBF_NOWALKCACHE | flags);
+}
+
+static inline void flush_tlb_page(struct vm_area_struct *vma,
+ unsigned long uaddr)
+{
+ __flush_tlb_page(vma, uaddr, TLBF_NONE);
}
static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
@@ -560,14 +610,14 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end
end = round_up(end, stride);
pages = (end - start) >> PAGE_SHIFT;
- if (__flush_tlb_range_limit_excess(start, end, pages, stride)) {
+ if (__flush_tlb_range_limit_excess(pages, stride)) {
flush_tlb_all();
return;
}
dsb(ishst);
- __flush_tlb_range_op(vaale1is, start, pages, stride, 0,
- TLBI_TTL_UNKNOWN, false, lpa2_is_enabled());
+ __flush_s1_tlb_range_op(vaale1is, start, pages, stride, 0,
+ TLBI_TTL_UNKNOWN);
__tlbi_sync_s1ish();
isb();
}
@@ -589,7 +639,10 @@ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr)
static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
struct mm_struct *mm, unsigned long start, unsigned long end)
{
- __flush_tlb_range_nosync(mm, start, end, PAGE_SIZE, true, 3);
+ struct vm_area_struct vma = { .vm_mm = mm, .vm_flags = 0 };
+
+ __flush_tlb_range(&vma, start, end, PAGE_SIZE, 3,
+ TLBF_NOWALKCACHE | TLBF_NOSYNC);
}
static inline bool __pte_flags_need_flush(ptdesc_t oldval, ptdesc_t newval)
@@ -618,6 +671,8 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
}
#define huge_pmd_needs_flush huge_pmd_needs_flush
+#undef __tlbi_user
+#undef __TLBI_VADDR
#endif
#endif
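
A minimal sketch, assuming the tlbf_t flags and the reworked __flush_tlb_range() prototype introduced above; the example_* helpers are hypothetical and only illustrate which flag combinations the new interface is meant to express. The leaf-only, local-CPU variant mirrors the later hunks in arch/arm64/mm/fault.c and contpte.c, and the deferred-sync variant mirrors arch_tlbbatch_add_pending().

/*
 * Illustrative sketch only, not part of the patch.
 */

/* Default behaviour: broadcast, invalidate the walk cache too, then DSB. */
static inline void example_flush_vma_range(struct vm_area_struct *vma,
					   unsigned long start,
					   unsigned long end)
{
	__flush_tlb_range(vma, start, end, PAGE_SIZE, TLBI_TTL_UNKNOWN,
			  TLBF_NONE);
}

/*
 * Leaf-only, local-CPU invalidation of one page, as used for dirty-bit
 * updates in the fault.c and contpte.c hunks later in this series.
 */
static inline void example_flush_local_leaf(struct vm_area_struct *vma,
					    unsigned long addr)
{
	__flush_tlb_range(vma, addr, addr + PAGE_SIZE, PAGE_SIZE, 3,
			  TLBF_NOWALKCACHE | TLBF_NOBROADCAST);
}

/*
 * Leaf-only flush with the trailing DSB deferred to the caller, as in
 * arch_tlbbatch_add_pending().
 */
static inline void example_flush_nosync(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	__flush_tlb_range(vma, start, end, PAGE_SIZE, 3,
			  TLBF_NOWALKCACHE | TLBF_NOSYNC);
}

Note that TLBF_NOBROADCAST on its own hits the BUG() in __do_flush_tlb_range(), so a local flush always implies TLBF_NOWALKCACHE with this interface.
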
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 9810106a3f66..86dfc356ee6e 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -62,7 +62,7 @@ static inline void __uaccess_ttbr0_disable(void)
local_irq_save(flags);
ttbr = read_sysreg(ttbr1_el1);
- ttbr &= ~TTBR_ASID_MASK;
+ ttbr &= ~TTBRx_EL1_ASID_MASK;
/* reserved_pg_dir placed before swapper_pg_dir */
write_sysreg(ttbr - RESERVED_SWAPPER_OFFSET, ttbr0_el1);
/* Set reserved ASID */
@@ -85,8 +85,8 @@ static inline void __uaccess_ttbr0_enable(void)
/* Restore active ASID */
ttbr1 = read_sysreg(ttbr1_el1);
- ttbr1 &= ~TTBR_ASID_MASK; /* safety measure */
- ttbr1 |= ttbr0 & TTBR_ASID_MASK;
+ ttbr1 &= ~TTBRx_EL1_ASID_MASK; /* safety measure */
+ ttbr1 |= ttbr0 & TTBRx_EL1_ASID_MASK;
write_sysreg(ttbr1, ttbr1_el1);
/* Restore user page table */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 76f32e424065..15979f366519 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -67,6 +67,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o
obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o
obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o
+obj-$(CONFIG_ARM64_MPAM) += mpam.o
obj-$(CONFIG_ARM64_MTE) += mte.o
obj-y += vdso-wrap.o
obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c
index e737c6295ec7..b7a1f8b788bb 100644
--- a/arch/arm64/kernel/armv8_deprecated.c
+++ b/arch/arm64/kernel/armv8_deprecated.c
@@ -610,6 +610,20 @@ static int __init armv8_deprecated_init(void)
}
#endif
+
+#ifdef CONFIG_SWP_EMULATION
+ /*
+ * The purpose of supporting LSUI is to eliminate PAN toggling. CPUs
+ * that support LSUI are unlikely to support a 32-bit runtime. Rather
+ * than emulating the SWP instruction using LSUI instructions, simply
+ * disable SWP emulation.
+ */
+ if (cpus_have_final_cap(ARM64_HAS_LSUI)) {
+ insn_swp.status = INSN_UNAVAILABLE;
+ pr_info("swp/swpb instruction emulation is not supported on this system\n");
+ }
+#endif
+
for (int i = 0; i < ARRAY_SIZE(insn_emulations); i++) {
struct insn_emulation *ie = insn_emulations[i];
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index cc9783b26443..faffedca5baf 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -87,6 +87,7 @@
#include <asm/kvm_host.h>
#include <asm/mmu.h>
#include <asm/mmu_context.h>
+#include <asm/mpam.h>
#include <asm/mte.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
@@ -282,6 +283,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar2[] = {
static const struct arm64_ftr_bits ftr_id_aa64isar3[] = {
ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FPRCVT_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_LSUI_SHIFT, 4, ID_AA64ISAR3_EL1_LSUI_NI),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_LSFE_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FAMINMAX_SHIFT, 4, 0),
ARM64_FTR_END,
@@ -2484,13 +2486,19 @@ test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope)
static void
cpu_enable_mpam(const struct arm64_cpu_capabilities *entry)
{
- /*
- * Access by the kernel (at EL1) should use the reserved PARTID
- * which is configured unrestricted. This avoids priority-inversion
- * where latency sensitive tasks have to wait for a task that has
- * been throttled to release the lock.
- */
- write_sysreg_s(0, SYS_MPAM1_EL1);
+ int cpu = smp_processor_id();
+ u64 regval = 0;
+
+ if (IS_ENABLED(CONFIG_ARM64_MPAM) && static_branch_likely(&mpam_enabled))
+ regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu));
+
+ write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1);
+ if (cpus_have_cap(ARM64_SME))
+ write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D), SYS_MPAMSM_EL1);
+ isb();
+
+ /* Synchronising the EL0 write is left until the ERET to EL0 */
+ write_sysreg_s(regval, SYS_MPAM0_EL1);
}
static bool
@@ -3161,6 +3169,15 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.cpu_enable = cpu_enable_ls64_v,
ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, LS64, LS64_V)
},
+#ifdef CONFIG_ARM64_LSUI
+ {
+ .desc = "Unprivileged Load Store Instructions (LSUI)",
+ .capability = ARM64_HAS_LSUI,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .matches = has_cpuid_feature,
+ ARM64_CPUID_FIELDS(ID_AA64ISAR3_EL1, LSUI, IMP)
+ },
+#endif
{},
};
diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c
index 3625797e9ee8..f42ce7b5c67f 100644
--- a/arch/arm64/kernel/entry-common.c
+++ b/arch/arm64/kernel/entry-common.c
@@ -35,11 +35,11 @@
* Before this function is called it is not safe to call regular kernel code,
* instrumentable code, or any code which may trigger an exception.
*/
-static noinstr irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs)
+static noinstr irqentry_state_t arm64_enter_from_kernel_mode(struct pt_regs *regs)
{
irqentry_state_t state;
- state = irqentry_enter(regs);
+ state = irqentry_enter_from_kernel_mode(regs);
mte_check_tfsr_entry();
mte_disable_tco_entry(current);
@@ -51,11 +51,14 @@ static noinstr irqentry_state_t enter_from_kernel_mode(struct pt_regs *regs)
* After this function returns it is not safe to call regular kernel code,
* instrumentable code, or any code which may trigger an exception.
*/
-static void noinstr exit_to_kernel_mode(struct pt_regs *regs,
- irqentry_state_t state)
+static void noinstr arm64_exit_to_kernel_mode(struct pt_regs *regs,
+ irqentry_state_t state)
{
+ local_irq_disable();
+ irqentry_exit_to_kernel_mode_preempt(regs, state);
+ local_daif_mask();
mte_check_tfsr_exit();
- irqentry_exit(regs, state);
+ irqentry_exit_to_kernel_mode_after_preempt(regs, state);
}
/*
@@ -298,11 +301,10 @@ static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr)
unsigned long far = read_sysreg(far_el1);
irqentry_state_t state;
- state = enter_from_kernel_mode(regs);
+ state = arm64_enter_from_kernel_mode(regs);
local_daif_inherit(regs);
do_mem_abort(far, esr, regs);
- local_daif_mask();
- exit_to_kernel_mode(regs, state);
+ arm64_exit_to_kernel_mode(regs, state);
}
static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr)
@@ -310,55 +312,50 @@ static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr)
unsigned long far = read_sysreg(far_el1);
irqentry_state_t state;
- state = enter_from_kernel_mode(regs);
+ state = arm64_enter_from_kernel_mode(regs);
local_daif_inherit(regs);
do_sp_pc_abort(far, esr, regs);
- local_daif_mask();
- exit_to_kernel_mode(regs, state);
+ arm64_exit_to_kernel_mode(regs, state);
}
static void noinstr el1_undef(struct pt_regs *regs, unsigned long esr)
{
irqentry_state_t state;
- state = enter_from_kernel_mode(regs);
+ state = arm64_enter_from_kernel_mode(regs);
local_daif_inherit(regs);
do_el1_undef(regs, esr);
- local_daif_mask();
- exit_to_kernel_mode(regs, state);
+ arm64_exit_to_kernel_mode(regs, state);
}
static void noinstr el1_bti(struct pt_regs *regs, unsigned long esr)
{
irqentry_state_t state;
- state = enter_from_kernel_mode(regs);
+ state = arm64_enter_from_kernel_mode(regs);
local_daif_inherit(regs);
do_el1_bti(regs, esr);
- local_daif_mask();
- exit_to_kernel_mode(regs, state);
+ arm64_exit_to_kernel_mode(regs, state);
}
static void noinstr el1_gcs(struct pt_regs *regs, unsigned long esr)
{
irqentry_state_t state;
- state = enter_from_kernel_mode(regs);
+ state = arm64_enter_from_kernel_mode(regs);
local_daif_inherit(regs);
do_el1_gcs(regs, esr);
- local_daif_mask();
- exit_to_kernel_mode(regs, state);
+ arm64_exit_to_kernel_mode(regs, state);
}
static void noinstr el1_mops(struct pt_regs *regs, unsigned long esr)
{
irqentry_state_t state;
- state = enter_from_kernel_mode(regs);
+ state = arm64_enter_from_kernel_mode(regs);
local_daif_inherit(regs);
do_el1_mops(regs, esr);
- local_daif_mask();
- exit_to_kernel_mode(regs, state);
+ arm64_exit_to_kernel_mode(regs, state);
}
static void noinstr el1_breakpt(struct pt_regs *regs, unsigned long esr)
@@ -420,11 +417,10 @@ static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr)
{
irqentry_state_t state;
- state = enter_from_kernel_mode(regs);
+ state = arm64_enter_from_kernel_mode(regs);
local_daif_inherit(regs);
do_el1_fpac(regs, esr);
- local_daif_mask();
- exit_to_kernel_mode(regs, state);
+ arm64_exit_to_kernel_mode(regs, state);
}
asmlinkage void noinstr el1h_64_sync_handler(struct pt_regs *regs)
@@ -491,13 +487,13 @@ static __always_inline void __el1_irq(struct pt_regs *regs,
{
irqentry_state_t state;
- state = enter_from_kernel_mode(regs);
+ state = arm64_enter_from_kernel_mode(regs);
irq_enter_rcu();
do_interrupt_handler(regs, handler);
irq_exit_rcu();
- exit_to_kernel_mode(regs, state);
+ arm64_exit_to_kernel_mode(regs, state);
}
static void noinstr el1_interrupt(struct pt_regs *regs,
void (*handler)(struct pt_regs *))
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index f8018b5c1f9a..e0db14e9c843 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -273,7 +273,7 @@ alternative_if ARM64_HAS_ADDRESS_AUTH
alternative_else_nop_endif
1:
- scs_load_current
+ scs_load_current_base
.else
add x21, sp, #PT_REGS_SIZE
get_current_task tsk
@@ -378,8 +378,6 @@ alternative_if ARM64_WORKAROUND_845719
alternative_else_nop_endif
#endif
3:
- scs_save tsk
-
/* Ignore asynchronous tag check faults in the uaccess routines */
ldr x0, [tsk, THREAD_SCTLR_USER]
clear_mte_async_tcf x0
@@ -473,7 +471,7 @@ alternative_else_nop_endif
*/
SYM_CODE_START_LOCAL(__swpan_entry_el1)
mrs x21, ttbr0_el1
- tst x21, #TTBR_ASID_MASK // Check for the reserved ASID
+ tst x21, #TTBRx_EL1_ASID_MASK // Check for the reserved ASID
orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR
b.eq 1f // TTBR0 access already disabled
and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index 239c16e3d02f..c5693a32e49b 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -129,9 +129,6 @@ int machine_kexec_post_load(struct kimage *kimage)
}
/* Create a copy of the linear map */
- trans_pgd = kexec_page_alloc(kimage);
- if (!trans_pgd)
- return -ENOMEM;
rc = trans_pgd_create_copy(&info, &trans_pgd, PAGE_OFFSET, PAGE_END);
if (rc)
return rc;
diff --git a/arch/arm64/kernel/mpam.c b/arch/arm64/kernel/mpam.c
new file mode 100644
index 000000000000..3a490de4fa12
--- /dev/null
+++ b/arch/arm64/kernel/mpam.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2025 Arm Ltd. */
+
+#include <asm/mpam.h>
+
+#include <linux/arm_mpam.h>
+#include <linux/cpu_pm.h>
+#include <linux/jump_label.h>
+#include <linux/percpu.h>
+
+DEFINE_STATIC_KEY_FALSE(mpam_enabled);
+DEFINE_PER_CPU(u64, arm64_mpam_default);
+DEFINE_PER_CPU(u64, arm64_mpam_current);
+
+u64 arm64_mpam_global_default;
+
+static int mpam_pm_notifier(struct notifier_block *self,
+ unsigned long cmd, void *v)
+{
+ u64 regval;
+ int cpu = smp_processor_id();
+
+ switch (cmd) {
+ case CPU_PM_EXIT:
+ /*
+ * Don't use mpam_thread_switch() as the system register
+ * value has changed under our feet.
+ */
+ regval = READ_ONCE(per_cpu(arm64_mpam_current, cpu));
+ write_sysreg_s(regval | MPAM1_EL1_MPAMEN, SYS_MPAM1_EL1);
+ if (system_supports_sme()) {
+ write_sysreg_s(regval & (MPAMSM_EL1_PARTID_D | MPAMSM_EL1_PMG_D),
+ SYS_MPAMSM_EL1);
+ }
+ isb();
+
+ write_sysreg_s(regval, SYS_MPAM0_EL1);
+
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static struct notifier_block mpam_pm_nb = {
+ .notifier_call = mpam_pm_notifier,
+};
+
+static int __init arm64_mpam_register_cpus(void)
+{
+ u64 mpamidr = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1);
+ u16 partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, mpamidr);
+ u8 pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, mpamidr);
+
+ if (!system_supports_mpam())
+ return 0;
+
+ cpu_pm_register_notifier(&mpam_pm_nb);
+ return mpam_register_requestor(partid_max, pmg_max);
+}
+/* Must occur before mpam_msc_driver_init() from subsys_initcall() */
+arch_initcall(arm64_mpam_register_cpus)
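
A minimal sketch of how the per-CPU value defined here might be composed and updated; the example_* helpers, the use of FIELD_PREP() from <linux/bitfield.h>, and the idea that the resctrl glue would call something of this shape are assumptions for illustration only. The MPAM0_EL1_* field masks are the ones used later in the KVM sysreg-sr hunk.

/*
 * Illustrative sketch only, not part of the patch: build the value that
 * cpu_enable_mpam() and the CPU_PM_EXIT path write to MPAM0_EL1/MPAM1_EL1.
 */
static u64 example_mpam_regval(u16 partid, u8 pmg)
{
	return FIELD_PREP(MPAM0_EL1_PARTID_D, partid) |
	       FIELD_PREP(MPAM0_EL1_PARTID_I, partid) |
	       FIELD_PREP(MPAM0_EL1_PMG_D, pmg) |
	       FIELD_PREP(MPAM0_EL1_PMG_I, pmg);
}

/*
 * Hypothetical caller: record this CPU's current value so the next context
 * switch / PM exit picks it up. Assumes preemption is disabled.
 */
static void example_set_current(u16 partid, u8 pmg)
{
	WRITE_ONCE(*this_cpu_ptr(&arm64_mpam_current),
		   example_mpam_regval(partid, pmg));
}
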
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 32148bf09c1d..6874b16d0657 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -291,6 +291,9 @@ void mte_thread_switch(struct task_struct *next)
/* TCO may not have been disabled on exception entry for the current task. */
mte_disable_tco_entry(next);
+ if (!system_uses_mte_async_or_asymm_mode())
+ return;
+
/*
* Check if an async tag exception occurred at EL1.
*
@@ -315,8 +318,8 @@ void mte_cpu_setup(void)
* CnP is not a boot feature so MTE gets enabled before CnP, but let's
* make sure that is the case.
*/
- BUG_ON(read_sysreg(ttbr0_el1) & TTBR_CNP_BIT);
- BUG_ON(read_sysreg(ttbr1_el1) & TTBR_CNP_BIT);
+ BUG_ON(read_sysreg(ttbr0_el1) & TTBRx_EL1_CnP);
+ BUG_ON(read_sysreg(ttbr1_el1) & TTBRx_EL1_CnP);
/* Normal Tagged memory type at the corresponding MAIR index */
sysreg_clear_set(mair_el1,
@@ -350,6 +353,9 @@ void mte_suspend_enter(void)
if (!system_supports_mte())
return;
+ if (!system_uses_mte_async_or_asymm_mode())
+ return;
+
/*
* The barriers are required to guarantee that the indirect writes
* to TFSR_EL1 are synchronized before we report the state.
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 489554931231..c0bf1f46cdc6 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -51,6 +51,7 @@
#include <asm/fpsimd.h>
#include <asm/gcs.h>
#include <asm/mmu_context.h>
+#include <asm/mpam.h>
#include <asm/mte.h>
#include <asm/processor.h>
#include <asm/pointer_auth.h>
@@ -699,6 +700,29 @@ void update_sctlr_el1(u64 sctlr)
isb();
}
+static inline void debug_switch_state(void)
+{
+ if (system_uses_irq_prio_masking()) {
+ unsigned long daif_expected = 0;
+ unsigned long daif_actual = read_sysreg(daif);
+ unsigned long pmr_expected = GIC_PRIO_IRQOFF;
+ unsigned long pmr_actual = read_sysreg_s(SYS_ICC_PMR_EL1);
+
+ WARN_ONCE(daif_actual != daif_expected ||
+ pmr_actual != pmr_expected,
+ "Unexpected DAIF + PMR: 0x%lx + 0x%lx (expected 0x%lx + 0x%lx)\n",
+ daif_actual, pmr_actual,
+ daif_expected, pmr_expected);
+ } else {
+ unsigned long daif_expected = DAIF_PROCCTX_NOIRQ;
+ unsigned long daif_actual = read_sysreg(daif);
+
+ WARN_ONCE(daif_actual != daif_expected,
+ "Unexpected DAIF value: 0x%lx (expected 0x%lx)\n",
+ daif_actual, daif_expected);
+ }
+}
+
/*
* Thread switching.
*/
@@ -708,6 +732,8 @@ struct task_struct *__switch_to(struct task_struct *prev,
{
struct task_struct *last;
+ debug_switch_state();
+
fpsimd_thread_switch(next);
tls_thread_switch(next);
hw_breakpoint_thread_switch(next);
@@ -738,6 +764,12 @@ struct task_struct *__switch_to(struct task_struct *prev,
if (prev->thread.sctlr_user != next->thread.sctlr_user)
update_sctlr_el1(next->thread.sctlr_user);
+ /*
+ * MPAM thread switch happens after the DSB to ensure prev's accesses
+ * use prev's MPAM settings.
+ */
+ mpam_thread_switch(next);
+
/* the actual thread switch */
last = cpu_switch_to(prev, next);
diff --git a/arch/arm64/kernel/rsi.c b/arch/arm64/kernel/rsi.c
index c64a06f58c0b..6e883bc43186 100644
--- a/arch/arm64/kernel/rsi.c
+++ b/arch/arm64/kernel/rsi.c
@@ -144,7 +144,7 @@ void __init arm64_rsi_init(void)
return;
if (!rsi_version_matches())
return;
- if (WARN_ON(rsi_get_realm_config(&config)))
+ if (WARN_ON(rsi_get_realm_config(lm_alias(&config))))
return;
prot_ns_shared = BIT(config.ipa_bits - 1);
diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c
index b9d4998c97ef..7e9860143add 100644
--- a/arch/arm64/kernel/sys_compat.c
+++ b/arch/arm64/kernel/sys_compat.c
@@ -36,7 +36,7 @@ __do_compat_cache_op(unsigned long start, unsigned long end)
* The workaround requires an inner-shareable tlbi.
* We pick the reserved-ASID to minimise the impact.
*/
- __tlbi(aside1is, __TLBI_VADDR(0, 0));
+ __tlbi(aside1is, 0UL);
__tlbi_sync_s1ish();
}
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 6588ea251ed7..1adf88a57328 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -9,6 +9,7 @@
#include <asm/esr.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
+#include <asm/lsui.h>
static void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw)
{
@@ -1681,6 +1682,35 @@ int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level)
}
}
+static int __lsui_swap_desc(u64 __user *ptep, u64 old, u64 new)
+{
+ u64 tmp = old;
+ int ret = 0;
+
+ /*
+ * Wrap LSUI instructions with uaccess_ttbr0_enable()/disable(),
+ * as PAN toggling is not required.
+ */
+ uaccess_ttbr0_enable();
+
+ asm volatile(__LSUI_PREAMBLE
+ "1: cast %[old], %[new], %[addr]\n"
+ "2:\n"
+ _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret])
+ : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret)
+ : [new] "r" (new)
+ : "memory");
+
+ uaccess_ttbr0_disable();
+
+ if (ret)
+ return ret;
+ if (tmp != old)
+ return -EAGAIN;
+
+ return ret;
+}
+
static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new)
{
u64 tmp = old;
@@ -1756,7 +1786,9 @@ int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new)
return -EPERM;
ptep = (u64 __user *)hva + offset;
- if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS))
+ if (cpus_have_final_cap(ARM64_HAS_LSUI))
+ r = __lsui_swap_desc(ptep, old, new);
+ else if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS))
r = __lse_swap_desc(ptep, old, new);
else
r = __llsc_swap_desc(ptep, old, new);
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 2597e8bda867..0b50ddd530f3 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -267,7 +267,8 @@ static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu)
static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu)
{
- u64 r = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1;
+ u64 clr = MPAM2_EL2_EnMPAMSM;
+ u64 set = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1;
if (!system_supports_mpam())
return;
@@ -277,18 +278,21 @@ static inline void __activate_traps_mpam(struct kvm_vcpu *vcpu)
write_sysreg_s(MPAMHCR_EL2_TRAP_MPAMIDR_EL1, SYS_MPAMHCR_EL2);
} else {
/* From v1.1 TIDR can trap MPAMIDR, set it unconditionally */
- r |= MPAM2_EL2_TIDR;
+ set |= MPAM2_EL2_TIDR;
}
- write_sysreg_s(r, SYS_MPAM2_EL2);
+ sysreg_clear_set_s(SYS_MPAM2_EL2, clr, set);
}
static inline void __deactivate_traps_mpam(void)
{
+ u64 clr = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1 | MPAM2_EL2_TIDR;
+ u64 set = MPAM2_EL2_EnMPAMSM;
+
if (!system_supports_mpam())
return;
- write_sysreg_s(0, SYS_MPAM2_EL2);
+ sysreg_clear_set_s(SYS_MPAM2_EL2, clr, set);
if (system_supports_mpam_hcr())
write_sysreg_s(MPAMHCR_HOST_FLAGS, SYS_MPAMHCR_EL2);
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index 0d42eedc7167..445eb0743af2 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -130,7 +130,7 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init)
ldr x1, [x0, #NVHE_INIT_PGD_PA]
phys_to_ttbr x2, x1
alternative_if ARM64_HAS_CNP
- orr x2, x2, #TTBR_CNP_BIT
+ orr x2, x2, #TTBRx_EL1_CnP
alternative_else_nop_endif
msr ttbr0_el2, x2
@@ -291,7 +291,7 @@ SYM_TYPED_FUNC_START(__pkvm_init_switch_pgd)
/* Install the new pgtables */
phys_to_ttbr x5, x0
alternative_if ARM64_HAS_CNP
- orr x5, x5, #TTBR_CNP_BIT
+ orr x5, x5, #TTBRx_EL1_CnP
alternative_else_nop_endif
msr ttbr0_el2, x5
diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c
index 218976287d3f..4d8fcc7a3a41 100644
--- a/arch/arm64/kvm/hyp/nvhe/mm.c
+++ b/arch/arm64/kvm/hyp/nvhe/mm.c
@@ -270,7 +270,7 @@ static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
* https://lore.kernel.org/kvm/20221017115209.2099-1-will@kernel.org/T/#mf10dfbaf1eaef9274c581b81c53758918c1d0f03
*/
dsb(ishst);
- __tlbi_level(vale2is, __TLBI_VADDR(addr, 0), level);
+ __tlbi_level(vale2is, addr, level);
__tlbi_sync_s1ish_hyp();
isb();
}
diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c
index 3dc1ce0d27fe..b29140995d48 100644
--- a/arch/arm64/kvm/hyp/nvhe/tlb.c
+++ b/arch/arm64/kvm/hyp/nvhe/tlb.c
@@ -158,7 +158,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
* Instead, we invalidate Stage-2 for this IPA, and the
* whole of Stage-1. Weep...
*/
- ipa >>= 12;
__tlbi_level(ipas2e1is, ipa, level);
/*
@@ -188,7 +187,6 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
* Instead, we invalidate Stage-2 for this IPA, and the
* whole of Stage-1. Weep...
*/
- ipa >>= 12;
__tlbi_level(ipas2e1, ipa, level);
/*
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 9b480f947da2..30226f2d5564 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -490,14 +490,14 @@ static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
kvm_clear_pte(ctx->ptep);
dsb(ishst);
- __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN);
+ __tlbi_level(vae2is, ctx->addr, TLBI_TTL_UNKNOWN);
} else {
if (ctx->end - ctx->addr < granule)
return -EINVAL;
kvm_clear_pte(ctx->ptep);
dsb(ishst);
- __tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
+ __tlbi_level(vale2is, ctx->addr, ctx->level);
*unmapped += granule;
}
diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
index b254d442e54e..be685b63e8cf 100644
--- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
@@ -183,6 +183,21 @@ void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt)
}
NOKPROBE_SYMBOL(sysreg_restore_guest_state_vhe);
+/*
+ * The _EL0 value was written by the host's context switch and belongs to the
+ * VMM. Copy this into the guest's _EL1 register.
+ */
+static inline void __mpam_guest_load(void)
+{
+ u64 mask = MPAM0_EL1_PARTID_D | MPAM0_EL1_PARTID_I | MPAM0_EL1_PMG_D | MPAM0_EL1_PMG_I;
+
+ if (system_supports_mpam()) {
+ u64 val = (read_sysreg_s(SYS_MPAM0_EL1) & mask) | MPAM1_EL1_MPAMEN;
+
+ write_sysreg_el1(val, SYS_MPAM1);
+ }
+}
+
/**
* __vcpu_load_switch_sysregs - Load guest system registers to the physical CPU
*
@@ -222,6 +237,7 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu)
*/
__sysreg32_restore_state(vcpu);
__sysreg_restore_user_state(guest_ctxt);
+ __mpam_guest_load();
if (unlikely(is_hyp_ctxt(vcpu))) {
__sysreg_restore_vel2_state(vcpu);
diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c
index 35855dadfb1b..f7b9dfe3f3a5 100644
--- a/arch/arm64/kvm/hyp/vhe/tlb.c
+++ b/arch/arm64/kvm/hyp/vhe/tlb.c
@@ -104,7 +104,6 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
* Instead, we invalidate Stage-2 for this IPA, and the
* whole of Stage-1. Weep...
*/
- ipa >>= 12;
__tlbi_level(ipas2e1is, ipa, level);
/*
@@ -136,7 +135,6 @@ void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
* Instead, we invalidate Stage-2 for this IPA, and the
* whole of Stage-1. Weep...
*/
- ipa >>= 12;
__tlbi_level(ipas2e1, ipa, level);
/*
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 1b4cacb6e918..c1e0dea903a1 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1805,7 +1805,7 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
break;
case SYS_ID_AA64ISAR3_EL1:
val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_LSFE |
- ID_AA64ISAR3_EL1_FAMINMAX;
+ ID_AA64ISAR3_EL1_FAMINMAX | ID_AA64ISAR3_EL1_LSUI;
break;
case SYS_ID_AA64MMFR2_EL1:
val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK;
@@ -3252,6 +3252,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
ID_AA64ISAR2_EL1_GPA3)),
ID_WRITABLE(ID_AA64ISAR3_EL1, (ID_AA64ISAR3_EL1_FPRCVT |
ID_AA64ISAR3_EL1_LSFE |
+ ID_AA64ISAR3_EL1_LSUI |
ID_AA64ISAR3_EL1_FAMINMAX)),
ID_UNALLOCATED(6,4),
ID_UNALLOCATED(6,5),
@@ -3376,6 +3377,8 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_MPAM1_EL1), undef_access },
{ SYS_DESC(SYS_MPAM0_EL1), undef_access },
+ { SYS_DESC(SYS_MPAMSM_EL1), undef_access },
+
{ SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 },
{ SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 },
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index b2ac06246327..0f4a28b87469 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -354,15 +354,15 @@ void cpu_do_switch_mm(phys_addr_t pgd_phys, struct mm_struct *mm)
/* Skip CNP for the reserved ASID */
if (system_supports_cnp() && asid)
- ttbr0 |= TTBR_CNP_BIT;
+ ttbr0 |= TTBRx_EL1_CnP;
/* SW PAN needs a copy of the ASID in TTBR0 for entry */
if (IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN))
- ttbr0 |= FIELD_PREP(TTBR_ASID_MASK, asid);
+ ttbr0 |= FIELD_PREP(TTBRx_EL1_ASID_MASK, asid);
/* Set ASID in TTBR1 since TCR.A1 is set */
- ttbr1 &= ~TTBR_ASID_MASK;
- ttbr1 |= FIELD_PREP(TTBR_ASID_MASK, asid);
+ ttbr1 &= ~TTBRx_EL1_ASID_MASK;
+ ttbr1 |= FIELD_PREP(TTBRx_EL1_ASID_MASK, asid);
cpu_set_reserved_ttbr0_nosync();
write_sysreg(ttbr1, ttbr1_el1);
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index 1519d090d5ea..3970392c4326 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -225,7 +225,8 @@ static void contpte_convert(struct mm_struct *mm, unsigned long addr,
*/
if (!system_supports_bbml2_noabort())
- __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3);
+ __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, 3,
+ TLBF_NOWALKCACHE);
__set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES);
}
@@ -551,8 +552,8 @@ int contpte_clear_flush_young_ptes(struct vm_area_struct *vma,
* See comment in __ptep_clear_flush_young(); same rationale for
* eliding the trailing DSB applies here.
*/
- __flush_tlb_range_nosync(vma->vm_mm, addr, end,
- PAGE_SIZE, true, 3);
+ __flush_tlb_range(vma, addr, end, PAGE_SIZE, 3,
+ TLBF_NOWALKCACHE | TLBF_NOSYNC);
}
return young;
@@ -685,7 +686,10 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
__ptep_set_access_flags(vma, addr, ptep, entry, 0);
if (dirty)
- local_flush_tlb_contpte(vma, start_addr);
+ __flush_tlb_range(vma, start_addr,
+ start_addr + CONT_PTE_SIZE,
+ PAGE_SIZE, 3,
+ TLBF_NOWALKCACHE | TLBF_NOBROADCAST);
} else {
__contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte);
__ptep_set_access_flags(vma, addr, ptep, entry, dirty);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index be9dab2c7d6a..920a8b244d59 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -204,12 +204,13 @@ static void show_pte(unsigned long addr)
*
* Returns whether or not the PTE actually changed.
*/
-int __ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep,
- pte_t entry, int dirty)
+int __ptep_set_access_flags_anysz(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty, unsigned long pgsize)
{
pteval_t old_pteval, pteval;
pte_t pte = __ptep_get(ptep);
+ int level;
if (pte_same(pte, entry))
return 0;
@@ -238,8 +239,27 @@ int __ptep_set_access_flags(struct vm_area_struct *vma,
* may still cause page faults and be invalidated via
* flush_tlb_fix_spurious_fault().
*/
- if (dirty)
- local_flush_tlb_page(vma, address);
+ if (dirty) {
+ switch (pgsize) {
+ case PAGE_SIZE:
+ level = 3;
+ break;
+ case PMD_SIZE:
+ level = 2;
+ break;
+#ifndef __PAGETABLE_PMD_FOLDED
+ case PUD_SIZE:
+ level = 1;
+ break;
+#endif
+ default:
+ level = TLBI_TTL_UNKNOWN;
+ WARN_ON(1);
+ }
+
+ __flush_tlb_range(vma, address, address + pgsize, pgsize, level,
+ TLBF_NOWALKCACHE | TLBF_NOBROADCAST);
+ }
return 1;
}
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index a42c05cf5640..30772a909aea 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -181,7 +181,7 @@ static pte_t get_clear_contig_flush(struct mm_struct *mm,
struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
unsigned long end = addr + (pgsize * ncontig);
- __flush_hugetlb_tlb_range(&vma, addr, end, pgsize, true);
+ __flush_hugetlb_tlb_range(&vma, addr, end, pgsize, TLBF_NOWALKCACHE);
return orig_pte;
}
@@ -209,7 +209,7 @@ static void clear_flush(struct mm_struct *mm,
if (mm == &init_mm)
flush_tlb_kernel_range(saddr, addr);
else
- __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, true);
+ __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, TLBF_NOWALKCACHE);
}
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -427,11 +427,11 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
pte_t orig_pte;
VM_WARN_ON(!pte_present(pte));
+ ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize);
if (!pte_cont(pte))
- return __ptep_set_access_flags(vma, addr, ptep, pte, dirty);
-
- ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize);
+ return __ptep_set_access_flags_anysz(vma, addr, ptep, pte,
+ dirty, pgsize);
if (!__cont_access_flags_changed(ptep, pte, ncontig))
return 0;
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 96711b8578fd..b9b248d24fd1 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -350,7 +350,6 @@ void __init arch_mm_preinit(void)
}
swiotlb_init(swiotlb, flags);
- swiotlb_update_mem_attributes();
/*
* Check boundaries twice: Some fundamental inconsistencies can be
@@ -377,6 +376,14 @@ void __init arch_mm_preinit(void)
}
}
+bool page_alloc_available __ro_after_init;
+
+void __init mem_init(void)
+{
+ page_alloc_available = true;
+ swiotlb_update_mem_attributes();
+}
+
void free_initmem(void)
{
void *lm_init_begin = lm_alias(__init_begin);
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index a6a00accf4f9..7ea743996a61 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -112,7 +112,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
}
EXPORT_SYMBOL(phys_mem_access_prot);
-static phys_addr_t __init early_pgtable_alloc(enum pgtable_type pgtable_type)
+static phys_addr_t __init early_pgtable_alloc(enum pgtable_level pgtable_level)
{
phys_addr_t phys;
@@ -197,14 +197,14 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
unsigned long end, phys_addr_t phys,
pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+ phys_addr_t (*pgtable_alloc)(enum pgtable_level),
int flags)
{
unsigned long next;
pmd_t pmd = READ_ONCE(*pmdp);
pte_t *ptep;
- BUG_ON(pmd_sect(pmd));
+ BUG_ON(pmd_leaf(pmd));
if (pmd_none(pmd)) {
pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
phys_addr_t pte_phys;
@@ -212,7 +212,7 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
if (flags & NO_EXEC_MAPPINGS)
pmdval |= PMD_TABLE_PXN;
BUG_ON(!pgtable_alloc);
- pte_phys = pgtable_alloc(TABLE_PTE);
+ pte_phys = pgtable_alloc(PGTABLE_LEVEL_PTE);
if (pte_phys == INVALID_PHYS_ADDR)
return -ENOMEM;
ptep = pte_set_fixmap(pte_phys);
@@ -252,7 +252,7 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags)
+ phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags)
{
unsigned long next;
@@ -292,7 +292,7 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
unsigned long end, phys_addr_t phys,
pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+ phys_addr_t (*pgtable_alloc)(enum pgtable_level),
int flags)
{
int ret;
@@ -303,7 +303,7 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
/*
* Check for initial section mappings in the pgd/pud.
*/
- BUG_ON(pud_sect(pud));
+ BUG_ON(pud_leaf(pud));
if (pud_none(pud)) {
pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
phys_addr_t pmd_phys;
@@ -311,7 +311,7 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
if (flags & NO_EXEC_MAPPINGS)
pudval |= PUD_TABLE_PXN;
BUG_ON(!pgtable_alloc);
- pmd_phys = pgtable_alloc(TABLE_PMD);
+ pmd_phys = pgtable_alloc(PGTABLE_LEVEL_PMD);
if (pmd_phys == INVALID_PHYS_ADDR)
return -ENOMEM;
pmdp = pmd_set_fixmap(pmd_phys);
@@ -349,7 +349,7 @@ out:
static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+ phys_addr_t (*pgtable_alloc)(enum pgtable_level),
int flags)
{
int ret = 0;
@@ -364,7 +364,7 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
if (flags & NO_EXEC_MAPPINGS)
p4dval |= P4D_TABLE_PXN;
BUG_ON(!pgtable_alloc);
- pud_phys = pgtable_alloc(TABLE_PUD);
+ pud_phys = pgtable_alloc(PGTABLE_LEVEL_PUD);
if (pud_phys == INVALID_PHYS_ADDR)
return -ENOMEM;
pudp = pud_set_fixmap(pud_phys);
@@ -415,7 +415,7 @@ out:
static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+ phys_addr_t (*pgtable_alloc)(enum pgtable_level),
int flags)
{
int ret;
@@ -430,7 +430,7 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
if (flags & NO_EXEC_MAPPINGS)
pgdval |= PGD_TABLE_PXN;
BUG_ON(!pgtable_alloc);
- p4d_phys = pgtable_alloc(TABLE_P4D);
+ p4d_phys = pgtable_alloc(PGTABLE_LEVEL_P4D);
if (p4d_phys == INVALID_PHYS_ADDR)
return -ENOMEM;
p4dp = p4d_set_fixmap(p4d_phys);
@@ -467,7 +467,7 @@ out:
static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
unsigned long virt, phys_addr_t size,
pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+ phys_addr_t (*pgtable_alloc)(enum pgtable_level),
int flags)
{
int ret;
@@ -500,7 +500,7 @@ static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
unsigned long virt, phys_addr_t size,
pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+ phys_addr_t (*pgtable_alloc)(enum pgtable_level),
int flags)
{
int ret;
@@ -516,7 +516,7 @@ static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
static void early_create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
unsigned long virt, phys_addr_t size,
pgprot_t prot,
- phys_addr_t (*pgtable_alloc)(enum pgtable_type),
+ phys_addr_t (*pgtable_alloc)(enum pgtable_level),
int flags)
{
int ret;
@@ -528,7 +528,7 @@ static void early_create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
}
static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp,
- enum pgtable_type pgtable_type)
+ enum pgtable_level pgtable_level)
{
/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
struct ptdesc *ptdesc = pagetable_alloc(gfp & ~__GFP_ZERO, 0);
@@ -539,40 +539,43 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp,
pa = page_to_phys(ptdesc_page(ptdesc));
- switch (pgtable_type) {
- case TABLE_PTE:
+ switch (pgtable_level) {
+ case PGTABLE_LEVEL_PTE:
BUG_ON(!pagetable_pte_ctor(mm, ptdesc));
break;
- case TABLE_PMD:
+ case PGTABLE_LEVEL_PMD:
BUG_ON(!pagetable_pmd_ctor(mm, ptdesc));
break;
- case TABLE_PUD:
+ case PGTABLE_LEVEL_PUD:
pagetable_pud_ctor(ptdesc);
break;
- case TABLE_P4D:
+ case PGTABLE_LEVEL_P4D:
pagetable_p4d_ctor(ptdesc);
break;
+ case PGTABLE_LEVEL_PGD:
+ VM_WARN_ON(1);
+ break;
}
return pa;
}
static phys_addr_t
-pgd_pgtable_alloc_init_mm_gfp(enum pgtable_type pgtable_type, gfp_t gfp)
+pgd_pgtable_alloc_init_mm_gfp(enum pgtable_level pgtable_level, gfp_t gfp)
{
- return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_type);
+ return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_level);
}
static phys_addr_t __maybe_unused
-pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
+pgd_pgtable_alloc_init_mm(enum pgtable_level pgtable_level)
{
- return pgd_pgtable_alloc_init_mm_gfp(pgtable_type, GFP_PGTABLE_KERNEL);
+ return pgd_pgtable_alloc_init_mm_gfp(pgtable_level, GFP_PGTABLE_KERNEL);
}
static phys_addr_t
-pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
+pgd_pgtable_alloc_special_mm(enum pgtable_level pgtable_level)
{
- return __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type);
+ return __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_level);
}
static void split_contpte(pte_t *ptep)
@@ -593,7 +596,7 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
pte_t *ptep;
int i;
- pte_phys = pgd_pgtable_alloc_init_mm_gfp(TABLE_PTE, gfp);
+ pte_phys = pgd_pgtable_alloc_init_mm_gfp(PGTABLE_LEVEL_PTE, gfp);
if (pte_phys == INVALID_PHYS_ADDR)
return -ENOMEM;
ptep = (pte_t *)phys_to_virt(pte_phys);
@@ -602,6 +605,8 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
tableprot |= PMD_TABLE_PXN;
prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) | PTE_TYPE_PAGE);
+ if (!pmd_valid(pmd))
+ prot = pte_pgprot(pte_mkinvalid(pfn_pte(0, prot)));
prot = __pgprot(pgprot_val(prot) & ~PTE_CONT);
if (to_cont)
prot = __pgprot(pgprot_val(prot) | PTE_CONT);
@@ -638,7 +643,7 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
pmd_t *pmdp;
int i;
- pmd_phys = pgd_pgtable_alloc_init_mm_gfp(TABLE_PMD, gfp);
+ pmd_phys = pgd_pgtable_alloc_init_mm_gfp(PGTABLE_LEVEL_PMD, gfp);
if (pmd_phys == INVALID_PHYS_ADDR)
return -ENOMEM;
pmdp = (pmd_t *)phys_to_virt(pmd_phys);
@@ -647,6 +652,8 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
tableprot |= PUD_TABLE_PXN;
prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT);
+ if (!pud_valid(pud))
+ prot = pmd_pgprot(pmd_mkinvalid(pfn_pmd(0, prot)));
prot = __pgprot(pgprot_val(prot) & ~PTE_CONT);
if (to_cont)
prot = __pgprot(pgprot_val(prot) | PTE_CONT);
@@ -768,30 +775,51 @@ static inline bool force_pte_mapping(void)
}
static DEFINE_MUTEX(pgtable_split_lock);
+static bool linear_map_requires_bbml2;
int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
{
int ret;
/*
- * !BBML2_NOABORT systems should not be trying to change permissions on
- * anything that is not pte-mapped in the first place. Just return early
- * and let the permission change code raise a warning if not already
- * pte-mapped.
- */
- if (!system_supports_bbml2_noabort())
- return 0;
-
- /*
* If the region is within a pte-mapped area, there is no need to try to
* split. Additionally, CONFIG_DEBUG_PAGEALLOC and CONFIG_KFENCE may
* change permissions from atomic context so for those cases (which are
* always pte-mapped), we must not go any further because taking the
- * mutex below may sleep.
+ * mutex below may sleep. Do not call force_pte_mapping() here because
+ * it could return a confusing result if called from a secondary cpu
+ * prior to finalizing caps. Instead, linear_map_requires_bbml2 gives us
+ * what we need.
*/
- if (force_pte_mapping() || is_kfence_address((void *)start))
+ if (!linear_map_requires_bbml2 || is_kfence_address((void *)start))
return 0;
+ if (!system_supports_bbml2_noabort()) {
+ /*
+ * !BBML2_NOABORT systems should not be trying to change
+ * permissions on anything that is not pte-mapped in the first
+ * place. Just return early and let the permission change code
+ * raise a warning if not already pte-mapped.
+ */
+ if (system_capabilities_finalized())
+ return 0;
+
+ /*
+ * Boot-time: split_kernel_leaf_mapping_locked() allocates from
+ * page allocator. Can't split until it's available.
+ */
+ if (WARN_ON(!page_alloc_available))
+ return -EBUSY;
+
+ /*
+ * Boot-time: Started secondary cpus but don't know if they
+ * support BBML2_NOABORT yet. Can't allow splitting in this
+ * window in case they don't.
+ */
+ if (WARN_ON(num_online_cpus() > 1))
+ return -EBUSY;
+ }
+
/*
* Ensure start and end are at least page-aligned since this is the
* finest granularity we can split to.
@@ -891,8 +919,6 @@ static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp
return ret;
}
-static bool linear_map_requires_bbml2 __initdata;
-
u32 idmap_kpti_bbml2_flag;
static void __init init_idmap_kpti_bbml2_flag(void)
@@ -1226,7 +1252,7 @@ static void __init declare_vma(struct vm_struct *vma,
static phys_addr_t kpti_ng_temp_alloc __initdata;
-static phys_addr_t __init kpti_ng_pgd_alloc(enum pgtable_type type)
+static phys_addr_t __init kpti_ng_pgd_alloc(enum pgtable_level pgtable_level)
{
kpti_ng_temp_alloc -= PAGE_SIZE;
return kpti_ng_temp_alloc;
@@ -1458,10 +1484,14 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
WARN_ON(!pte_present(pte));
__pte_clear(&init_mm, addr, ptep);
- flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
- if (free_mapped)
+ if (free_mapped) {
+ /* CONT blocks are not supported in the vmemmap */
+ WARN_ON(pte_cont(pte));
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
free_hotplug_page_range(pte_page(pte),
PAGE_SIZE, altmap);
+ }
+ /* unmap_hotplug_range() flushes TLB for !free_mapped */
} while (addr += PAGE_SIZE, addr < end);
}
@@ -1480,17 +1510,16 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
continue;
WARN_ON(!pmd_present(pmd));
- if (pmd_sect(pmd)) {
+ if (pmd_leaf(pmd)) {
pmd_clear(pmdp);
-
- /*
- * One TLBI should be sufficient here as the PMD_SIZE
- * range is mapped with a single block entry.
- */
- flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
- if (free_mapped)
+ if (free_mapped) {
+ /* CONT blocks are not supported in the vmemmap */
+ WARN_ON(pmd_cont(pmd));
+ flush_tlb_kernel_range(addr, addr + PMD_SIZE);
free_hotplug_page_range(pmd_page(pmd),
PMD_SIZE, altmap);
+ }
+ /* unmap_hotplug_range() flushes TLB for !free_mapped */
continue;
}
WARN_ON(!pmd_table(pmd));
@@ -1513,17 +1542,14 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
continue;
WARN_ON(!pud_present(pud));
- if (pud_sect(pud)) {
+ if (pud_leaf(pud)) {
pud_clear(pudp);
-
- /*
- * One TLBI should be sufficient here as the PUD_SIZE
- * range is mapped with a single block entry.
- */
- flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
- if (free_mapped)
+ if (free_mapped) {
+ flush_tlb_kernel_range(addr, addr + PUD_SIZE);
free_hotplug_page_range(pud_page(pud),
PUD_SIZE, altmap);
+ }
+ /* unmap_hotplug_range() flushes TLB for !free_mapped */
continue;
}
WARN_ON(!pud_table(pud));
@@ -1553,6 +1579,7 @@ static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
static void unmap_hotplug_range(unsigned long addr, unsigned long end,
bool free_mapped, struct vmem_altmap *altmap)
{
+ unsigned long start = addr;
unsigned long next;
pgd_t *pgdp, pgd;
@@ -1574,6 +1601,9 @@ static void unmap_hotplug_range(unsigned long addr, unsigned long end,
WARN_ON(!pgd_present(pgd));
unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
} while (addr = next, addr < end);
+
+ if (!free_mapped)
+ flush_tlb_kernel_range(start, end);
}
static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
@@ -1627,7 +1657,7 @@ static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
if (pmd_none(pmd))
continue;
- WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
+ WARN_ON(!pmd_present(pmd) || !pmd_table(pmd));
free_empty_pte_table(pmdp, addr, next, floor, ceiling);
} while (addr = next, addr < end);
@@ -1667,7 +1697,7 @@ static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
if (pud_none(pud))
continue;
- WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
+ WARN_ON(!pud_present(pud) || !pud_table(pud));
free_empty_pmd_table(pudp, addr, next, floor, ceiling);
} while (addr = next, addr < end);
@@ -1763,7 +1793,7 @@ int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
{
vmemmap_verify((pte_t *)pmdp, node, addr, next);
- return pmd_sect(READ_ONCE(*pmdp));
+ return pmd_leaf(READ_ONCE(*pmdp));
}
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
@@ -1827,7 +1857,7 @@ void p4d_clear_huge(p4d_t *p4dp)
int pud_clear_huge(pud_t *pudp)
{
- if (!pud_sect(READ_ONCE(*pudp)))
+ if (!pud_leaf(READ_ONCE(*pudp)))
return 0;
pud_clear(pudp);
return 1;
@@ -1835,7 +1865,7 @@ int pud_clear_huge(pud_t *pudp)
int pmd_clear_huge(pmd_t *pmdp)
{
- if (!pmd_sect(READ_ONCE(*pmdp)))
+ if (!pmd_leaf(READ_ONCE(*pmdp)))
return 0;
pmd_clear(pmdp);
return 1;
@@ -2010,6 +2040,107 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
}
+
+static bool addr_splits_kernel_leaf(unsigned long addr)
+{
+ pgd_t *pgdp, pgd;
+ p4d_t *p4dp, p4d;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
+
+ /*
+ * If the given address points at the start address of
+ * a possible leaf, we certainly won't split. Otherwise,
+ * check if we would actually split a leaf by traversing
+ * the page tables further.
+ */
+ if (IS_ALIGNED(addr, PGDIR_SIZE))
+ return false;
+
+ pgdp = pgd_offset_k(addr);
+ pgd = pgdp_get(pgdp);
+ if (!pgd_present(pgd))
+ return false;
+
+ if (IS_ALIGNED(addr, P4D_SIZE))
+ return false;
+
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = p4dp_get(p4dp);
+ if (!p4d_present(p4d))
+ return false;
+
+ if (IS_ALIGNED(addr, PUD_SIZE))
+ return false;
+
+ pudp = pud_offset(p4dp, addr);
+ pud = pudp_get(pudp);
+ if (!pud_present(pud))
+ return false;
+
+ if (pud_leaf(pud))
+ return true;
+
+ if (IS_ALIGNED(addr, CONT_PMD_SIZE))
+ return false;
+
+ pmdp = pmd_offset(pudp, addr);
+ pmd = pmdp_get(pmdp);
+ if (!pmd_present(pmd))
+ return false;
+
+ if (pmd_cont(pmd))
+ return true;
+
+ if (IS_ALIGNED(addr, PMD_SIZE))
+ return false;
+
+ if (pmd_leaf(pmd))
+ return true;
+
+ if (IS_ALIGNED(addr, CONT_PTE_SIZE))
+ return false;
+
+ ptep = pte_offset_kernel(pmdp, addr);
+ pte = __ptep_get(ptep);
+ if (!pte_present(pte))
+ return false;
+
+ if (pte_cont(pte))
+ return true;
+
+ return !IS_ALIGNED(addr, PAGE_SIZE);
+}
+
+static bool can_unmap_without_split(unsigned long pfn, unsigned long nr_pages)
+{
+ unsigned long phys_start, phys_end, start, end;
+
+ phys_start = PFN_PHYS(pfn);
+ phys_end = phys_start + nr_pages * PAGE_SIZE;
+
+ /* PFN range's linear map edges are leaf entry aligned */
+ start = __phys_to_virt(phys_start);
+ end = __phys_to_virt(phys_end);
+ if (addr_splits_kernel_leaf(start) || addr_splits_kernel_leaf(end)) {
+ pr_warn("[%lx %lx] splits a leaf entry in linear map\n",
+ phys_start, phys_end);
+ return false;
+ }
+
+ /* PFN range's vmemmap edges are leaf entry aligned */
+ BUILD_BUG_ON(!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP));
+ start = (unsigned long)pfn_to_page(pfn);
+ end = (unsigned long)pfn_to_page(pfn + nr_pages);
+ if (addr_splits_kernel_leaf(start) || addr_splits_kernel_leaf(end)) {
+ pr_warn("[%lx %lx] splits a leaf entry in vmemmap\n",
+ phys_start, phys_end);
+ return false;
+ }
+ return true;
+}
+
/*
* This memory hotplug notifier helps prevent boot memory from being
* inadvertently removed as it blocks pfn range offlining process in
@@ -2018,8 +2149,11 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
* In future if and when boot memory could be removed, this notifier
* should be dropped and free_hotplug_page_range() should handle any
* reserved pages allocated during boot.
+ *
+ * This also blocks any memory remove that would have caused a split
+ * of a leaf entry in the kernel linear map or vmemmap.
*/
-static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
+static int prevent_memory_remove_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
struct mem_section *ms;
@@ -2065,11 +2199,15 @@ static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
return NOTIFY_DONE;
}
}
+
+ if (!can_unmap_without_split(pfn, arg->nr_pages))
+ return NOTIFY_BAD;
+
return NOTIFY_OK;
}
-static struct notifier_block prevent_bootmem_remove_nb = {
- .notifier_call = prevent_bootmem_remove_notifier,
+static struct notifier_block prevent_memory_remove_nb = {
+ .notifier_call = prevent_memory_remove_notifier,
};
/*
@@ -2119,7 +2257,7 @@ static void validate_bootmem_online(void)
}
}
-static int __init prevent_bootmem_remove_init(void)
+static int __init prevent_memory_remove_init(void)
{
int ret = 0;
@@ -2127,13 +2265,13 @@ static int __init prevent_bootmem_remove_init(void)
return ret;
validate_bootmem_online();
- ret = register_memory_notifier(&prevent_bootmem_remove_nb);
+ ret = register_memory_notifier(&prevent_memory_remove_nb);
if (ret)
pr_err("%s: Notifier registration failed %d\n", __func__, ret);
return ret;
}
-early_initcall(prevent_bootmem_remove_init);
+early_initcall(prevent_memory_remove_init);
#endif
pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
@@ -2149,7 +2287,7 @@ pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr,
*/
if (pte_accessible(vma->vm_mm, pte) && pte_user_exec(pte))
__flush_tlb_range(vma, addr, nr * PAGE_SIZE,
- PAGE_SIZE, true, 3);
+ PAGE_SIZE, 3, TLBF_NOWALKCACHE);
}
return pte;
@@ -2188,7 +2326,7 @@ void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)
phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp));
if (cnp)
- ttbr1 |= TTBR_CNP_BIT;
+ ttbr1 |= TTBRx_EL1_CnP;
replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1);
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 358d1dc9a576..ce035e1b4eaf 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -25,6 +25,11 @@ static ptdesc_t set_pageattr_masks(ptdesc_t val, struct mm_walk *walk)
{
struct page_change_data *masks = walk->private;
+ /*
+ * Some users clear and set bits which alias each other (e.g. PTE_NG and
+ * PTE_PRESENT_INVALID). It is therefore important that we always clear
+ * first then set.
+ */
val &= ~(pgprot_val(masks->clear_mask));
val |= (pgprot_val(masks->set_mask));
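A minimal sketch of why the ordering matters (the helper name is illustrative and not part of the patch; the bit aliasing is the PTE_NG / PTE_PRESENT_INVALID overlap the comment above describes): if the set and clear masks touch bits in the same position, applying the set first would be undone by the clear.

/* Illustrative sketch only: the set-before-clear ordering loses aliased bits. */
static ptdesc_t demo_wrong_order(ptdesc_t val, struct page_change_data *masks)
{
	val |= pgprot_val(masks->set_mask);	/* bit goes in ...        */
	val &= ~pgprot_val(masks->clear_mask);	/* ... and is wiped again */
	return val;
}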
@@ -36,7 +41,7 @@ static int pageattr_pud_entry(pud_t *pud, unsigned long addr,
{
pud_t val = pudp_get(pud);
- if (pud_sect(val)) {
+ if (pud_leaf(val)) {
if (WARN_ON_ONCE((next - addr) != PUD_SIZE))
return -EINVAL;
val = __pud(set_pageattr_masks(pud_val(val), walk));
@@ -52,7 +57,7 @@ static int pageattr_pmd_entry(pmd_t *pmd, unsigned long addr,
{
pmd_t val = pmdp_get(pmd);
- if (pmd_sect(val)) {
+ if (pmd_leaf(val)) {
if (WARN_ON_ONCE((next - addr) != PMD_SIZE))
return -EINVAL;
val = __pmd(set_pageattr_masks(pmd_val(val), walk));
@@ -132,11 +137,12 @@ static int __change_memory_common(unsigned long start, unsigned long size,
ret = update_range_prot(start, size, set_mask, clear_mask);
/*
- * If the memory is being made valid without changing any other bits
- * then a TLBI isn't required as a non-valid entry cannot be cached in
- * the TLB.
+ * If the memory is being switched from present-invalid to valid without
+ * changing any other bits then a TLBI isn't required as a non-valid
+ * entry cannot be cached in the TLB.
*/
- if (pgprot_val(set_mask) != PTE_VALID || pgprot_val(clear_mask))
+ if (pgprot_val(set_mask) != PTE_PRESENT_VALID_KERNEL ||
+ pgprot_val(clear_mask) != PTE_PRESENT_INVALID)
flush_tlb_kernel_range(start, start + size);
return ret;
}
@@ -237,18 +243,18 @@ int set_memory_valid(unsigned long addr, int numpages, int enable)
{
if (enable)
return __change_memory_common(addr, PAGE_SIZE * numpages,
- __pgprot(PTE_VALID),
- __pgprot(0));
+ __pgprot(PTE_PRESENT_VALID_KERNEL),
+ __pgprot(PTE_PRESENT_INVALID));
else
return __change_memory_common(addr, PAGE_SIZE * numpages,
- __pgprot(0),
- __pgprot(PTE_VALID));
+ __pgprot(PTE_PRESENT_INVALID),
+ __pgprot(PTE_PRESENT_VALID_KERNEL));
}
int set_direct_map_invalid_noflush(struct page *page)
{
- pgprot_t clear_mask = __pgprot(PTE_VALID);
- pgprot_t set_mask = __pgprot(0);
+ pgprot_t clear_mask = __pgprot(PTE_PRESENT_VALID_KERNEL);
+ pgprot_t set_mask = __pgprot(PTE_PRESENT_INVALID);
if (!can_set_direct_map())
return 0;
@@ -259,8 +265,8 @@ int set_direct_map_invalid_noflush(struct page *page)
int set_direct_map_default_noflush(struct page *page)
{
- pgprot_t set_mask = __pgprot(PTE_VALID | PTE_WRITE);
- pgprot_t clear_mask = __pgprot(PTE_RDONLY);
+ pgprot_t set_mask = __pgprot(PTE_PRESENT_VALID_KERNEL | PTE_WRITE);
+ pgprot_t clear_mask = __pgprot(PTE_PRESENT_INVALID | PTE_RDONLY);
if (!can_set_direct_map())
return 0;
@@ -296,8 +302,8 @@ static int __set_memory_enc_dec(unsigned long addr,
* entries or Synchronous External Aborts caused by RIPAS_EMPTY
*/
ret = __change_memory_common(addr, PAGE_SIZE * numpages,
- __pgprot(set_prot),
- __pgprot(clear_prot | PTE_VALID));
+ __pgprot(set_prot | PTE_PRESENT_INVALID),
+ __pgprot(clear_prot | PTE_PRESENT_VALID_KERNEL));
if (ret)
return ret;
@@ -311,8 +317,8 @@ static int __set_memory_enc_dec(unsigned long addr,
return ret;
return __change_memory_common(addr, PAGE_SIZE * numpages,
- __pgprot(PTE_VALID),
- __pgprot(0));
+ __pgprot(PTE_PRESENT_VALID_KERNEL),
+ __pgprot(PTE_PRESENT_INVALID));
}
static int realm_set_memory_encrypted(unsigned long addr, int numpages)
@@ -404,15 +410,15 @@ bool kernel_page_present(struct page *page)
pud = READ_ONCE(*pudp);
if (pud_none(pud))
return false;
- if (pud_sect(pud))
- return true;
+ if (pud_leaf(pud))
+ return pud_valid(pud);
pmdp = pmd_offset(pudp, addr);
pmd = READ_ONCE(*pmdp);
if (pmd_none(pmd))
return false;
- if (pmd_sect(pmd))
- return true;
+ if (pmd_leaf(pmd))
+ return pmd_valid(pmd);
ptep = pte_offset_kernel(pmdp, addr);
return pte_valid(__ptep_get(ptep));
diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c
index 18543b603c77..cca9706a875c 100644
--- a/arch/arm64/mm/trans_pgd.c
+++ b/arch/arm64/mm/trans_pgd.c
@@ -31,36 +31,6 @@ static void *trans_alloc(struct trans_pgd_info *info)
return info->trans_alloc_page(info->trans_alloc_arg);
}
-static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
-{
- pte_t pte = __ptep_get(src_ptep);
-
- if (pte_valid(pte)) {
- /*
- * Resume will overwrite areas that may be marked
- * read only (code, rodata). Clear the RDONLY bit from
- * the temporary mappings we use during restore.
- */
- __set_pte(dst_ptep, pte_mkwrite_novma(pte));
- } else if (!pte_none(pte)) {
- /*
- * debug_pagealloc will removed the PTE_VALID bit if
- * the page isn't in use by the resume kernel. It may have
- * been in use by the original kernel, in which case we need
- * to put it back in our copy to do the restore.
- *
- * Other cases include kfence / vmalloc / memfd_secret which
- * may call `set_direct_map_invalid_noflush()`.
- *
- * Before marking this entry valid, check the pfn should
- * be mapped.
- */
- BUG_ON(!pfn_valid(pte_pfn(pte)));
-
- __set_pte(dst_ptep, pte_mkvalid(pte_mkwrite_novma(pte)));
- }
-}
-
static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp,
pmd_t *src_pmdp, unsigned long start, unsigned long end)
{
@@ -76,7 +46,11 @@ static int copy_pte(struct trans_pgd_info *info, pmd_t *dst_pmdp,
src_ptep = pte_offset_kernel(src_pmdp, start);
do {
- _copy_pte(dst_ptep, src_ptep, addr);
+ pte_t pte = __ptep_get(src_ptep);
+
+ if (pte_none(pte))
+ continue;
+ __set_pte(dst_ptep, pte_mkvalid_k(pte_mkwrite_novma(pte)));
} while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end);
return 0;
@@ -109,8 +83,7 @@ static int copy_pmd(struct trans_pgd_info *info, pud_t *dst_pudp,
if (copy_pte(info, dst_pmdp, src_pmdp, addr, next))
return -ENOMEM;
} else {
- set_pmd(dst_pmdp,
- __pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY));
+ set_pmd(dst_pmdp, pmd_mkvalid_k(pmd_mkwrite_novma(pmd)));
}
} while (dst_pmdp++, src_pmdp++, addr = next, addr != end);
@@ -145,8 +118,7 @@ static int copy_pud(struct trans_pgd_info *info, p4d_t *dst_p4dp,
if (copy_pmd(info, dst_pudp, src_pudp, addr, next))
return -ENOMEM;
} else {
- set_pud(dst_pudp,
- __pud(pud_val(pud) & ~PUD_SECT_RDONLY));
+ set_pud(dst_pudp, pud_mkvalid_k(pud_mkwrite_novma(pud)));
}
} while (dst_pudp++, src_pudp++, addr = next, addr != end);
diff --git a/arch/arm64/tools/Makefile b/arch/arm64/tools/Makefile
index c2b34e761006..a94b3d9caad6 100644
--- a/arch/arm64/tools/Makefile
+++ b/arch/arm64/tools/Makefile
@@ -3,7 +3,7 @@
gen := arch/$(ARCH)/include/generated
kapi := $(gen)/asm
-kapisyshdr-y := cpucap-defs.h sysreg-defs.h
+kapisyshdr-y := cpucap-defs.h kernel-hwcap.h sysreg-defs.h
kapi-hdrs-y := $(addprefix $(kapi)/, $(kapisyshdr-y))
@@ -18,11 +18,17 @@ kapi: $(kapi-hdrs-y)
quiet_cmd_gen_cpucaps = GEN $@
cmd_gen_cpucaps = mkdir -p $(dir $@); $(AWK) -f $(real-prereqs) > $@
+quiet_cmd_gen_kernel_hwcap = GEN $@
+ cmd_gen_kernel_hwcap = mkdir -p $(dir $@); /bin/sh -e $(real-prereqs) > $@
+
quiet_cmd_gen_sysreg = GEN $@
cmd_gen_sysreg = mkdir -p $(dir $@); $(AWK) -f $(real-prereqs) > $@
$(kapi)/cpucap-defs.h: $(src)/gen-cpucaps.awk $(src)/cpucaps FORCE
$(call if_changed,gen_cpucaps)
+$(kapi)/kernel-hwcap.h: $(src)/gen-kernel-hwcaps.sh $(srctree)/arch/arm64/include/uapi/asm/hwcap.h FORCE
+ $(call if_changed,gen_kernel_hwcap)
+
$(kapi)/sysreg-defs.h: $(src)/gen-sysreg.awk $(src)/sysreg FORCE
$(call if_changed,gen_sysreg)
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 7261553b644b..b7286d977788 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -48,6 +48,7 @@ HAS_LPA2
HAS_LSE_ATOMICS
HAS_LS64
HAS_LS64_V
+HAS_LSUI
HAS_MOPS
HAS_NESTED_VIRT
HAS_BBML2_NOABORT
diff --git a/arch/arm64/tools/gen-kernel-hwcaps.sh b/arch/arm64/tools/gen-kernel-hwcaps.sh
new file mode 100644
index 000000000000..e7cdcf428d91
--- /dev/null
+++ b/arch/arm64/tools/gen-kernel-hwcaps.sh
@@ -0,0 +1,23 @@
+#!/bin/sh -e
+# SPDX-License-Identifier: GPL-2.0
+#
+# gen-kernel-hwcaps.sh - Generate kernel internal hwcap.h definitions
+#
+# Copyright 2026 Arm, Ltd.
+
+if [ "$1" = "" ]; then
+ echo "$0: no filename specified"
+ exit 1
+fi
+
+echo "#ifndef __ASM_KERNEL_HWCAPS_H"
+echo "#define __ASM_KERNEL_HWCAPS_H"
+echo ""
+echo "/* Generated file - do not edit */"
+echo ""
+
+grep -E '^#define HWCAP[0-9]*_[A-Z0-9_]+' $1 | \
+ sed 's/.*HWCAP\([0-9]*\)_\([A-Z0-9_]\+\).*/#define KERNEL_HWCAP_\2\t__khwcap\1_feature(\2)/'
+
+echo ""
+echo "#endif /* __ASM_KERNEL_HWCAPS_H */"
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index 9d1c21108057..9d20ec6816d4 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -1496,6 +1496,7 @@ UnsignedEnum 27:24 B16B16
0b0000 NI
0b0001 IMP
0b0010 BFSCALE
+ 0b0011 B16MM
EndEnum
UnsignedEnum 23:20 BF16
0b0000 NI
@@ -1522,6 +1523,7 @@ UnsignedEnum 3:0 SVEver
0b0001 SVE2
0b0010 SVE2p1
0b0011 SVE2p2
+ 0b0100 SVE2p3
EndEnum
EndSysreg
@@ -1530,7 +1532,11 @@ UnsignedEnum 63 FA64
0b0 NI
0b1 IMP
EndEnum
-Res0 62:61
+Res0 62
+UnsignedEnum 61 LUT6
+ 0b0 NI
+ 0b1 IMP
+EndEnum
UnsignedEnum 60 LUTv2
0b0 NI
0b1 IMP
@@ -1540,6 +1546,7 @@ UnsignedEnum 59:56 SMEver
0b0001 SME2
0b0010 SME2p1
0b0011 SME2p2
+ 0b0100 SME2p3
EndEnum
UnsignedEnum 55:52 I16I64
0b0000 NI
@@ -1654,7 +1661,13 @@ UnsignedEnum 26 F8MM4
0b0 NI
0b1 IMP
EndEnum
-Res0 25:2
+Res0 25:16
+UnsignedEnum 15 F16MM2
+ 0b0 NI
+ 0b1 IMP
+EndEnum
+Res0 14:8
+Raz 7:2
UnsignedEnum 1 F8E4M3
0b0 NI
0b1 IMP
@@ -1835,6 +1848,8 @@ EndEnum
UnsignedEnum 51:48 FHM
0b0000 NI
0b0001 IMP
+ 0b0010 F16F32DOT
+ 0b0011 F16F32MM
EndEnum
UnsignedEnum 47:44 DP
0b0000 NI
@@ -1976,6 +1991,7 @@ EndEnum
UnsignedEnum 59:56 LUT
0b0000 NI
0b0001 IMP
+ 0b0010 LUT6
EndEnum
UnsignedEnum 55:52 CSSC
0b0000 NI
@@ -3655,11 +3671,15 @@ Field 3:0 BS
EndSysreg
Sysreg SMIDR_EL1 3 1 0 0 6
-Res0 63:32
+Res0 63:60
+Field 59:56 NSMC
+Field 55:52 HIP
+Field 51:32 AFFINITY2
Field 31:24 IMPLEMENTER
Field 23:16 REVISION
Field 15 SMPS
-Res0 14:12
+Field 14:13 SH
+Res0 12
Field 11:0 AFFINITY
EndSysreg
@@ -5172,6 +5192,14 @@ Field 31:16 PARTID_D
Field 15:0 PARTID_I
EndSysreg
+Sysreg MPAMSM_EL1 3 0 10 5 3
+Res0 63:48
+Field 47:40 PMG_D
+Res0 39:32
+Field 31:16 PARTID_D
+Res0 15:0
+EndSysreg
+
Sysreg ISR_EL1 3 0 12 1 0
Res0 63:11
Field 10 IS
diff --git a/drivers/acpi/arm64/agdi.c b/drivers/acpi/arm64/agdi.c
index feb4b2cb4618..0c2d9d6c160b 100644
--- a/drivers/acpi/arm64/agdi.c
+++ b/drivers/acpi/arm64/agdi.c
@@ -36,7 +36,7 @@ static int agdi_sdei_probe(struct platform_device *pdev,
err = sdei_event_register(adata->sdei_event, agdi_sdei_handler, pdev);
if (err) {
- dev_err(&pdev->dev, "Failed to register for SDEI event %d",
+ dev_err(&pdev->dev, "Failed to register for SDEI event %d\n",
adata->sdei_event);
return err;
}
diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig
index c808e0470394..672abea3b03c 100644
--- a/drivers/resctrl/Kconfig
+++ b/drivers/resctrl/Kconfig
@@ -1,6 +1,7 @@
menuconfig ARM64_MPAM_DRIVER
bool "MPAM driver"
- depends on ARM64 && ARM64_MPAM && EXPERT
+ depends on ARM64 && ARM64_MPAM
+ select ACPI_MPAM if ACPI
help
Memory System Resource Partitioning and Monitoring (MPAM) driver for
System IP, e.g. caches and memory controllers.
@@ -22,3 +23,9 @@ config MPAM_KUNIT_TEST
If unsure, say N.
endif
+
+config ARM64_MPAM_RESCTRL_FS
+ bool
+ default y if ARM64_MPAM_DRIVER && RESCTRL_FS
+ select RESCTRL_RMID_DEPENDS_ON_CLOSID
+ select RESCTRL_ASSIGN_FIXED
diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile
index 898199dcf80d..4f6d0e81f9b8 100644
--- a/drivers/resctrl/Makefile
+++ b/drivers/resctrl/Makefile
@@ -1,4 +1,5 @@
obj-$(CONFIG_ARM64_MPAM_DRIVER) += mpam.o
mpam-y += mpam_devices.o
+mpam-$(CONFIG_ARM64_MPAM_RESCTRL_FS) += mpam_resctrl.o
ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG
diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
index 1eebc2602187..3e419e59a36f 100644
--- a/drivers/resctrl/mpam_devices.c
+++ b/drivers/resctrl/mpam_devices.c
@@ -29,7 +29,15 @@
#include "mpam_internal.h"
-DEFINE_STATIC_KEY_FALSE(mpam_enabled); /* This moves to arch code */
+/* Values for the T241 errata workaround */
+#define T241_CHIPS_MAX 4
+#define T241_CHIP_NSLICES 12
+#define T241_SPARE_REG0_OFF 0x1b0000
+#define T241_SPARE_REG1_OFF 0x1c0000
+#define T241_CHIP_ID(phys) FIELD_GET(GENMASK_ULL(44, 43), phys)
+#define T241_SHADOW_REG_OFF(sidx, pid) (0x360048 + (sidx) * 0x10000 + (pid) * 8)
+#define SMCCC_SOC_ID_T241 0x036b0241
+static void __iomem *t241_scratch_regs[T241_CHIPS_MAX];
/*
* mpam_list_lock protects the SRCU lists when writing. Once the
@@ -76,6 +84,14 @@ static DECLARE_WORK(mpam_broken_work, &mpam_disable);
static char *mpam_disable_reason;
/*
+ * Whether resctrl has been set up. Used by cpuhp in preference to
+ * mpam_is_enabled(). The disable call after an error interrupt makes
+ * mpam_is_enabled() false before the cpuhp callbacks are made.
+ * Reads/writes should hold mpam_cpuhp_state_lock (or be made from cpuhp callbacks).
+ */
+static bool mpam_resctrl_enabled;
+
+/*
* An MSC is a physical container for controls and monitors, each identified by
* their RIS index. These share a base-address, interrupts and some MMIO
* registers. A vMSC is a virtual container for RIS in an MSC that control or
@@ -624,6 +640,86 @@ static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc,
return ERR_PTR(-ENOENT);
}
+static int mpam_enable_quirk_nvidia_t241_1(struct mpam_msc *msc,
+ const struct mpam_quirk *quirk)
+{
+ s32 soc_id = arm_smccc_get_soc_id_version();
+ struct resource *r;
+ phys_addr_t phys;
+
+ /*
+	 * A mapping to a device other than the MSC is needed; check the
+	 * SOC_ID is the NVIDIA T241 chip (036b:0241).
+ */
+ if (soc_id < 0 || soc_id != SMCCC_SOC_ID_T241)
+ return -EINVAL;
+
+ r = platform_get_resource(msc->pdev, IORESOURCE_MEM, 0);
+ if (!r)
+ return -EINVAL;
+
+ /* Find the internal registers base addr from the CHIP ID */
+ msc->t241_id = T241_CHIP_ID(r->start);
+ phys = FIELD_PREP(GENMASK_ULL(45, 44), msc->t241_id) | 0x19000000ULL;
+
+ t241_scratch_regs[msc->t241_id] = ioremap(phys, SZ_8M);
+ if (WARN_ON_ONCE(!t241_scratch_regs[msc->t241_id]))
+ return -EINVAL;
+
+ pr_info_once("Enabled workaround for NVIDIA T241 erratum T241-MPAM-1\n");
+
+ return 0;
+}
+
+static const struct mpam_quirk mpam_quirks[] = {
+ {
+ /* NVIDIA t241 erratum T241-MPAM-1 */
+ .init = mpam_enable_quirk_nvidia_t241_1,
+ .iidr = MPAM_IIDR_NVIDIA_T241,
+ .iidr_mask = MPAM_IIDR_MATCH_ONE,
+ .workaround = T241_SCRUB_SHADOW_REGS,
+ },
+ {
+ /* NVIDIA t241 erratum T241-MPAM-4 */
+ .iidr = MPAM_IIDR_NVIDIA_T241,
+ .iidr_mask = MPAM_IIDR_MATCH_ONE,
+ .workaround = T241_FORCE_MBW_MIN_TO_ONE,
+ },
+ {
+ /* NVIDIA t241 erratum T241-MPAM-6 */
+ .iidr = MPAM_IIDR_NVIDIA_T241,
+ .iidr_mask = MPAM_IIDR_MATCH_ONE,
+ .workaround = T241_MBW_COUNTER_SCALE_64,
+ },
+ {
+ /* ARM CMN-650 CSU erratum 3642720 */
+ .iidr = MPAM_IIDR_ARM_CMN_650,
+ .iidr_mask = MPAM_IIDR_MATCH_ONE,
+ .workaround = IGNORE_CSU_NRDY,
+ },
+ { NULL } /* Sentinel */
+};
+
+static void mpam_enable_quirks(struct mpam_msc *msc)
+{
+ const struct mpam_quirk *quirk;
+
+ for (quirk = &mpam_quirks[0]; quirk->iidr_mask; quirk++) {
+ int err = 0;
+
+ if (quirk->iidr != (msc->iidr & quirk->iidr_mask))
+ continue;
+
+ if (quirk->init)
+ err = quirk->init(msc, quirk);
+
+ if (err)
+ continue;
+
+ mpam_set_quirk(quirk->workaround, msc);
+ }
+}
+
/*
* IHI009A.a has this nugget: "If a monitor does not support automatic behaviour
* of NRDY, software can use this bit for any purpose" - so hardware might not
@@ -715,6 +811,13 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris)
mpam_set_feature(mpam_feat_mbw_part, props);
props->bwa_wd = FIELD_GET(MPAMF_MBW_IDR_BWA_WD, mbw_features);
+
+ /*
+ * The BWA_WD field can represent 0-63, but the control fields it
+ * describes have a maximum of 16 bits.
+ */
+ props->bwa_wd = min(props->bwa_wd, 16);
+
if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features))
mpam_set_feature(mpam_feat_mbw_max, props);
@@ -851,8 +954,11 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc)
/* Grab an IDR value to find out how many RIS there are */
mutex_lock(&msc->part_sel_lock);
idr = mpam_msc_read_idr(msc);
+ msc->iidr = mpam_read_partsel_reg(msc, IIDR);
mutex_unlock(&msc->part_sel_lock);
+ mpam_enable_quirks(msc);
+
msc->ris_max = FIELD_GET(MPAMF_IDR_RIS_MAX, idr);
/* Use these values so partid/pmg always starts with a valid value */
@@ -903,6 +1009,7 @@ struct mon_read {
enum mpam_device_features type;
u64 *val;
int err;
+ bool waited_timeout;
};
static bool mpam_ris_has_mbwu_long_counter(struct mpam_msc_ris *ris)
@@ -1052,7 +1159,7 @@ static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val,
}
}
-static u64 mpam_msmon_overflow_val(enum mpam_device_features type)
+static u64 __mpam_msmon_overflow_val(enum mpam_device_features type)
{
/* TODO: implement scaling counters */
switch (type) {
@@ -1067,6 +1174,18 @@ static u64 mpam_msmon_overflow_val(enum mpam_device_features type)
}
}
+static u64 mpam_msmon_overflow_val(enum mpam_device_features type,
+ struct mpam_msc *msc)
+{
+ u64 overflow_val = __mpam_msmon_overflow_val(type);
+
+ if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc) &&
+ type != mpam_feat_msmon_mbwu_63counter)
+ overflow_val *= 64;
+
+ return overflow_val;
+}
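As a hedged illustration of the scaling (the 2^31 figure is an assumption about what __mpam_msmon_overflow_val() returns for that counter type, not taken from this hunk): if the base overflow value for a 31-bit MBWU counter is 2^31, an MSC with T241_MBW_COUNTER_SCALE_64 set would accumulate a correction of 2^31 * 64 per overflow, matching the x64 scaling applied to live counter reads further down in __ris_msmon_read().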
+
static void __ris_msmon_read(void *arg)
{
u64 now;
@@ -1137,6 +1256,10 @@ static void __ris_msmon_read(void *arg)
if (mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, rprops))
nrdy = now & MSMON___NRDY;
now = FIELD_GET(MSMON___VALUE, now);
+
+ if (mpam_has_quirk(IGNORE_CSU_NRDY, msc) && m->waited_timeout)
+ nrdy = false;
+
break;
case mpam_feat_msmon_mbwu_31counter:
case mpam_feat_msmon_mbwu_44counter:
@@ -1157,13 +1280,17 @@ static void __ris_msmon_read(void *arg)
now = FIELD_GET(MSMON___VALUE, now);
}
+ if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc) &&
+ m->type != mpam_feat_msmon_mbwu_63counter)
+ now *= 64;
+
if (nrdy)
break;
mbwu_state = &ris->mbwu_state[ctx->mon];
if (overflow)
- mbwu_state->correction += mpam_msmon_overflow_val(m->type);
+ mbwu_state->correction += mpam_msmon_overflow_val(m->type, msc);
/*
* Include bandwidth consumed before the last hardware reset and
@@ -1270,6 +1397,7 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx,
.ctx = ctx,
.type = type,
.val = val,
+ .waited_timeout = true,
};
*val = 0;
@@ -1338,6 +1466,75 @@ static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd)
__mpam_write_reg(msc, reg, bm);
}
+static void mpam_apply_t241_erratum(struct mpam_msc_ris *ris, u16 partid)
+{
+ int sidx, i, lcount = 1000;
+ void __iomem *regs;
+ u64 val0, val;
+
+ regs = t241_scratch_regs[ris->vmsc->msc->t241_id];
+
+ for (i = 0; i < lcount; i++) {
+ /* Read the shadow register at index 0 */
+ val0 = readq_relaxed(regs + T241_SHADOW_REG_OFF(0, partid));
+
+ /* Check if all the shadow registers have the same value */
+ for (sidx = 1; sidx < T241_CHIP_NSLICES; sidx++) {
+ val = readq_relaxed(regs +
+ T241_SHADOW_REG_OFF(sidx, partid));
+ if (val != val0)
+ break;
+ }
+ if (sidx == T241_CHIP_NSLICES)
+ break;
+ }
+
+ if (i == lcount)
+		pr_warn_once("t241: inconsistent values in shadow regs\n");
+
+	/* Write zero to the spare registers so the MBW configuration takes effect */
+ writeq_relaxed(0, regs + T241_SPARE_REG0_OFF);
+ writeq_relaxed(0, regs + T241_SPARE_REG1_OFF);
+}
+
+static void mpam_quirk_post_config_change(struct mpam_msc_ris *ris, u16 partid,
+ struct mpam_config *cfg)
+{
+ if (mpam_has_quirk(T241_SCRUB_SHADOW_REGS, ris->vmsc->msc))
+ mpam_apply_t241_erratum(ris, partid);
+}
+
+static u16 mpam_wa_t241_force_mbw_min_to_one(struct mpam_props *props)
+{
+ u16 max_hw_value, min_hw_granule, res0_bits;
+
+ res0_bits = 16 - props->bwa_wd;
+ max_hw_value = ((1 << props->bwa_wd) - 1) << res0_bits;
+ min_hw_granule = ~max_hw_value;
+
+ return min_hw_granule + 1;
+}
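A worked example of the helper above, using an assumed bwa_wd of 8: res0_bits = 16 - 8 = 8, max_hw_value = 0xff00, min_hw_granule = ~0xff00 = 0x00ff, and the function returns 0x0100 - the smallest non-zero value expressible with the implemented control bits, one hardware granule above zero.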
+
+static u16 mpam_wa_t241_calc_min_from_max(struct mpam_props *props,
+ struct mpam_config *cfg)
+{
+ u16 val = 0;
+ u16 max;
+ u16 delta = ((5 * MPAMCFG_MBW_MAX_MAX) / 100) - 1;
+
+ if (mpam_has_feature(mpam_feat_mbw_max, cfg)) {
+ max = cfg->mbw_max;
+ } else {
+		/* Resetting. Hence, use the RIS-specific default. */
+ max = GENMASK(15, 16 - props->bwa_wd);
+ }
+
+ if (max > delta)
+ val = max - delta;
+
+ return val;
+}
+
/* Called via IPI. Call while holding an SRCU reference */
static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid,
struct mpam_config *cfg)
@@ -1364,36 +1561,41 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid,
__mpam_intpart_sel(ris->ris_idx, partid, msc);
}
- if (mpam_has_feature(mpam_feat_cpor_part, rprops) &&
- mpam_has_feature(mpam_feat_cpor_part, cfg)) {
- if (cfg->reset_cpbm)
- mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd);
- else
+ if (mpam_has_feature(mpam_feat_cpor_part, rprops)) {
+ if (mpam_has_feature(mpam_feat_cpor_part, cfg))
mpam_write_partsel_reg(msc, CPBM, cfg->cpbm);
+ else
+ mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd);
}
- if (mpam_has_feature(mpam_feat_mbw_part, rprops) &&
- mpam_has_feature(mpam_feat_mbw_part, cfg)) {
- if (cfg->reset_mbw_pbm)
+ if (mpam_has_feature(mpam_feat_mbw_part, rprops)) {
+ if (mpam_has_feature(mpam_feat_mbw_part, cfg))
mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, rprops->mbw_pbm_bits);
else
mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm);
}
- if (mpam_has_feature(mpam_feat_mbw_min, rprops) &&
- mpam_has_feature(mpam_feat_mbw_min, cfg))
- mpam_write_partsel_reg(msc, MBW_MIN, 0);
+ if (mpam_has_feature(mpam_feat_mbw_min, rprops)) {
+ u16 val = 0;
- if (mpam_has_feature(mpam_feat_mbw_max, rprops) &&
- mpam_has_feature(mpam_feat_mbw_max, cfg)) {
- if (cfg->reset_mbw_max)
- mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX);
- else
+ if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, msc)) {
+ u16 min = mpam_wa_t241_force_mbw_min_to_one(rprops);
+
+ val = mpam_wa_t241_calc_min_from_max(rprops, cfg);
+ val = max(val, min);
+ }
+
+ mpam_write_partsel_reg(msc, MBW_MIN, val);
+ }
+
+ if (mpam_has_feature(mpam_feat_mbw_max, rprops)) {
+ if (mpam_has_feature(mpam_feat_mbw_max, cfg))
mpam_write_partsel_reg(msc, MBW_MAX, cfg->mbw_max);
+ else
+ mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX);
}
- if (mpam_has_feature(mpam_feat_mbw_prop, rprops) &&
- mpam_has_feature(mpam_feat_mbw_prop, cfg))
+ if (mpam_has_feature(mpam_feat_mbw_prop, rprops))
mpam_write_partsel_reg(msc, MBW_PROP, 0);
if (mpam_has_feature(mpam_feat_cmax_cmax, rprops))
@@ -1421,6 +1623,8 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid,
mpam_write_partsel_reg(msc, PRI, pri_val);
}
+ mpam_quirk_post_config_change(ris, partid, cfg);
+
mutex_unlock(&msc->part_sel_lock);
}
@@ -1491,16 +1695,6 @@ static int mpam_save_mbwu_state(void *arg)
return 0;
}
-static void mpam_init_reset_cfg(struct mpam_config *reset_cfg)
-{
- *reset_cfg = (struct mpam_config) {
- .reset_cpbm = true,
- .reset_mbw_pbm = true,
- .reset_mbw_max = true,
- };
- bitmap_fill(reset_cfg->features, MPAM_FEATURE_LAST);
-}
-
/*
* Called via smp_call_on_cpu() to prevent migration, while still being
* pre-emptible. Caller must hold mpam_srcu.
@@ -1508,14 +1702,12 @@ static void mpam_init_reset_cfg(struct mpam_config *reset_cfg)
static int mpam_reset_ris(void *arg)
{
u16 partid, partid_max;
- struct mpam_config reset_cfg;
+ struct mpam_config reset_cfg = {};
struct mpam_msc_ris *ris = arg;
if (ris->in_reset_state)
return 0;
- mpam_init_reset_cfg(&reset_cfg);
-
spin_lock(&partid_max_lock);
partid_max = mpam_partid_max;
spin_unlock(&partid_max_lock);
@@ -1630,6 +1822,9 @@ static int mpam_cpu_online(unsigned int cpu)
mpam_reprogram_msc(msc);
}
+ if (mpam_resctrl_enabled)
+ return mpam_resctrl_online_cpu(cpu);
+
return 0;
}
@@ -1673,6 +1868,9 @@ static int mpam_cpu_offline(unsigned int cpu)
{
struct mpam_msc *msc;
+ if (mpam_resctrl_enabled)
+ mpam_resctrl_offline_cpu(cpu);
+
guard(srcu)(&mpam_srcu);
list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list,
srcu_read_lock_held(&mpam_srcu)) {
@@ -1969,6 +2167,7 @@ static bool mpam_has_cmax_wd_feature(struct mpam_props *props)
* resulting safe value must be compatible with both. When merging values in
* the tree, all the aliasing resources must be handled first.
* On mismatch, parent is modified.
+ * Quirks on an MSC will apply to all MSCs in that class.
*/
static void __props_mismatch(struct mpam_props *parent,
struct mpam_props *child, bool alias)
@@ -2088,6 +2287,7 @@ static void __props_mismatch(struct mpam_props *parent,
* nobble the class feature, as we can't configure all the resources.
* e.g. The L3 cache is composed of two resources with 13 and 17 portion
* bitmaps respectively.
+ * Quirks on an MSC will apply to all MSCs in that class.
*/
static void
__class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc)
@@ -2101,6 +2301,9 @@ __class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc)
dev_dbg(dev, "Merging features for class:0x%lx &= vmsc:0x%lx\n",
(long)cprops->features, (long)vprops->features);
+ /* Merge quirks */
+ class->quirks |= vmsc->msc->quirks;
+
/* Take the safe value for any common features */
__props_mismatch(cprops, vprops, false);
}
@@ -2165,6 +2368,9 @@ static void mpam_enable_merge_class_features(struct mpam_component *comp)
list_for_each_entry(vmsc, &comp->vmsc, comp_list)
__class_props_mismatch(class, vmsc);
+
+ if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, class))
+ mpam_clear_feature(mpam_feat_mbw_min, &class->props);
}
/*
@@ -2518,6 +2724,12 @@ static void mpam_enable_once(void)
mutex_unlock(&mpam_list_lock);
cpus_read_unlock();
+ if (!err) {
+ err = mpam_resctrl_setup();
+ if (err)
+ pr_err("Failed to initialise resctrl: %d\n", err);
+ }
+
if (err) {
mpam_disable_reason = "Failed to enable.";
schedule_work(&mpam_broken_work);
@@ -2525,6 +2737,7 @@ static void mpam_enable_once(void)
}
static_branch_enable(&mpam_enabled);
+ mpam_resctrl_enabled = true;
mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline,
"mpam:online");
@@ -2557,7 +2770,7 @@ static void mpam_reset_component_locked(struct mpam_component *comp)
}
}
-static void mpam_reset_class_locked(struct mpam_class *class)
+void mpam_reset_class_locked(struct mpam_class *class)
{
struct mpam_component *comp;
@@ -2584,24 +2797,39 @@ static void mpam_reset_class(struct mpam_class *class)
void mpam_disable(struct work_struct *ignored)
{
int idx;
+ bool do_resctrl_exit;
struct mpam_class *class;
struct mpam_msc *msc, *tmp;
+ if (mpam_is_enabled())
+ static_branch_disable(&mpam_enabled);
+
mutex_lock(&mpam_cpuhp_state_lock);
if (mpam_cpuhp_state) {
cpuhp_remove_state(mpam_cpuhp_state);
mpam_cpuhp_state = 0;
}
+
+ /*
+ * Removing the cpuhp state called mpam_cpu_offline() and told resctrl
+ * all the CPUs are offline.
+ */
+ do_resctrl_exit = mpam_resctrl_enabled;
+ mpam_resctrl_enabled = false;
mutex_unlock(&mpam_cpuhp_state_lock);
- static_branch_disable(&mpam_enabled);
+ if (do_resctrl_exit)
+ mpam_resctrl_exit();
mpam_unregister_irqs();
idx = srcu_read_lock(&mpam_srcu);
list_for_each_entry_srcu(class, &mpam_classes, classes_list,
- srcu_read_lock_held(&mpam_srcu))
+ srcu_read_lock_held(&mpam_srcu)) {
mpam_reset_class(class);
+ if (do_resctrl_exit)
+ mpam_resctrl_teardown_class(class);
+ }
srcu_read_unlock(&mpam_srcu, idx);
mutex_lock(&mpam_list_lock);
@@ -2692,6 +2920,7 @@ int mpam_apply_config(struct mpam_component *comp, u16 partid,
srcu_read_lock_held(&mpam_srcu)) {
arg.ris = ris;
mpam_touch_msc(msc, __write_config, &arg);
+ ris->in_reset_state = false;
}
mutex_unlock(&msc->cfg_lock);
}
diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
index e8971842b124..1914aefdcba9 100644
--- a/drivers/resctrl/mpam_internal.h
+++ b/drivers/resctrl/mpam_internal.h
@@ -12,22 +12,31 @@
#include <linux/jump_label.h>
#include <linux/llist.h>
#include <linux/mutex.h>
+#include <linux/resctrl.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>
#include <linux/types.h>
+#include <asm/mpam.h>
+
#define MPAM_MSC_MAX_NUM_RIS 16
struct platform_device;
-DECLARE_STATIC_KEY_FALSE(mpam_enabled);
-
#ifdef CONFIG_MPAM_KUNIT_TEST
#define PACKED_FOR_KUNIT __packed
#else
#define PACKED_FOR_KUNIT
#endif
+/*
+ * These 'mon' values must not alias an actual monitor, so must be larger
+ * than U16_MAX, but must not be confused with an errno value, so must be
+ * smaller than (u32)-SZ_4K.
+ * USE_PRE_ALLOCATED is used to avoid confusion with an actual monitor.
+ */
+#define USE_PRE_ALLOCATED (U16_MAX + 1)
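A quick check of those bounds: U16_MAX + 1 is 0x10000, which cannot collide with any real 16-bit monitor index, and it is far below (u32)-SZ_4K = 0xfffff000, so it also cannot be misread as a negative errno when stored in a u32.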
+
static inline bool mpam_is_enabled(void)
{
return static_branch_likely(&mpam_enabled);
@@ -76,6 +85,8 @@ struct mpam_msc {
u8 pmg_max;
unsigned long ris_idxs;
u32 ris_max;
+ u32 iidr;
+ u16 quirks;
/*
* error_irq_lock is taken when registering/unregistering the error
@@ -119,6 +130,9 @@ struct mpam_msc {
void __iomem *mapped_hwpage;
size_t mapped_hwpage_sz;
+ /* Values only used on some platforms for quirks */
+ u32 t241_id;
+
struct mpam_garbage garbage;
};
@@ -207,6 +221,42 @@ struct mpam_props {
#define mpam_set_feature(_feat, x) __set_bit(_feat, (x)->features)
#define mpam_clear_feature(_feat, x) __clear_bit(_feat, (x)->features)
+/* Workaround bits for msc->quirks */
+enum mpam_device_quirks {
+ T241_SCRUB_SHADOW_REGS,
+ T241_FORCE_MBW_MIN_TO_ONE,
+ T241_MBW_COUNTER_SCALE_64,
+ IGNORE_CSU_NRDY,
+ MPAM_QUIRK_LAST
+};
+
+#define mpam_has_quirk(_quirk, x) ((1 << (_quirk) & (x)->quirks))
+#define mpam_set_quirk(_quirk, x) ((x)->quirks |= (1 << (_quirk)))
+
+struct mpam_quirk {
+ int (*init)(struct mpam_msc *msc, const struct mpam_quirk *quirk);
+
+ u32 iidr;
+ u32 iidr_mask;
+
+ enum mpam_device_quirks workaround;
+};
+
+#define MPAM_IIDR_MATCH_ONE (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0xfff) | \
+ FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0xf) | \
+ FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0xf) | \
+ FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0xfff))
+
+#define MPAM_IIDR_NVIDIA_T241 (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0x241) | \
+ FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0) | \
+ FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \
+ FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x36b))
+
+#define MPAM_IIDR_ARM_CMN_650 (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0) | \
+ FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0) | \
+ FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \
+ FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x43b))
+
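A minimal sketch of how these masks are used by the quirk table in mpam_devices.c (the helper name here is illustrative; the driver open-codes this comparison in its matching loop): an MSC is affected by a quirk entry when its masked MPAMF_IIDR value equals the entry's iidr.

/* Illustrative sketch only: quirk matching against a probed MPAMF_IIDR. */
static bool example_quirk_matches(u32 msc_iidr, const struct mpam_quirk *quirk)
{
	return quirk->iidr == (msc_iidr & quirk->iidr_mask);
}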
/* The values for MSMON_CFG_MBWU_FLT.RWBW */
enum mon_filter_options {
COUNT_BOTH = 0,
@@ -215,7 +265,11 @@ enum mon_filter_options {
};
struct mon_cfg {
- u16 mon;
+ /*
+ * mon must be large enough to hold out of range values like
+ * USE_PRE_ALLOCATED
+ */
+ u32 mon;
u8 pmg;
bool match_pmg;
bool csu_exclude_clean;
@@ -246,6 +300,7 @@ struct mpam_class {
struct mpam_props props;
u32 nrdy_usec;
+ u16 quirks;
u8 level;
enum mpam_class_types type;
@@ -266,10 +321,6 @@ struct mpam_config {
u32 mbw_pbm;
u16 mbw_max;
- bool reset_cpbm;
- bool reset_mbw_pbm;
- bool reset_mbw_max;
-
struct mpam_garbage garbage;
};
@@ -337,6 +388,32 @@ struct mpam_msc_ris {
struct mpam_garbage garbage;
};
+struct mpam_resctrl_dom {
+ struct mpam_component *ctrl_comp;
+
+ /*
+ * There is no single mon_comp because different events may be backed
+ * by different class/components. mon_comp is indexed by the event
+ * number.
+ */
+ struct mpam_component *mon_comp[QOS_NUM_EVENTS];
+
+ struct rdt_ctrl_domain resctrl_ctrl_dom;
+ struct rdt_l3_mon_domain resctrl_mon_dom;
+};
+
+struct mpam_resctrl_res {
+ struct mpam_class *class;
+ struct rdt_resource resctrl_res;
+ bool cdp_enabled;
+};
+
+struct mpam_resctrl_mon {
+ struct mpam_class *class;
+
+ /* per-class data that resctrl needs will live here */
+};
+
static inline int mpam_alloc_csu_mon(struct mpam_class *class)
{
struct mpam_props *cprops = &class->props;
@@ -381,6 +458,9 @@ extern u8 mpam_pmg_max;
void mpam_enable(struct work_struct *work);
void mpam_disable(struct work_struct *work);
+/* Reset all the RIS in a class under cpus_read_lock() */
+void mpam_reset_class_locked(struct mpam_class *class);
+
int mpam_apply_config(struct mpam_component *comp, u16 partid,
struct mpam_config *cfg);
@@ -391,6 +471,20 @@ void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx);
int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level,
cpumask_t *affinity);
+#ifdef CONFIG_RESCTRL_FS
+int mpam_resctrl_setup(void);
+void mpam_resctrl_exit(void);
+int mpam_resctrl_online_cpu(unsigned int cpu);
+void mpam_resctrl_offline_cpu(unsigned int cpu);
+void mpam_resctrl_teardown_class(struct mpam_class *class);
+#else
+static inline int mpam_resctrl_setup(void) { return 0; }
+static inline void mpam_resctrl_exit(void) { }
+static inline int mpam_resctrl_online_cpu(unsigned int cpu) { return 0; }
+static inline void mpam_resctrl_offline_cpu(unsigned int cpu) { }
+static inline void mpam_resctrl_teardown_class(struct mpam_class *class) { }
+#endif /* CONFIG_RESCTRL_FS */
+
/*
* MPAM MSCs have the following register layout. See:
* Arm Memory System Resource Partitioning and Monitoring (MPAM) System
diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c
new file mode 100644
index 000000000000..a9938006d0e6
--- /dev/null
+++ b/drivers/resctrl/mpam_resctrl.c
@@ -0,0 +1,1704 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2025 Arm Ltd.
+
+#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
+
+#include <linux/arm_mpam.h>
+#include <linux/cacheinfo.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/errno.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/math.h>
+#include <linux/printk.h>
+#include <linux/rculist.h>
+#include <linux/resctrl.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+#include <asm/mpam.h>
+
+#include "mpam_internal.h"
+
+DECLARE_WAIT_QUEUE_HEAD(resctrl_mon_ctx_waiters);
+
+/*
+ * The classes we've picked to map to resctrl resources, each wrapped
+ * together with its resctrl structure.
+ * Class pointer may be NULL.
+ */
+static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES];
+
+#define for_each_mpam_resctrl_control(res, rid) \
+ for (rid = 0, res = &mpam_resctrl_controls[rid]; \
+ rid < RDT_NUM_RESOURCES; \
+ rid++, res = &mpam_resctrl_controls[rid])
+
+/*
+ * The classes we've picked to map to resctrl events.
+ * Resctrl believes all the world's a Xeon, and these are all on the L3. This
+ * array lets us find the actual class backing the event counters. e.g.
+ * the only memory bandwidth counters may be on the memory controller, but to
+ * make use of them, we pretend they are on L3. Restrict the events considered
+ * to those supported by MPAM.
+ * Class pointer may be NULL.
+ */
+#define MPAM_MAX_EVENT QOS_L3_MBM_TOTAL_EVENT_ID
+static struct mpam_resctrl_mon mpam_resctrl_counters[MPAM_MAX_EVENT + 1];
+
+#define for_each_mpam_resctrl_mon(mon, eventid) \
+ for (eventid = QOS_FIRST_EVENT, mon = &mpam_resctrl_counters[eventid]; \
+ eventid <= MPAM_MAX_EVENT; \
+ eventid++, mon = &mpam_resctrl_counters[eventid])
+
+/* The lock for modifying resctrl's domain lists from cpuhp callbacks. */
+static DEFINE_MUTEX(domain_list_lock);
+
+/*
+ * MPAM emulates CDP by setting different PARTIDs in the I and D fields of MPAM0_EL1.
+ * This applies globally to all traffic the CPU generates.
+ */
+static bool cdp_enabled;
+
+/*
+ * We use cacheinfo to discover the size of the caches and their id. cacheinfo
+ * populates this from a device_initcall(). mpam_resctrl_setup() must wait.
+ */
+static bool cacheinfo_ready;
+static DECLARE_WAIT_QUEUE_HEAD(wait_cacheinfo_ready);
+
+/*
+ * If resctrl_init() succeeded, resctrl_exit() can be used to remove support
+ * for the filesystem in the event of an error.
+ */
+static bool resctrl_enabled;
+
+bool resctrl_arch_alloc_capable(void)
+{
+ struct mpam_resctrl_res *res;
+ enum resctrl_res_level rid;
+
+ for_each_mpam_resctrl_control(res, rid) {
+ if (res->resctrl_res.alloc_capable)
+ return true;
+ }
+
+ return false;
+}
+
+bool resctrl_arch_mon_capable(void)
+{
+ struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
+ struct rdt_resource *l3 = &res->resctrl_res;
+
+ /* All monitors are presented as being on the L3 cache */
+ return l3->mon_capable;
+}
+
+bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
+{
+ return false;
+}
+
+void resctrl_arch_mon_event_config_read(void *info)
+{
+}
+
+void resctrl_arch_mon_event_config_write(void *info)
+{
+}
+
+void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d)
+{
+}
+
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
+ u32 closid, u32 rmid, enum resctrl_event_id eventid)
+{
+}
+
+void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
+ u32 closid, u32 rmid, int cntr_id,
+ enum resctrl_event_id eventid)
+{
+}
+
+void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
+ enum resctrl_event_id evtid, u32 rmid, u32 closid,
+ u32 cntr_id, bool assign)
+{
+}
+
+int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
+ u32 unused, u32 rmid, int cntr_id,
+ enum resctrl_event_id eventid, u64 *val)
+{
+ return -EOPNOTSUPP;
+}
+
+bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r)
+{
+ return false;
+}
+
+int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)
+{
+ return -EINVAL;
+}
+
+int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable)
+{
+ return -EOPNOTSUPP;
+}
+
+bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r)
+{
+ return false;
+}
+
+void resctrl_arch_pre_mount(void)
+{
+}
+
+bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid)
+{
+ return mpam_resctrl_controls[rid].cdp_enabled;
+}
+
+/**
+ * resctrl_reset_task_closids() - Reset the PARTID/PMG values for all tasks.
+ *
+ * At boot, all existing tasks use partid zero for D and I.
+ * To enable/disable CDP emulation, all these tasks need relabelling.
+ */
+static void resctrl_reset_task_closids(void)
+{
+ struct task_struct *p, *t;
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(p, t) {
+ resctrl_arch_set_closid_rmid(t, RESCTRL_RESERVED_CLOSID,
+ RESCTRL_RESERVED_RMID);
+ }
+ read_unlock(&tasklist_lock);
+}
+
+int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable)
+{
+ u32 partid_i = RESCTRL_RESERVED_CLOSID, partid_d = RESCTRL_RESERVED_CLOSID;
+ struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
+ struct rdt_resource *l3 = &res->resctrl_res;
+ int cpu;
+
+ if (!IS_ENABLED(CONFIG_EXPERT) && enable) {
+ /*
+ * If the resctrl fs is mounted more than once, sequentially,
+ * then CDP can lead to the use of out of range PARTIDs.
+ */
+ pr_warn("CDP not supported\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (enable)
+ pr_warn("CDP is an expert feature and may cause MPAM to malfunction.\n");
+
+ /*
+ * resctrl_arch_set_cdp_enabled() is only called with enable set to
+ * false on error and unmount.
+ */
+ cdp_enabled = enable;
+ mpam_resctrl_controls[rid].cdp_enabled = enable;
+
+ if (enable)
+ l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx() / 2;
+ else
+ l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx();
+
+ /* The mbw_max feature can't hide cdp as it's a per-partid maximum. */
+ if (cdp_enabled && !mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled)
+ mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = false;
+
+ if (mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled &&
+ mpam_resctrl_controls[RDT_RESOURCE_MBA].class)
+ mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = true;
+
+ if (enable) {
+ if (mpam_partid_max < 1)
+ return -EINVAL;
+
+ partid_d = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_DATA);
+ partid_i = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_CODE);
+ }
+
+ mpam_set_task_partid_pmg(current, partid_d, partid_i, 0, 0);
+ WRITE_ONCE(arm64_mpam_global_default, mpam_get_regval(current));
+
+ resctrl_reset_task_closids();
+
+ for_each_possible_cpu(cpu)
+ mpam_set_cpu_defaults(cpu, partid_d, partid_i, 0, 0);
+ on_each_cpu(resctrl_arch_sync_cpu_closid_rmid, NULL, 1);
+
+ return 0;
+}
+
+static bool mpam_resctrl_hide_cdp(enum resctrl_res_level rid)
+{
+ return cdp_enabled && !resctrl_arch_get_cdp_enabled(rid);
+}
+
+/*
+ * An MSC may raise an error interrupt if it sees an out of range partid/pmg,
+ * and go on to truncate the value. Regardless of what the hardware supports,
+ * only the system-wide safe value is safe to use.
+ */
+u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored)
+{
+ return mpam_partid_max + 1;
+}
+
+u32 resctrl_arch_system_num_rmid_idx(void)
+{
+ return (mpam_pmg_max + 1) * (mpam_partid_max + 1);
+}
+
+u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid)
+{
+ return closid * (mpam_pmg_max + 1) + rmid;
+}
+
+void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid)
+{
+ *closid = idx / (mpam_pmg_max + 1);
+ *rmid = idx % (mpam_pmg_max + 1);
+}
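A worked example of the index scheme (the mpam_pmg_max value is assumed for illustration): with mpam_pmg_max = 1 there are two PMG values per PARTID, so closid 3 / rmid 1 encodes to 3 * 2 + 1 = 7, and decoding 7 recovers closid = 7 / 2 = 3 and rmid = 7 % 2 = 1. resctrl_arch_system_num_rmid_idx() is simply the number of such (closid, rmid) pairs.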
+
+void resctrl_arch_sched_in(struct task_struct *tsk)
+{
+ lockdep_assert_preemption_disabled();
+
+ mpam_thread_switch(tsk);
+}
+
+void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid)
+{
+ WARN_ON_ONCE(closid > U16_MAX);
+ WARN_ON_ONCE(rmid > U8_MAX);
+
+ if (!cdp_enabled) {
+ mpam_set_cpu_defaults(cpu, closid, closid, rmid, rmid);
+ } else {
+ /*
+ * When CDP is enabled, resctrl halves the closid range and we
+ * use odd/even partid for one closid.
+ */
+ u32 partid_d = resctrl_get_config_index(closid, CDP_DATA);
+ u32 partid_i = resctrl_get_config_index(closid, CDP_CODE);
+
+ mpam_set_cpu_defaults(cpu, partid_d, partid_i, rmid, rmid);
+ }
+}
+
+void resctrl_arch_sync_cpu_closid_rmid(void *info)
+{
+ struct resctrl_cpu_defaults *r = info;
+
+ lockdep_assert_preemption_disabled();
+
+ if (r) {
+ resctrl_arch_set_cpu_default_closid_rmid(smp_processor_id(),
+ r->closid, r->rmid);
+ }
+
+ resctrl_arch_sched_in(current);
+}
+
+void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid)
+{
+ WARN_ON_ONCE(closid > U16_MAX);
+ WARN_ON_ONCE(rmid > U8_MAX);
+
+ if (!cdp_enabled) {
+ mpam_set_task_partid_pmg(tsk, closid, closid, rmid, rmid);
+ } else {
+ u32 partid_d = resctrl_get_config_index(closid, CDP_DATA);
+ u32 partid_i = resctrl_get_config_index(closid, CDP_CODE);
+
+ mpam_set_task_partid_pmg(tsk, partid_d, partid_i, rmid, rmid);
+ }
+}
+
+bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid)
+{
+ u64 regval = mpam_get_regval(tsk);
+ u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval);
+
+ if (cdp_enabled)
+ tsk_closid >>= 1;
+
+ return tsk_closid == closid;
+}
+
+/* The task's pmg is not unique, the partid must be considered too */
+bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid)
+{
+ u64 regval = mpam_get_regval(tsk);
+ u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval);
+ u32 tsk_rmid = FIELD_GET(MPAM0_EL1_PMG_D, regval);
+
+ if (cdp_enabled)
+ tsk_closid >>= 1;
+
+ return (tsk_closid == closid) && (tsk_rmid == rmid);
+}
+
+struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
+{
+ if (l >= RDT_NUM_RESOURCES)
+ return NULL;
+
+ return &mpam_resctrl_controls[l].resctrl_res;
+}
+
+static int resctrl_arch_mon_ctx_alloc_no_wait(enum resctrl_event_id evtid)
+{
+ struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid];
+
+ if (!mpam_is_enabled())
+ return -EINVAL;
+
+ if (!mon->class)
+ return -EINVAL;
+
+ switch (evtid) {
+ case QOS_L3_OCCUP_EVENT_ID:
+ /* With CDP, one monitor gets used for both code/data reads */
+ return mpam_alloc_csu_mon(mon->class);
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return USE_PRE_ALLOCATED;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r,
+ enum resctrl_event_id evtid)
+{
+ DEFINE_WAIT(wait);
+ int *ret;
+
+ ret = kmalloc_obj(*ret);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+
+ do {
+ prepare_to_wait(&resctrl_mon_ctx_waiters, &wait,
+ TASK_INTERRUPTIBLE);
+ *ret = resctrl_arch_mon_ctx_alloc_no_wait(evtid);
+ if (*ret == -ENOSPC)
+ schedule();
+ } while (*ret == -ENOSPC && !signal_pending(current));
+ finish_wait(&resctrl_mon_ctx_waiters, &wait);
+
+ return ret;
+}
+
+static void resctrl_arch_mon_ctx_free_no_wait(enum resctrl_event_id evtid,
+ u32 mon_idx)
+{
+ struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid];
+
+ if (!mpam_is_enabled())
+ return;
+
+ if (!mon->class)
+ return;
+
+ if (evtid == QOS_L3_OCCUP_EVENT_ID)
+ mpam_free_csu_mon(mon->class, mon_idx);
+
+ wake_up(&resctrl_mon_ctx_waiters);
+}
+
+void resctrl_arch_mon_ctx_free(struct rdt_resource *r,
+ enum resctrl_event_id evtid, void *arch_mon_ctx)
+{
+ u32 mon_idx = *(u32 *)arch_mon_ctx;
+
+ kfree(arch_mon_ctx);
+
+ resctrl_arch_mon_ctx_free_no_wait(evtid, mon_idx);
+}
+
+static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp,
+ enum mpam_device_features mon_type,
+ int mon_idx,
+ enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, u64 *val)
+{
+ struct mon_cfg cfg;
+
+ if (!mpam_is_enabled())
+ return -EINVAL;
+
+ /* Shift closid to account for CDP */
+ closid = resctrl_get_config_index(closid, cdp_type);
+
+ if (irqs_disabled()) {
+ /* Check if we can access this domain without an IPI */
+ return -EIO;
+ }
+
+ cfg = (struct mon_cfg) {
+ .mon = mon_idx,
+ .match_pmg = true,
+ .partid = closid,
+ .pmg = rmid,
+ };
+
+ return mpam_msmon_read(mon_comp, &cfg, mon_type, val);
+}
+
+static int read_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp,
+ enum mpam_device_features mon_type,
+ int mon_idx, u32 closid, u32 rmid, u64 *val)
+{
+ if (cdp_enabled) {
+ u64 code_val = 0, data_val = 0;
+ int err;
+
+ err = __read_mon(mon, mon_comp, mon_type, mon_idx,
+ CDP_CODE, closid, rmid, &code_val);
+ if (err)
+ return err;
+
+ err = __read_mon(mon, mon_comp, mon_type, mon_idx,
+ CDP_DATA, closid, rmid, &data_val);
+ if (err)
+ return err;
+
+ *val += code_val + data_val;
+ return 0;
+ }
+
+ return __read_mon(mon, mon_comp, mon_type, mon_idx,
+ CDP_NONE, closid, rmid, val);
+}
+
+/* MBWU when not in ABMC mode (not supported), and CSU counters. */
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr,
+ u32 closid, u32 rmid, enum resctrl_event_id eventid,
+ void *arch_priv, u64 *val, void *arch_mon_ctx)
+{
+ struct mpam_resctrl_dom *l3_dom;
+ struct mpam_component *mon_comp;
+ u32 mon_idx = *(u32 *)arch_mon_ctx;
+ enum mpam_device_features mon_type;
+ struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid];
+
+ resctrl_arch_rmid_read_context_check();
+
+ if (!mpam_is_enabled())
+ return -EINVAL;
+
+ if (eventid >= QOS_NUM_EVENTS || !mon->class)
+ return -EINVAL;
+
+ l3_dom = container_of(hdr, struct mpam_resctrl_dom, resctrl_mon_dom.hdr);
+ mon_comp = l3_dom->mon_comp[eventid];
+
+ if (eventid != QOS_L3_OCCUP_EVENT_ID)
+ return -EINVAL;
+
+ mon_type = mpam_feat_msmon_csu;
+
+ return read_mon_cdp_safe(mon, mon_comp, mon_type, mon_idx,
+ closid, rmid, val);
+}
+
+/*
+ * The rmid realloc threshold should be for the smallest cache exposed to
+ * resctrl.
+ */
+static int update_rmid_limits(struct mpam_class *class)
+{
+ u32 num_unique_pmg = resctrl_arch_system_num_rmid_idx();
+ struct mpam_props *cprops = &class->props;
+ struct cacheinfo *ci;
+
+ lockdep_assert_cpus_held();
+
+ if (!mpam_has_feature(mpam_feat_msmon_csu, cprops))
+ return 0;
+
+ /*
+ * Assume cache levels are the same size for all CPUs...
+ * The check just requires any online CPU and it can't go offline as we
+ * hold the cpu lock.
+ */
+ ci = get_cpu_cacheinfo_level(raw_smp_processor_id(), class->level);
+ if (!ci || ci->size == 0) {
+ pr_debug("Could not read cache size for class %u\n",
+ class->level);
+ return -EINVAL;
+ }
+
+ if (!resctrl_rmid_realloc_limit ||
+ ci->size < resctrl_rmid_realloc_limit) {
+ resctrl_rmid_realloc_limit = ci->size;
+ resctrl_rmid_realloc_threshold = ci->size / num_unique_pmg;
+ }
+
+ return 0;
+}
+
+static bool cache_has_usable_cpor(struct mpam_class *class)
+{
+ struct mpam_props *cprops = &class->props;
+
+ if (!mpam_has_feature(mpam_feat_cpor_part, cprops))
+ return false;
+
+ /* resctrl uses u32 for all bitmap configurations */
+ return class->props.cpbm_wd <= 32;
+}
+
+static bool mba_class_use_mbw_max(struct mpam_props *cprops)
+{
+ return (mpam_has_feature(mpam_feat_mbw_max, cprops) &&
+ cprops->bwa_wd);
+}
+
+static bool class_has_usable_mba(struct mpam_props *cprops)
+{
+ return mba_class_use_mbw_max(cprops);
+}
+
+static bool cache_has_usable_csu(struct mpam_class *class)
+{
+ struct mpam_props *cprops;
+
+ if (!class)
+ return false;
+
+ cprops = &class->props;
+
+ if (!mpam_has_feature(mpam_feat_msmon_csu, cprops))
+ return false;
+
+ /*
+ * CSU counters settle on the value, so we can get away with
+ * having only one.
+ */
+ if (!cprops->num_csu_mon)
+ return false;
+
+ return true;
+}
+
+/*
+ * Calculate the worst-case percentage change from each implemented step
+ * in the control.
+ */
+static u32 get_mba_granularity(struct mpam_props *cprops)
+{
+ if (!mba_class_use_mbw_max(cprops))
+ return 0;
+
+ /*
+ * bwa_wd is the number of bits implemented in the 0.xxx
+ * fixed point fraction. 1 bit is 50%, 2 is 25% etc.
+ */
+ return DIV_ROUND_UP(MAX_MBA_BW, 1 << cprops->bwa_wd);
+}
+
+/*
+ * Each fixed-point hardware value architecturally represents a range
+ * of values: the full range 0% - 100% is split contiguously into
+ * (1 << cprops->bwa_wd) equal bands.
+ *
+ * Although the bwa_wd field has 6 bits, the maximum valid value is 16
+ * as it reports the width of fields that are at most 16 bits. When
+ * fewer than 16 bits are valid the least significant bits are
+ * ignored. The implied binary point is kept between bits 15 and 16 and
+ * so the valid bits are leftmost.
+ *
+ * See ARM IHI0099B.a "MPAM system component specification", Section 9.3,
+ * "The fixed-point fractional format" for more information.
+ *
+ * Find the nearest percentage value to the upper bound of the selected band:
+ */
+static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops)
+{
+ u32 val = mbw_max;
+
+ val >>= 16 - cprops->bwa_wd;
+ val += 1;
+ val *= MAX_MBA_BW;
+ val = DIV_ROUND_CLOSEST(val, 1 << cprops->bwa_wd);
+
+ return val;
+}
+
+/*
+ * Find the band whose upper bound is closest to the specified percentage.
+ *
+ * A round-to-nearest policy is followed here as a balanced compromise
+ * between unexpected under-commit of the resource (where the total of
+ * a set of resource allocations after conversion is less than the
+ * expected total, due to rounding of the individual converted
+ * percentages) and over-commit (where the total of the converted
+ * allocations is greater than expected).
+ */
+static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops)
+{
+ u32 val = pc;
+
+ val <<= cprops->bwa_wd;
+ val = DIV_ROUND_CLOSEST(val, MAX_MBA_BW);
+ val = max(val, 1) - 1;
+ val <<= 16 - cprops->bwa_wd;
+
+ return val;
+}
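A worked round trip with an assumed bwa_wd of 2 (four bands of 25% each, and assuming MAX_MBA_BW is 100): percent_to_mbw_max(75, ...) computes 75 << 2 = 300, DIV_ROUND_CLOSEST(300, 100) = 3, max(3, 1) - 1 = 2, then 2 << 14 = 0x8000. Feeding 0x8000 back through mbw_max_to_percent() gives 0x8000 >> 14 = 2, plus 1 is 3, times 100 is 300, and DIV_ROUND_CLOSEST(300, 4) = 75 - the upper bound of the selected band, as intended.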
+
+static u32 get_mba_min(struct mpam_props *cprops)
+{
+ if (!mba_class_use_mbw_max(cprops)) {
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ return mbw_max_to_percent(0, cprops);
+}
+
+/* Find the L3 cache that has affinity with this CPU */
+static int find_l3_equivalent_bitmask(int cpu, cpumask_var_t tmp_cpumask)
+{
+ u32 cache_id = get_cpu_cacheinfo_id(cpu, 3);
+
+ lockdep_assert_cpus_held();
+
+ return mpam_get_cpumask_from_cache_id(cache_id, 3, tmp_cpumask);
+}
+
+/*
+ * topology_matches_l3() - Is the provided class the same shape as L3
+ * @victim: The class we'd like to pretend is L3.
+ *
+ * resctrl expects all the world's a Xeon, and all counters are on the
+ * L3. We allow mapping some counters onto other classes. This requires
+ * that the CPU->domain mapping is the same kind of shape.
+ *
+ * Using cacheinfo directly would make this work even if resctrl can't
+ * use the L3 - but cacheinfo can't tell us anything about offline CPUs.
+ * Using the L3 resctrl domain list also depends on CPUs being online.
+ * Using the mpam_class we picked for L3 so we can use its domain list
+ * assumes that there are MPAM controls on the L3.
+ * Instead, this path eventually uses the mpam_get_cpumask_from_cache_id()
+ * helper which can tell us about offline CPUs ... but getting the cache_id
+ * to start with relies on at least one CPU per L3 cache being online at
+ * boot.
+ *
+ * Walk the victim component list and compare the affinity mask with the
+ * corresponding L3. The topology matches if each victim:component's affinity
+ * mask is the same as the CPU's corresponding L3's. These lists/masks are
+ * computed from firmware tables so don't change at runtime.
+ */
+static bool topology_matches_l3(struct mpam_class *victim)
+{
+ int cpu, err;
+ struct mpam_component *victim_iter;
+
+ lockdep_assert_cpus_held();
+
+ cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL;
+ if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL))
+ return false;
+
+ guard(srcu)(&mpam_srcu);
+ list_for_each_entry_srcu(victim_iter, &victim->components, class_list,
+ srcu_read_lock_held(&mpam_srcu)) {
+ if (cpumask_empty(&victim_iter->affinity)) {
+ pr_debug("class %u has CPU-less component %u - can't match L3!\n",
+ victim->level, victim_iter->comp_id);
+ return false;
+ }
+
+ cpu = cpumask_any_and(&victim_iter->affinity, cpu_online_mask);
+ if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+ return false;
+
+ cpumask_clear(tmp_cpumask);
+ err = find_l3_equivalent_bitmask(cpu, tmp_cpumask);
+ if (err) {
+ pr_debug("Failed to find L3's equivalent component to class %u component %u\n",
+ victim->level, victim_iter->comp_id);
+ return false;
+ }
+
+ /* Any differing bits in the affinity mask? */
+ if (!cpumask_equal(tmp_cpumask, &victim_iter->affinity)) {
+ pr_debug("class %u component %u has Mismatched CPU mask with L3 equivalent\n"
+ "L3:%*pbl != victim:%*pbl\n",
+ victim->level, victim_iter->comp_id,
+ cpumask_pr_args(tmp_cpumask),
+ cpumask_pr_args(&victim_iter->affinity));
+
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Test if the traffic for a class matches that at egress from the L3. For
+ * MSC at memory controllers this is only possible if there is a single L3
+ * as otherwise the counters at the memory can include bandwidth from the
+ * non-local L3.
+ */
+static bool traffic_matches_l3(struct mpam_class *class)
+{
+ int err, cpu;
+
+ lockdep_assert_cpus_held();
+
+ if (class->type == MPAM_CLASS_CACHE && class->level == 3)
+ return true;
+
+ if (class->type == MPAM_CLASS_CACHE && class->level != 3) {
+ pr_debug("class %u is a different cache from L3\n", class->level);
+ return false;
+ }
+
+ if (class->type != MPAM_CLASS_MEMORY) {
+ pr_debug("class %u is neither of type cache or memory\n", class->level);
+ return false;
+ }
+
+ cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL;
+ if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) {
+ pr_debug("cpumask allocation failed\n");
+ return false;
+ }
+
+ cpu = cpumask_any_and(&class->affinity, cpu_online_mask);
+ err = find_l3_equivalent_bitmask(cpu, tmp_cpumask);
+ if (err) {
+ pr_debug("Failed to find L3 downstream to cpu %d\n", cpu);
+ return false;
+ }
+
+ if (!cpumask_equal(tmp_cpumask, cpu_possible_mask)) {
+ pr_debug("There is more than one L3\n");
+ return false;
+ }
+
+ /* Be strict; the traffic might stop in the intermediate cache. */
+ if (get_cpu_cacheinfo_id(cpu, 4) != -1) {
+ pr_debug("L3 isn't the last level of cache\n");
+ return false;
+ }
+
+ if (num_possible_nodes() > 1) {
+ pr_debug("There is more than one numa node\n");
+ return false;
+ }
+
+#ifdef CONFIG_HMEM_REPORTING
+ if (node_devices[cpu_to_node(cpu)]->cache_dev) {
+ pr_debug("There is a memory side cache\n");
+ return false;
+ }
+#endif
+
+ return true;
+}
+
+/* Test whether we can export MPAM_CLASS_CACHE:{2,3}? */
+static void mpam_resctrl_pick_caches(void)
+{
+ struct mpam_class *class;
+ struct mpam_resctrl_res *res;
+
+ lockdep_assert_cpus_held();
+
+ guard(srcu)(&mpam_srcu);
+ list_for_each_entry_srcu(class, &mpam_classes, classes_list,
+ srcu_read_lock_held(&mpam_srcu)) {
+ if (class->type != MPAM_CLASS_CACHE) {
+ pr_debug("class %u is not a cache\n", class->level);
+ continue;
+ }
+
+ if (class->level != 2 && class->level != 3) {
+ pr_debug("class %u is not L2 or L3\n", class->level);
+ continue;
+ }
+
+ if (!cache_has_usable_cpor(class)) {
+ pr_debug("class %u cache misses CPOR\n", class->level);
+ continue;
+ }
+
+ if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
+ pr_debug("class %u has missing CPUs, mask %*pb != %*pb\n", class->level,
+ cpumask_pr_args(&class->affinity),
+ cpumask_pr_args(cpu_possible_mask));
+ continue;
+ }
+
+ if (class->level == 2)
+ res = &mpam_resctrl_controls[RDT_RESOURCE_L2];
+ else
+ res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
+ res->class = class;
+ }
+}
+
+static void mpam_resctrl_pick_mba(void)
+{
+ struct mpam_class *class, *candidate_class = NULL;
+ struct mpam_resctrl_res *res;
+
+ lockdep_assert_cpus_held();
+
+ guard(srcu)(&mpam_srcu);
+ list_for_each_entry_srcu(class, &mpam_classes, classes_list,
+ srcu_read_lock_held(&mpam_srcu)) {
+ struct mpam_props *cprops = &class->props;
+
+ if (class->level != 3 && class->type == MPAM_CLASS_CACHE) {
+ pr_debug("class %u is a cache but not the L3\n", class->level);
+ continue;
+ }
+
+ if (!class_has_usable_mba(cprops)) {
+ pr_debug("class %u has no bandwidth control\n",
+ class->level);
+ continue;
+ }
+
+ if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
+ pr_debug("class %u has missing CPUs\n", class->level);
+ continue;
+ }
+
+ if (!topology_matches_l3(class)) {
+ pr_debug("class %u topology doesn't match L3\n",
+ class->level);
+ continue;
+ }
+
+ if (!traffic_matches_l3(class)) {
+ pr_debug("class %u traffic doesn't match L3 egress\n",
+ class->level);
+ continue;
+ }
+
+ /*
+ * Pick a resource to be MBA that is as close as possible to
+ * the L3. mbm_total counts the bandwidth leaving the L3
+ * cache, and MBA should correspond as closely as possible
+ * for proper operation of mba_sc.
+ */
+ if (!candidate_class || class->level < candidate_class->level)
+ candidate_class = class;
+ }
+
+ if (candidate_class) {
+ pr_debug("selected class %u to back MBA\n",
+ candidate_class->level);
+ res = &mpam_resctrl_controls[RDT_RESOURCE_MBA];
+ res->class = candidate_class;
+ }
+}
+
+static void counter_update_class(enum resctrl_event_id evt_id,
+ struct mpam_class *class)
+{
+ struct mpam_class *existing_class = mpam_resctrl_counters[evt_id].class;
+
+ if (existing_class) {
+ if (existing_class->level == 3) {
+ pr_debug("Existing class is L3 - L3 wins\n");
+ return;
+ }
+
+ if (existing_class->level < class->level) {
+ pr_debug("Existing class is closer to L3, %u versus %u - closer is better\n",
+ existing_class->level, class->level);
+ return;
+ }
+ }
+
+ mpam_resctrl_counters[evt_id].class = class;
+}
+
+static void mpam_resctrl_pick_counters(void)
+{
+ struct mpam_class *class;
+
+ lockdep_assert_cpus_held();
+
+ guard(srcu)(&mpam_srcu);
+ list_for_each_entry_srcu(class, &mpam_classes, classes_list,
+ srcu_read_lock_held(&mpam_srcu)) {
+ /* The name of the resource is L3... */
+ if (class->type == MPAM_CLASS_CACHE && class->level != 3) {
+ pr_debug("class %u is a cache but not the L3", class->level);
+ continue;
+ }
+
+ if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
+ pr_debug("class %u does not cover all CPUs",
+ class->level);
+ continue;
+ }
+
+ if (cache_has_usable_csu(class)) {
+ pr_debug("class %u has usable CSU",
+ class->level);
+
+ /* CSU counters only make sense on a cache. */
+ switch (class->type) {
+ case MPAM_CLASS_CACHE:
+ if (update_rmid_limits(class))
+ break;
+
+ counter_update_class(QOS_L3_OCCUP_EVENT_ID, class);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+}
+
+static int mpam_resctrl_control_init(struct mpam_resctrl_res *res)
+{
+ struct mpam_class *class = res->class;
+ struct mpam_props *cprops = &class->props;
+ struct rdt_resource *r = &res->resctrl_res;
+
+ switch (r->rid) {
+ case RDT_RESOURCE_L2:
+ case RDT_RESOURCE_L3:
+ r->schema_fmt = RESCTRL_SCHEMA_BITMAP;
+ r->cache.arch_has_sparse_bitmasks = true;
+
+ r->cache.cbm_len = class->props.cpbm_wd;
+ /* mpam_devices will reject empty bitmaps */
+ r->cache.min_cbm_bits = 1;
+
+ if (r->rid == RDT_RESOURCE_L2) {
+ r->name = "L2";
+ r->ctrl_scope = RESCTRL_L2_CACHE;
+ r->cdp_capable = true;
+ } else {
+ r->name = "L3";
+ r->ctrl_scope = RESCTRL_L3_CACHE;
+ r->cdp_capable = true;
+ }
+
+ /*
+ * Which bits are shared with other ...things... Unknown
+ * devices use partid-0, which uses all the bitmap fields.
+ * Until we have configured the SMMU and GIC not to do this,
+ * 'all the bits' is the correct answer here.
+ */
+ r->cache.shareable_bits = resctrl_get_default_ctrl(r);
+ r->alloc_capable = true;
+ break;
+ case RDT_RESOURCE_MBA:
+ r->schema_fmt = RESCTRL_SCHEMA_RANGE;
+ r->ctrl_scope = RESCTRL_L3_CACHE;
+
+ r->membw.delay_linear = true;
+ r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
+ r->membw.min_bw = get_mba_min(cprops);
+ r->membw.max_bw = MAX_MBA_BW;
+ r->membw.bw_gran = get_mba_granularity(cprops);
+
+ r->name = "MB";
+ r->alloc_capable = true;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp)
+{
+ struct mpam_class *class = comp->class;
+
+ if (class->type == MPAM_CLASS_CACHE)
+ return comp->comp_id;
+
+ if (topology_matches_l3(class)) {
+ /* Use the corresponding L3 component ID as the domain ID */
+ int id = get_cpu_cacheinfo_id(cpu, 3);
+
+ /* Implies topology_matches_l3() made a mistake */
+ if (WARN_ON_ONCE(id == -1))
+ return comp->comp_id;
+
+ return id;
+ }
+
+ /* Otherwise, expose the ID used by the firmware table code. */
+ return comp->comp_id;
+}
+
+static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon,
+ enum resctrl_event_id type)
+{
+ struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
+ struct rdt_resource *l3 = &res->resctrl_res;
+
+ lockdep_assert_cpus_held();
+
+ /*
+ * There also needs to be an L3 cache present.
+ * The check just needs any online CPU, and that CPU can't go offline
+ * while we hold the cpus lock.
+ */
+ if (get_cpu_cacheinfo_id(raw_smp_processor_id(), 3) == -1)
+ return 0;
+
+ /*
+ * If there are no MPAM resources on L3, force it into existence.
+ * topology_matches_l3() already ensures this looks like the L3.
+ * The domain-ids will be fixed up by mpam_resctrl_domain_hdr_init().
+ */
+ if (!res->class) {
+ pr_warn_once("Faking L3 MSC to enable counters.\n");
+ res->class = mpam_resctrl_counters[type].class;
+ }
+
+ /*
+ * Called multiple times, once per event type that has a
+ * monitoring class.
+ * Setting the name is necessary on monitor-only platforms.
+ */
+ l3->name = "L3";
+ l3->mon_scope = RESCTRL_L3_CACHE;
+
+ /*
+ * num-rmid is the upper bound for the number of monitoring groups that
+ * can exist simultaneously, including the default monitoring group for
+ * each control group. Hence, advertise the whole rmid_idx space even
+ * though each control group has its own pmg/rmid space. Unfortunately,
+ * this does mean userspace needs to know the architecture to correctly
+ * interpret this value.
+ */
+ l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx();
+
+ if (resctrl_enable_mon_event(type, false, 0, NULL))
+ l3->mon_capable = true;
+
+ return 0;
+}
+
+u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+ u32 closid, enum resctrl_conf_type type)
+{
+ u32 partid;
+ struct mpam_config *cfg;
+ struct mpam_props *cprops;
+ struct mpam_resctrl_res *res;
+ struct mpam_resctrl_dom *dom;
+ enum mpam_device_features configured_by;
+
+ lockdep_assert_cpus_held();
+
+ if (!mpam_is_enabled())
+ return resctrl_get_default_ctrl(r);
+
+ res = container_of(r, struct mpam_resctrl_res, resctrl_res);
+ dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom);
+ cprops = &res->class->props;
+
+ /*
+ * When CDP is enabled, but the resource doesn't support it,
+ * the control is cloned across both partids.
+ * Pick one at random to read:
+ */
+ if (mpam_resctrl_hide_cdp(r->rid))
+ type = CDP_DATA;
+
+ partid = resctrl_get_config_index(closid, type);
+ cfg = &dom->ctrl_comp->cfg[partid];
+
+ switch (r->rid) {
+ case RDT_RESOURCE_L2:
+ case RDT_RESOURCE_L3:
+ configured_by = mpam_feat_cpor_part;
+ break;
+ case RDT_RESOURCE_MBA:
+ if (mpam_has_feature(mpam_feat_mbw_max, cprops)) {
+ configured_by = mpam_feat_mbw_max;
+ break;
+ }
+ fallthrough;
+ default:
+ return resctrl_get_default_ctrl(r);
+ }
+
+ if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r) ||
+ !mpam_has_feature(configured_by, cfg))
+ return resctrl_get_default_ctrl(r);
+
+ switch (configured_by) {
+ case mpam_feat_cpor_part:
+ return cfg->cpbm;
+ case mpam_feat_mbw_max:
+ return mbw_max_to_percent(cfg->mbw_max, cprops);
+ default:
+ return resctrl_get_default_ctrl(r);
+ }
+}
+
+int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+ u32 closid, enum resctrl_conf_type t, u32 cfg_val)
+{
+ int err;
+ u32 partid;
+ struct mpam_config cfg;
+ struct mpam_props *cprops;
+ struct mpam_resctrl_res *res;
+ struct mpam_resctrl_dom *dom;
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_irqs_enabled();
+
+ if (!mpam_is_enabled())
+ return -EINVAL;
+
+ /*
+ * No need to check the CPU as mpam_apply_config() doesn't care, and
+ * resctrl_arch_update_domains() relies on this.
+ */
+ res = container_of(r, struct mpam_resctrl_res, resctrl_res);
+ dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom);
+ cprops = &res->class->props;
+
+ if (mpam_resctrl_hide_cdp(r->rid))
+ t = CDP_DATA;
+
+ partid = resctrl_get_config_index(closid, t);
+ if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r)) {
+ pr_debug("Not alloc capable or computed PARTID out of range\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Copy the current config to avoid clearing other resources when the
+ * same component is exposed multiple times through resctrl.
+ */
+ cfg = dom->ctrl_comp->cfg[partid];
+
+ switch (r->rid) {
+ case RDT_RESOURCE_L2:
+ case RDT_RESOURCE_L3:
+ cfg.cpbm = cfg_val;
+ mpam_set_feature(mpam_feat_cpor_part, &cfg);
+ break;
+ case RDT_RESOURCE_MBA:
+ if (mpam_has_feature(mpam_feat_mbw_max, cprops)) {
+ cfg.mbw_max = percent_to_mbw_max(cfg_val, cprops);
+ mpam_set_feature(mpam_feat_mbw_max, &cfg);
+ break;
+ }
+ fallthrough;
+ default:
+ return -EINVAL;
+ }
+
+ /*
+ * When CDP is enabled, but the resource doesn't support it, we need to
+ * apply the same configuration to the other partid.
+ */
+ if (mpam_resctrl_hide_cdp(r->rid)) {
+ partid = resctrl_get_config_index(closid, CDP_CODE);
+ err = mpam_apply_config(dom->ctrl_comp, partid, &cfg);
+ if (err)
+ return err;
+
+ partid = resctrl_get_config_index(closid, CDP_DATA);
+ return mpam_apply_config(dom->ctrl_comp, partid, &cfg);
+ }
+
+ return mpam_apply_config(dom->ctrl_comp, partid, &cfg);
+}
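+
+/*
+ * Illustrative sketch, not part of the interface: with the "MB" schema set
+ * up in mpam_resctrl_control_init(), a schemata write such as "MB:0=50" is
+ * staged by resctrl and applied through resctrl_arch_update_domains() below,
+ * arriving here with cfg_val == 50. percent_to_mbw_max() turns that into a
+ * fixed-point MBW_MAX value, and resctrl_arch_get_config() performs the
+ * reverse conversion via mbw_max_to_percent(), so the value read back is the
+ * rounded percentage.
+ */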
+
+int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
+{
+ int err;
+ struct rdt_ctrl_domain *d;
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_irqs_enabled();
+
+ if (!mpam_is_enabled())
+ return -EINVAL;
+
+ list_for_each_entry_rcu(d, &r->ctrl_domains, hdr.list) {
+ for (enum resctrl_conf_type t = 0; t < CDP_NUM_TYPES; t++) {
+ struct resctrl_staged_config *cfg = &d->staged_config[t];
+
+ if (!cfg->have_new_ctrl)
+ continue;
+
+ err = resctrl_arch_update_one(r, d, closid, t,
+ cfg->new_ctrl);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
+{
+ struct mpam_resctrl_res *res;
+
+ lockdep_assert_cpus_held();
+
+ if (!mpam_is_enabled())
+ return;
+
+ res = container_of(r, struct mpam_resctrl_res, resctrl_res);
+ mpam_reset_class_locked(res->class);
+}
+
+static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp,
+ enum resctrl_res_level rid,
+ struct rdt_domain_hdr *hdr)
+{
+ lockdep_assert_cpus_held();
+
+ INIT_LIST_HEAD(&hdr->list);
+ hdr->id = mpam_resctrl_pick_domain_id(cpu, comp);
+ hdr->rid = rid;
+ cpumask_set_cpu(cpu, &hdr->cpu_mask);
+}
+
+static void mpam_resctrl_online_domain_hdr(unsigned int cpu,
+ struct rdt_domain_hdr *hdr)
+{
+ lockdep_assert_cpus_held();
+
+ cpumask_set_cpu(cpu, &hdr->cpu_mask);
+}
+
+/**
+ * mpam_resctrl_offline_domain_hdr() - Update the domain header to remove a CPU.
+ * @cpu: The CPU to remove from the domain.
+ * @hdr: The domain's header.
+ *
+ * Removes @cpu from the header mask. If this was the last CPU in the domain,
+ * the domain header is removed from its parent list and true is returned,
+ * indicating the parent structure can be freed.
+ * If there are other CPUs in the domain, returns false.
+ */
+static bool mpam_resctrl_offline_domain_hdr(unsigned int cpu,
+ struct rdt_domain_hdr *hdr)
+{
+ lockdep_assert_held(&domain_list_lock);
+
+ cpumask_clear_cpu(cpu, &hdr->cpu_mask);
+ if (cpumask_empty(&hdr->cpu_mask)) {
+ list_del_rcu(&hdr->list);
+ synchronize_rcu();
+ return true;
+ }
+
+ return false;
+}
+
+static void mpam_resctrl_domain_insert(struct list_head *list,
+ struct rdt_domain_hdr *new)
+{
+ struct rdt_domain_hdr *err;
+ struct list_head *pos = NULL;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ err = resctrl_find_domain(list, new->id, &pos);
+ if (WARN_ON_ONCE(err))
+ return;
+
+ list_add_tail_rcu(&new->list, pos);
+}
+
+static struct mpam_component *find_component(struct mpam_class *class, int cpu)
+{
+ struct mpam_component *comp;
+
+ guard(srcu)(&mpam_srcu);
+ list_for_each_entry_srcu(comp, &class->components, class_list,
+ srcu_read_lock_held(&mpam_srcu)) {
+ if (cpumask_test_cpu(cpu, &comp->affinity))
+ return comp;
+ }
+
+ return NULL;
+}
+
+static struct mpam_resctrl_dom *
+mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res)
+{
+ int err;
+ struct mpam_resctrl_dom *dom;
+ struct rdt_l3_mon_domain *mon_d;
+ struct rdt_ctrl_domain *ctrl_d;
+ struct mpam_class *class = res->class;
+ struct mpam_component *comp_iter, *ctrl_comp;
+ struct rdt_resource *r = &res->resctrl_res;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ ctrl_comp = NULL;
+ guard(srcu)(&mpam_srcu);
+ list_for_each_entry_srcu(comp_iter, &class->components, class_list,
+ srcu_read_lock_held(&mpam_srcu)) {
+ if (cpumask_test_cpu(cpu, &comp_iter->affinity)) {
+ ctrl_comp = comp_iter;
+ break;
+ }
+ }
+
+ /* class has no component for this CPU */
+ if (WARN_ON_ONCE(!ctrl_comp))
+ return ERR_PTR(-EINVAL);
+
+ dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, cpu_to_node(cpu));
+ if (!dom)
+ return ERR_PTR(-ENOMEM);
+
+ if (r->alloc_capable) {
+ dom->ctrl_comp = ctrl_comp;
+
+ ctrl_d = &dom->resctrl_ctrl_dom;
+ mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, r->rid, &ctrl_d->hdr);
+ ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN;
+ err = resctrl_online_ctrl_domain(r, ctrl_d);
+ if (err)
+ goto free_domain;
+
+ mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr);
+ } else {
+ pr_debug("Skipped control domain online - no controls\n");
+ }
+
+ if (r->mon_capable) {
+ struct mpam_component *any_mon_comp = NULL;
+ struct mpam_resctrl_mon *mon;
+ enum resctrl_event_id eventid;
+
+ /*
+ * Even if the monitor domain is backed by a different
+ * component, the L3 component IDs need to be used, but
+ * there may be no ctrl_comp for the L3.
+ * Search each event's class list for a component with
+ * overlapping CPUs and set up the dom->mon_comp array.
+ */
+
+ for_each_mpam_resctrl_mon(mon, eventid) {
+ struct mpam_component *mon_comp;
+
+ if (!mon->class)
+ continue; // dummy resource
+
+ mon_comp = find_component(mon->class, cpu);
+ dom->mon_comp[eventid] = mon_comp;
+ if (mon_comp)
+ any_mon_comp = mon_comp;
+ }
+ if (WARN_ON_ONCE(!any_mon_comp)) {
+ err = -EFAULT;
+ goto offline_ctrl_domain;
+ }
+
+ mon_d = &dom->resctrl_mon_dom;
+ mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, r->rid, &mon_d->hdr);
+ mon_d->hdr.type = RESCTRL_MON_DOMAIN;
+ err = resctrl_online_mon_domain(r, &mon_d->hdr);
+ if (err)
+ goto offline_ctrl_domain;
+
+ mpam_resctrl_domain_insert(&r->mon_domains, &mon_d->hdr);
+ } else {
+ pr_debug("Skipped monitor domain online - no monitors\n");
+ }
+
+ return dom;
+
+offline_ctrl_domain:
+ if (r->alloc_capable) {
+ mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr);
+ resctrl_offline_ctrl_domain(r, ctrl_d);
+ }
+free_domain:
+ kfree(dom);
+ dom = ERR_PTR(err);
+
+ return dom;
+}
+
+/*
+ * We know all the monitors are associated with the L3, even if there are no
+ * controls and therefore no control component. Find the cache-id for the CPU
+ * and use that to search for existing resctrl domains.
+ * This relies on mpam_resctrl_pick_domain_id() using the L3 cache-id
+ * for anything that is not a cache.
+ */
+static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu)
+{
+ int cache_id;
+ struct mpam_resctrl_dom *dom;
+ struct mpam_resctrl_res *l3 = &mpam_resctrl_controls[RDT_RESOURCE_L3];
+
+ lockdep_assert_cpus_held();
+
+ if (!l3->class)
+ return NULL;
+ cache_id = get_cpu_cacheinfo_id(cpu, 3);
+ if (cache_id < 0)
+ return NULL;
+
+ list_for_each_entry_rcu(dom, &l3->resctrl_res.mon_domains, resctrl_mon_dom.hdr.list) {
+ if (dom->resctrl_mon_dom.hdr.id == cache_id)
+ return dom;
+ }
+
+ return NULL;
+}
+
+static struct mpam_resctrl_dom *
+mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res)
+{
+ struct mpam_resctrl_dom *dom;
+ struct rdt_resource *r = &res->resctrl_res;
+
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry_rcu(dom, &r->ctrl_domains, resctrl_ctrl_dom.hdr.list) {
+ if (cpumask_test_cpu(cpu, &dom->ctrl_comp->affinity))
+ return dom;
+ }
+
+ if (r->rid != RDT_RESOURCE_L3)
+ return NULL;
+
+ /* Search the mon domain list too - needed on monitor only platforms. */
+ return mpam_resctrl_get_mon_domain_from_cpu(cpu);
+}
+
+int mpam_resctrl_online_cpu(unsigned int cpu)
+{
+ struct mpam_resctrl_res *res;
+ enum resctrl_res_level rid;
+
+ guard(mutex)(&domain_list_lock);
+ for_each_mpam_resctrl_control(res, rid) {
+ struct mpam_resctrl_dom *dom;
+ struct rdt_resource *r = &res->resctrl_res;
+
+ if (!res->class)
+ continue; // dummy_resource;
+
+ dom = mpam_resctrl_get_domain_from_cpu(cpu, res);
+ if (!dom) {
+ dom = mpam_resctrl_alloc_domain(cpu, res);
+ if (IS_ERR(dom))
+ return PTR_ERR(dom);
+ } else {
+ if (r->alloc_capable) {
+ struct rdt_ctrl_domain *ctrl_d = &dom->resctrl_ctrl_dom;
+
+ mpam_resctrl_online_domain_hdr(cpu, &ctrl_d->hdr);
+ }
+ if (r->mon_capable) {
+ struct rdt_l3_mon_domain *mon_d = &dom->resctrl_mon_dom;
+
+ mpam_resctrl_online_domain_hdr(cpu, &mon_d->hdr);
+ }
+ }
+ }
+
+ resctrl_online_cpu(cpu);
+
+ return 0;
+}
+
+void mpam_resctrl_offline_cpu(unsigned int cpu)
+{
+ struct mpam_resctrl_res *res;
+ enum resctrl_res_level rid;
+
+ resctrl_offline_cpu(cpu);
+
+ guard(mutex)(&domain_list_lock);
+ for_each_mpam_resctrl_control(res, rid) {
+ struct mpam_resctrl_dom *dom;
+ struct rdt_l3_mon_domain *mon_d;
+ struct rdt_ctrl_domain *ctrl_d;
+ bool ctrl_dom_empty, mon_dom_empty;
+ struct rdt_resource *r = &res->resctrl_res;
+
+ if (!res->class)
+ continue; // dummy resource
+
+ dom = mpam_resctrl_get_domain_from_cpu(cpu, res);
+ if (WARN_ON_ONCE(!dom))
+ continue;
+
+ if (r->alloc_capable) {
+ ctrl_d = &dom->resctrl_ctrl_dom;
+ ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr);
+ if (ctrl_dom_empty)
+ resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d);
+ } else {
+ ctrl_dom_empty = true;
+ }
+
+ if (r->mon_capable) {
+ mon_d = &dom->resctrl_mon_dom;
+ mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr);
+ if (mon_dom_empty)
+ resctrl_offline_mon_domain(&res->resctrl_res, &mon_d->hdr);
+ } else {
+ mon_dom_empty = true;
+ }
+
+ if (ctrl_dom_empty && mon_dom_empty)
+ kfree(dom);
+ }
+}
+
+int mpam_resctrl_setup(void)
+{
+ int err = 0;
+ struct mpam_resctrl_res *res;
+ enum resctrl_res_level rid;
+ struct mpam_resctrl_mon *mon;
+ enum resctrl_event_id eventid;
+
+ wait_event(wait_cacheinfo_ready, cacheinfo_ready);
+
+ cpus_read_lock();
+ for_each_mpam_resctrl_control(res, rid) {
+ INIT_LIST_HEAD_RCU(&res->resctrl_res.ctrl_domains);
+ INIT_LIST_HEAD_RCU(&res->resctrl_res.mon_domains);
+ res->resctrl_res.rid = rid;
+ }
+
+ /* Find some classes to use for controls */
+ mpam_resctrl_pick_caches();
+ mpam_resctrl_pick_mba();
+
+ /* Initialise the resctrl structures from the classes */
+ for_each_mpam_resctrl_control(res, rid) {
+ if (!res->class)
+ continue; // dummy resource
+
+ err = mpam_resctrl_control_init(res);
+ if (err) {
+ pr_debug("Failed to initialise rid %u\n", rid);
+ goto internal_error;
+ }
+ }
+
+ /* Find some classes to use for monitors */
+ mpam_resctrl_pick_counters();
+
+ for_each_mpam_resctrl_mon(mon, eventid) {
+ if (!mon->class)
+ continue; // dummy resource
+
+ err = mpam_resctrl_monitor_init(mon, eventid);
+ if (err) {
+ pr_debug("Failed to initialise event %u\n", eventid);
+ goto internal_error;
+ }
+ }
+
+ cpus_read_unlock();
+
+ if (!resctrl_arch_alloc_capable() && !resctrl_arch_mon_capable()) {
+ pr_debug("No alloc(%u) or monitor(%u) found - resctrl not supported\n",
+ resctrl_arch_alloc_capable(), resctrl_arch_mon_capable());
+ return -EOPNOTSUPP;
+ }
+
+ err = resctrl_init();
+ if (err)
+ return err;
+
+ WRITE_ONCE(resctrl_enabled, true);
+
+ return 0;
+
+internal_error:
+ cpus_read_unlock();
+ pr_debug("Internal error %d - resctrl not supported\n", err);
+ return err;
+}
+
+void mpam_resctrl_exit(void)
+{
+ if (!READ_ONCE(resctrl_enabled))
+ return;
+
+ WRITE_ONCE(resctrl_enabled, false);
+ resctrl_exit();
+}
+
+/*
+ * The driver is detaching an MSC from this class; if resctrl was using it,
+ * pull on resctrl_exit().
+ */
+void mpam_resctrl_teardown_class(struct mpam_class *class)
+{
+ struct mpam_resctrl_res *res;
+ enum resctrl_res_level rid;
+ struct mpam_resctrl_mon *mon;
+ enum resctrl_event_id eventid;
+
+ might_sleep();
+
+ for_each_mpam_resctrl_control(res, rid) {
+ if (res->class == class) {
+ res->class = NULL;
+ break;
+ }
+ }
+ for_each_mpam_resctrl_mon(mon, eventid) {
+ if (mon->class == class) {
+ mon->class = NULL;
+ break;
+ }
+ }
+}
+
+static int __init __cacheinfo_ready(void)
+{
+ cacheinfo_ready = true;
+ wake_up(&wait_cacheinfo_ready);
+
+ return 0;
+}
+device_initcall_sync(__cacheinfo_ready);
+
+#ifdef CONFIG_MPAM_KUNIT_TEST
+#include "test_mpam_resctrl.c"
+#endif
diff --git a/drivers/resctrl/test_mpam_resctrl.c b/drivers/resctrl/test_mpam_resctrl.c
new file mode 100644
index 000000000000..b93d6ad87e43
--- /dev/null
+++ b/drivers/resctrl/test_mpam_resctrl.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2025 Arm Ltd.
+/* This file is intended to be included into mpam_resctrl.c */
+
+#include <kunit/test.h>
+#include <linux/array_size.h>
+#include <linux/bits.h>
+#include <linux/math.h>
+#include <linux/sprintf.h>
+
+struct percent_value_case {
+ u8 pc;
+ u8 width;
+ u16 value;
+};
+
+/*
+ * Mysterious inscriptions taken from the union of ARM DDI 0598D.b,
+ * "Arm Architecture Reference Manual Supplement - Memory System
+ * Resource Partitioning and Monitoring (MPAM), for A-profile
+ * architecture", Section 9.8, "About the fixed-point fractional
+ * format" (exact percentage entries only) and ARM IHI0099B.a
+ * "MPAM system component specification", Section 9.3,
+ * "The fixed-point fractional format":
+ */
+static const struct percent_value_case percent_value_cases[] = {
+ /* Architectural cases: */
+ { 1, 8, 1 }, { 1, 12, 0x27 }, { 1, 16, 0x28e },
+ { 25, 8, 0x3f }, { 25, 12, 0x3ff }, { 25, 16, 0x3fff },
+ { 33, 8, 0x53 }, { 33, 12, 0x546 }, { 33, 16, 0x5479 },
+ { 35, 8, 0x58 }, { 35, 12, 0x598 }, { 35, 16, 0x5998 },
+ { 45, 8, 0x72 }, { 45, 12, 0x732 }, { 45, 16, 0x7332 },
+ { 50, 8, 0x7f }, { 50, 12, 0x7ff }, { 50, 16, 0x7fff },
+ { 52, 8, 0x84 }, { 52, 12, 0x850 }, { 52, 16, 0x851d },
+ { 55, 8, 0x8b }, { 55, 12, 0x8cb }, { 55, 16, 0x8ccb },
+ { 58, 8, 0x93 }, { 58, 12, 0x946 }, { 58, 16, 0x9479 },
+ { 75, 8, 0xbf }, { 75, 12, 0xbff }, { 75, 16, 0xbfff },
+ { 80, 8, 0xcb }, { 80, 12, 0xccb }, { 80, 16, 0xcccb },
+ { 88, 8, 0xe0 }, { 88, 12, 0xe13 }, { 88, 16, 0xe146 },
+ { 95, 8, 0xf2 }, { 95, 12, 0xf32 }, { 95, 16, 0xf332 },
+ { 100, 8, 0xff }, { 100, 12, 0xfff }, { 100, 16, 0xffff },
+};
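+
+/*
+ * Observation, not normative: the reference entries above are consistent
+ * with a round-down rule, value = (pc * (1 << width)) / 100 - 1. For
+ * example, pc=25 with width=8 gives (25 * 256) / 100 - 1 = 63 = 0x3f, and
+ * pc=88 with width=12 gives (88 * 4096) / 100 - 1 = 3603 = 0xe13. The
+ * driver rounds to nearest instead; see test_percent_to_max_rounding() for
+ * how that difference is tolerated.
+ */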
+
+static void test_percent_value_desc(const struct percent_value_case *param,
+ char *desc)
+{
+ snprintf(desc, KUNIT_PARAM_DESC_SIZE,
+ "pc=%d, width=%d, value=0x%.*x\n",
+ param->pc, param->width,
+ DIV_ROUND_UP(param->width, 4), param->value);
+}
+
+KUNIT_ARRAY_PARAM(test_percent_value, percent_value_cases,
+ test_percent_value_desc);
+
+struct percent_value_test_info {
+ u32 pc; /* result of value-to-percent conversion */
+ u32 value; /* result of percent-to-value conversion */
+ u32 max_value; /* maximum raw value allowed by test params */
+ unsigned int shift; /* promotes raw testcase value to 16 bits */
+};
+
+/*
+ * Convert a reference percentage to a fixed-point MAX value and
+ * vice-versa, based on param (not test->param_value!)
+ */
+static void __prepare_percent_value_test(struct kunit *test,
+ struct percent_value_test_info *res,
+ const struct percent_value_case *param)
+{
+ struct mpam_props fake_props = { };
+
+ /* Reject bogus test parameters that would break the tests: */
+ KUNIT_ASSERT_GE(test, param->width, 1);
+ KUNIT_ASSERT_LE(test, param->width, 16);
+ KUNIT_ASSERT_LT(test, param->value, 1 << param->width);
+
+ mpam_set_feature(mpam_feat_mbw_max, &fake_props);
+ fake_props.bwa_wd = param->width;
+
+ res->shift = 16 - param->width;
+ res->max_value = GENMASK_U32(param->width - 1, 0);
+ res->value = percent_to_mbw_max(param->pc, &fake_props);
+ res->pc = mbw_max_to_percent(param->value << res->shift, &fake_props);
+}
+
+static void test_get_mba_granularity(struct kunit *test)
+{
+ int ret;
+ struct mpam_props fake_props = { };
+
+ /* Use MBW_MAX */
+ mpam_set_feature(mpam_feat_mbw_max, &fake_props);
+
+ fake_props.bwa_wd = 0;
+ KUNIT_EXPECT_FALSE(test, mba_class_use_mbw_max(&fake_props));
+
+ fake_props.bwa_wd = 1;
+ KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props));
+
+ /* Architectural maximum: */
+ fake_props.bwa_wd = 16;
+ KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props));
+
+ /* No usable control... */
+ fake_props.bwa_wd = 0;
+ ret = get_mba_granularity(&fake_props);
+ KUNIT_EXPECT_EQ(test, ret, 0);
+
+ fake_props.bwa_wd = 1;
+ ret = get_mba_granularity(&fake_props);
+ KUNIT_EXPECT_EQ(test, ret, 50); /* DIV_ROUND_UP(100, 1 << 1)% = 50% */
+
+ fake_props.bwa_wd = 2;
+ ret = get_mba_granularity(&fake_props);
+ KUNIT_EXPECT_EQ(test, ret, 25); /* DIV_ROUND_UP(100, 1 << 2)% = 25% */
+
+ fake_props.bwa_wd = 3;
+ ret = get_mba_granularity(&fake_props);
+ KUNIT_EXPECT_EQ(test, ret, 13); /* DIV_ROUND_UP(100, 1 << 3)% = 13% */
+
+ fake_props.bwa_wd = 6;
+ ret = get_mba_granularity(&fake_props);
+ KUNIT_EXPECT_EQ(test, ret, 2); /* DIV_ROUND_UP(100, 1 << 6)% = 2% */
+
+ fake_props.bwa_wd = 7;
+ ret = get_mba_granularity(&fake_props);
+ KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 7)% = 1% */
+
+ /* Granularity saturates at 1% */
+ fake_props.bwa_wd = 16; /* architectural maximum */
+ ret = get_mba_granularity(&fake_props);
+ KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 16)% = 1% */
+}
+
+static void test_mbw_max_to_percent(struct kunit *test)
+{
+ const struct percent_value_case *param = test->param_value;
+ struct percent_value_test_info res;
+
+ /*
+ * Since the reference values in percent_value_cases[] all
+ * correspond to exact percentages, round-to-nearest will
+ * always give the exact percentage back when the MPAM max
+ * value has precision of 0.5% or finer. (Always true for the
+ * reference data, since they all specify 8 bits or more of
+ * precision.)
+ *
+ * So, keep it simple and demand an exact match:
+ */
+ __prepare_percent_value_test(test, &res, param);
+ KUNIT_EXPECT_EQ(test, res.pc, param->pc);
+}
+
+static void test_percent_to_mbw_max(struct kunit *test)
+{
+ const struct percent_value_case *param = test->param_value;
+ struct percent_value_test_info res;
+
+ __prepare_percent_value_test(test, &res, param);
+
+ KUNIT_EXPECT_GE(test, res.value, param->value << res.shift);
+ KUNIT_EXPECT_LE(test, res.value, (param->value + 1) << res.shift);
+ KUNIT_EXPECT_LE(test, res.value, res.max_value << res.shift);
+
+ /* No flexibility allowed for 0% and 100%! */
+
+ if (param->pc == 0)
+ KUNIT_EXPECT_EQ(test, res.value, 0);
+
+ if (param->pc == 100)
+ KUNIT_EXPECT_EQ(test, res.value, res.max_value << res.shift);
+}
+
+static const void *test_all_bwa_wd_gen_params(struct kunit *test, const void *prev,
+ char *desc)
+{
+ uintptr_t param = (uintptr_t)prev;
+
+ if (param > 15)
+ return NULL;
+
+ param++;
+
+ snprintf(desc, KUNIT_PARAM_DESC_SIZE, "wd=%u\n", (unsigned int)param);
+
+ return (void *)param;
+}
+
+static unsigned int test_get_bwa_wd(struct kunit *test)
+{
+ uintptr_t param = (uintptr_t)test->param_value;
+
+ KUNIT_ASSERT_GE(test, param, 1);
+ KUNIT_ASSERT_LE(test, param, 16);
+
+ return param;
+}
+
+static void test_mbw_max_to_percent_limits(struct kunit *test)
+{
+ struct mpam_props fake_props = {0};
+ u32 max_value;
+
+ mpam_set_feature(mpam_feat_mbw_max, &fake_props);
+ fake_props.bwa_wd = test_get_bwa_wd(test);
+ max_value = GENMASK(15, 16 - fake_props.bwa_wd);
+
+ KUNIT_EXPECT_EQ(test, mbw_max_to_percent(max_value, &fake_props),
+ MAX_MBA_BW);
+ KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props),
+ get_mba_min(&fake_props));
+
+ /*
+ * Rounding policy dependent 0% sanity-check:
+ * With round-to-nearest, the minimum mbw_max value really
+ * should map to 0% if there are at least 200 steps.
+ * (100 steps may be enough for some other rounding policies.)
+ */
+ if (fake_props.bwa_wd >= 8)
+ KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props), 0);
+
+ if (fake_props.bwa_wd < 8 &&
+ mbw_max_to_percent(0, &fake_props) == 0)
+ kunit_warn(test, "wd=%d: Testsuite/driver Rounding policy mismatch?",
+ fake_props.bwa_wd);
+}
+
+/*
+ * Check that converting a percentage to mbw_max and back again (or, as
+ * appropriate, vice-versa) always restores the original value:
+ */
+static void test_percent_max_roundtrip_stability(struct kunit *test)
+{
+ struct mpam_props fake_props = {0};
+ unsigned int shift;
+ u32 pc, max, pc2, max2;
+
+ mpam_set_feature(mpam_feat_mbw_max, &fake_props);
+ fake_props.bwa_wd = test_get_bwa_wd(test);
+ shift = 16 - fake_props.bwa_wd;
+
+ /*
+ * Converting a valid value from the coarser scale to the finer
+ * scale and back again must yield the original value:
+ */
+ if (fake_props.bwa_wd >= 7) {
+ /* More than 100 steps: only test exact pc values: */
+ for (pc = get_mba_min(&fake_props); pc <= MAX_MBA_BW; pc++) {
+ max = percent_to_mbw_max(pc, &fake_props);
+ pc2 = mbw_max_to_percent(max, &fake_props);
+ KUNIT_EXPECT_EQ(test, pc2, pc);
+ }
+ } else {
+ /* Fewer than 100 steps: only test exact mbw_max values: */
+ for (max = 0; max < 1 << 16; max += 1 << shift) {
+ pc = mbw_max_to_percent(max, &fake_props);
+ max2 = percent_to_mbw_max(pc, &fake_props);
+ KUNIT_EXPECT_EQ(test, max2, max);
+ }
+ }
+}
+
+static void test_percent_to_max_rounding(struct kunit *test)
+{
+ const struct percent_value_case *param;
+ unsigned int num_rounded_up = 0, total = 0;
+ struct percent_value_test_info res;
+
+ for (param = percent_value_cases, total = 0;
+ param < &percent_value_cases[ARRAY_SIZE(percent_value_cases)];
+ param++, total++) {
+ __prepare_percent_value_test(test, &res, param);
+ if (res.value > param->value << res.shift)
+ num_rounded_up++;
+ }
+
+ /*
+ * The MPAM driver applies a round-to-nearest policy, whereas a
+ * round-down policy seems to have been applied in the
+ * reference table from which the test vectors were selected.
+ *
+ * For a large and well-distributed suite of test vectors,
+ * about half should be rounded up and half down compared with
+ * the reference table. The actual test vectors are few in
+ * number and probably not very well distributed however, so
+ * tolerate a round-up rate of between 1/4 and 3/4 before
+ * crying foul:
+ */
+
+ kunit_info(test, "Round-up rate: %u%% (%u/%u)\n",
+ DIV_ROUND_CLOSEST(num_rounded_up * 100, total),
+ num_rounded_up, total);
+
+ KUNIT_EXPECT_GE(test, 4 * num_rounded_up, 1 * total);
+ KUNIT_EXPECT_LE(test, 4 * num_rounded_up, 3 * total);
+}
+
+static struct kunit_case mpam_resctrl_test_cases[] = {
+ KUNIT_CASE(test_get_mba_granularity),
+ KUNIT_CASE_PARAM(test_mbw_max_to_percent, test_percent_value_gen_params),
+ KUNIT_CASE_PARAM(test_percent_to_mbw_max, test_percent_value_gen_params),
+ KUNIT_CASE_PARAM(test_mbw_max_to_percent_limits, test_all_bwa_wd_gen_params),
+ KUNIT_CASE(test_percent_to_max_rounding),
+ KUNIT_CASE_PARAM(test_percent_max_roundtrip_stability,
+ test_all_bwa_wd_gen_params),
+ {}
+};
+
+static struct kunit_suite mpam_resctrl_test_suite = {
+ .name = "mpam_resctrl_test_suite",
+ .test_cases = mpam_resctrl_test_cases,
+};
+
+kunit_test_suites(&mpam_resctrl_test_suite);
diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h
index 7f00c5285a32..f92a36187a52 100644
--- a/include/linux/arm_mpam.h
+++ b/include/linux/arm_mpam.h
@@ -5,6 +5,7 @@
#define __LINUX_ARM_MPAM_H
#include <linux/acpi.h>
+#include <linux/resctrl_types.h>
#include <linux/types.h>
struct mpam_msc;
@@ -49,6 +50,37 @@ static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx,
}
#endif
+bool resctrl_arch_alloc_capable(void);
+bool resctrl_arch_mon_capable(void);
+
+void resctrl_arch_set_cpu_default_closid(int cpu, u32 closid);
+void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid);
+void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid);
+void resctrl_arch_sched_in(struct task_struct *tsk);
+bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid);
+bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid);
+u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid);
+void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid);
+u32 resctrl_arch_system_num_rmid_idx(void);
+
+struct rdt_resource;
+void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, enum resctrl_event_id evtid);
+void resctrl_arch_mon_ctx_free(struct rdt_resource *r, enum resctrl_event_id evtid, void *ctx);
+
+/*
+ * The CPU configuration for MPAM is cheap to write, and is only written if it
+ * has changed. No need for fine grained enables.
+ */
+static inline void resctrl_arch_enable_mon(void) { }
+static inline void resctrl_arch_disable_mon(void) { }
+static inline void resctrl_arch_enable_alloc(void) { }
+static inline void resctrl_arch_disable_alloc(void) { }
+
+static inline unsigned int resctrl_arch_round_mon_val(unsigned int val)
+{
+ return val;
+}
+
/**
* mpam_register_requestor() - Register a requestor with the MPAM driver
* @partid_max: The maximum PARTID value the requestor can generate.
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index f83ca0abf2cd..dbaa153100f4 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -321,7 +321,7 @@ static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
{
instrumentation_begin();
syscall_exit_to_user_mode_work(regs);
- local_irq_disable_exit_to_user();
+ local_irq_disable();
syscall_exit_to_user_mode_prepare(regs);
instrumentation_end();
exit_to_user_mode();
diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index d26d1b1bcbfb..384520217bfe 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -101,37 +101,6 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
}
/**
- * local_irq_enable_exit_to_user - Exit to user variant of local_irq_enable()
- * @ti_work: Cached TIF flags gathered with interrupts disabled
- *
- * Defaults to local_irq_enable(). Can be supplied by architecture specific
- * code.
- */
-static inline void local_irq_enable_exit_to_user(unsigned long ti_work);
-
-#ifndef local_irq_enable_exit_to_user
-static __always_inline void local_irq_enable_exit_to_user(unsigned long ti_work)
-{
- local_irq_enable();
-}
-#endif
-
-/**
- * local_irq_disable_exit_to_user - Exit to user variant of local_irq_disable()
- *
- * Defaults to local_irq_disable(). Can be supplied by architecture specific
- * code.
- */
-static inline void local_irq_disable_exit_to_user(void);
-
-#ifndef local_irq_disable_exit_to_user
-static __always_inline void local_irq_disable_exit_to_user(void)
-{
- local_irq_disable();
-}
-#endif
-
-/**
* arch_exit_to_user_mode_work - Architecture specific TIF work for exit
* to user mode.
* @regs: Pointer to currents pt_regs
@@ -335,6 +304,8 @@ static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs)
*/
static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
+ lockdep_assert_irqs_disabled();
+
instrumentation_begin();
irqentry_exit_to_user_mode_prepare(regs);
instrumentation_end();
@@ -366,6 +337,205 @@ typedef struct irqentry_state {
#endif
/**
+ * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt
+ *
+ * Conditional reschedule with additional sanity checks.
+ */
+void raw_irqentry_exit_cond_resched(void);
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched
+#define irqentry_exit_cond_resched_dynamic_disabled NULL
+DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
+#define irqentry_exit_cond_resched() static_call(irqentry_exit_cond_resched)()
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
+void dynamic_irqentry_exit_cond_resched(void);
+#define irqentry_exit_cond_resched() dynamic_irqentry_exit_cond_resched()
+#endif
+#else /* CONFIG_PREEMPT_DYNAMIC */
+#define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched()
+#endif /* CONFIG_PREEMPT_DYNAMIC */
+
+/**
+ * irqentry_enter_from_kernel_mode - Establish state before invoking the irq handler
+ * @regs: Pointer to currents pt_regs
+ *
+ * Invoked from architecture specific entry code with interrupts disabled.
+ * Can only be called when the interrupt entry came from kernel mode. The
+ * calling code must be non-instrumentable. When the function returns all
+ * state is correct and the subsequent functions can be instrumented.
+ *
+ * The function establishes state (lockdep, RCU (context tracking), tracing) and
+ * is provided for architectures which require a strict split between entry from
+ * kernel and user mode and therefore cannot use irqentry_enter() which handles
+ * both entry modes.
+ *
+ * Returns: An opaque object that must be passed to irqentry_exit_to_kernel_mode().
+ */
+static __always_inline irqentry_state_t irqentry_enter_from_kernel_mode(struct pt_regs *regs)
+{
+ irqentry_state_t ret = {
+ .exit_rcu = false,
+ };
+
+ /*
+ * If this entry hit the idle task invoke ct_irq_enter() whether
+ * RCU is watching or not.
+ *
+ * Interrupts can nest when the first interrupt invokes softirq
+ * processing on return which enables interrupts.
+ *
+ * Scheduler ticks in the idle task can mark quiescent state and
+ * terminate a grace period, if and only if the timer interrupt is
+ * not nested into another interrupt.
+ *
+ * Checking for rcu_is_watching() here would prevent the nesting
+ * interrupt to invoke ct_irq_enter(). If that nested interrupt is
+ * the tick then rcu_flavor_sched_clock_irq() would wrongfully
+ * assume that it is the first interrupt and eventually claim
+ * quiescent state and end grace periods prematurely.
+ *
+ * Unconditionally invoke ct_irq_enter() so RCU state stays
+ * consistent.
+ *
+ * TINY_RCU does not support EQS, so let the compiler eliminate
+ * this part when enabled.
+ */
+ if (!IS_ENABLED(CONFIG_TINY_RCU) &&
+ (is_idle_task(current) || arch_in_rcu_eqs())) {
+ /*
+ * If RCU is not watching then the same careful
+ * sequence vs. lockdep and tracing is required
+ * as in irqentry_enter_from_user_mode().
+ */
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ ct_irq_enter();
+ instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+
+ ret.exit_rcu = true;
+ return ret;
+ }
+
+ /*
+ * If RCU is watching then RCU only wants to check whether it needs
+ * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
+ * already contains a warning when RCU is not watching, so no point
+ * in having another one here.
+ */
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ instrumentation_begin();
+ kmsan_unpoison_entry_regs(regs);
+ rcu_irq_enter_check_tick();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+
+ return ret;
+}
+
+/**
+ * irqentry_exit_to_kernel_mode_preempt - Run preempt checks on return to kernel mode
+ * @regs: Pointer to current's pt_regs
+ * @state: Return value from matching call to irqentry_enter_from_kernel_mode()
+ *
+ * This is to be invoked before irqentry_exit_to_kernel_mode_after_preempt() to
+ * allow kernel preemption on return from interrupt.
+ *
+ * Must be invoked with interrupts disabled and CPU state which allows kernel
+ * preemption.
+ *
+ * After returning from this function, the caller can modify CPU state before
+ * invoking irqentry_exit_to_kernel_mode_after_preempt(), which is required to
+ * re-establish the tracing, lockdep and RCU state for returning to the
+ * interrupted context.
+ */
+static inline void irqentry_exit_to_kernel_mode_preempt(struct pt_regs *regs,
+ irqentry_state_t state)
+{
+ if (regs_irqs_disabled(regs) || state.exit_rcu)
+ return;
+
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ irqentry_exit_cond_resched();
+}
+
+/**
+ * irqentry_exit_to_kernel_mode_after_preempt - Establish trace, lockdep and RCU state
+ * @regs: Pointer to current's pt_regs
+ * @state: Return value from matching call to irqentry_enter_from_kernel_mode()
+ *
+ * This is to be invoked after irqentry_exit_to_kernel_mode_preempt() and before
+ * actually returning to the interrupted context.
+ *
+ * There are no requirements for the CPU state other than being able to complete
+ * the tracing, lockdep and RCU state transitions. After this function returns
+ * the caller must return directly to the interrupted context.
+ */
+static __always_inline void
+irqentry_exit_to_kernel_mode_after_preempt(struct pt_regs *regs, irqentry_state_t state)
+{
+ if (!regs_irqs_disabled(regs)) {
+ /*
+ * If RCU was not watching on entry this needs to be done
+ * carefully and needs the same ordering of lockdep/tracing
+ * and RCU as the return to user mode path.
+ */
+ if (state.exit_rcu) {
+ instrumentation_begin();
+ /* Tell the tracer that IRET will enable interrupts */
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare();
+ instrumentation_end();
+ ct_irq_exit();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+ return;
+ }
+
+ instrumentation_begin();
+ /* Covers both tracing and lockdep */
+ trace_hardirqs_on();
+ instrumentation_end();
+ } else {
+ /*
+ * IRQ flags state is correct already. Just tell RCU if it
+ * was not watching on entry.
+ */
+ if (state.exit_rcu)
+ ct_irq_exit();
+ }
+}
+
+/**
+ * irqentry_exit_to_kernel_mode - Run preempt checks and establish state after
+ * invoking the interrupt handler
+ * @regs: Pointer to current's pt_regs
+ * @state: Return value from matching call to irqentry_enter_from_kernel_mode()
+ *
+ * This is the counterpart of irqentry_enter_from_kernel_mode() and combines
+ * the calls to irqentry_exit_to_kernel_mode_preempt() and
+ * irqentry_exit_to_kernel_mode_after_preempt().
+ *
+ * The requirement for the CPU state is that it can schedule. After the function
+ * returns the tracing, lockdep and RCU state transitions are completed and the
+ * caller must return directly to the interrupted context.
+ */
+static __always_inline void irqentry_exit_to_kernel_mode(struct pt_regs *regs,
+ irqentry_state_t state)
+{
+ lockdep_assert_irqs_disabled();
+
+ instrumentation_begin();
+ irqentry_exit_to_kernel_mode_preempt(regs, state);
+ instrumentation_end();
+
+ irqentry_exit_to_kernel_mode_after_preempt(regs, state);
+}
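+
+/*
+ * Sketch of how an architecture's kernel-mode IRQ path might use the split
+ * helpers (illustrative only; the handler named below is a placeholder, not
+ * an existing function):
+ *
+ *    state = irqentry_enter_from_kernel_mode(regs);
+ *    arch_handle_irq(regs);
+ *    irqentry_exit_to_kernel_mode_preempt(regs, state);
+ *    ... arch-specific CPU state changes go here ...
+ *    irqentry_exit_to_kernel_mode_after_preempt(regs, state);
+ *
+ * When nothing needs to change between the preemption check and the final
+ * return, irqentry_exit_to_kernel_mode() combines the two calls.
+ */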
+
+/**
* irqentry_enter - Handle state tracking on ordinary interrupt entries
* @regs: Pointer to pt_regs of interrupted context
*
@@ -394,33 +564,11 @@ typedef struct irqentry_state {
* establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
* would not be possible.
*
- * Returns: An opaque object that must be passed to idtentry_exit()
+ * Returns: An opaque object that must be passed to irqentry_exit()
*/
irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs);
/**
- * irqentry_exit_cond_resched - Conditionally reschedule on return from interrupt
- *
- * Conditional reschedule with additional sanity checks.
- */
-void raw_irqentry_exit_cond_resched(void);
-
-#ifdef CONFIG_PREEMPT_DYNAMIC
-#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-#define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched
-#define irqentry_exit_cond_resched_dynamic_disabled NULL
-DECLARE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
-#define irqentry_exit_cond_resched() static_call(irqentry_exit_cond_resched)()
-#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
-DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
-void dynamic_irqentry_exit_cond_resched(void);
-#define irqentry_exit_cond_resched() dynamic_irqentry_exit_cond_resched()
-#endif
-#else /* CONFIG_PREEMPT_DYNAMIC */
-#define irqentry_exit_cond_resched() raw_irqentry_exit_cond_resched()
-#endif /* CONFIG_PREEMPT_DYNAMIC */
-
-/**
* irqentry_exit - Handle return from exception that used irqentry_enter()
* @regs: Pointer to pt_regs (exception entry regs)
* @state: Return value from matching call to irqentry_enter()
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 9ef63e414791..1034be02eae8 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -47,7 +47,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re
*/
while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) {
- local_irq_enable_exit_to_user(ti_work);
+ local_irq_enable();
if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) {
if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY))
@@ -74,7 +74,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re
* might have changed while interrupts and preemption was
* enabled above.
*/
- local_irq_disable_exit_to_user();
+ local_irq_disable();
/* Check if any of the above work has queued a deferred wakeup */
tick_nohz_user_enter_prepare();
@@ -105,70 +105,16 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
- irqentry_state_t ret = {
- .exit_rcu = false,
- };
-
if (user_mode(regs)) {
- irqentry_enter_from_user_mode(regs);
- return ret;
- }
+ irqentry_state_t ret = {
+ .exit_rcu = false,
+ };
- /*
- * If this entry hit the idle task invoke ct_irq_enter() whether
- * RCU is watching or not.
- *
- * Interrupts can nest when the first interrupt invokes softirq
- * processing on return which enables interrupts.
- *
- * Scheduler ticks in the idle task can mark quiescent state and
- * terminate a grace period, if and only if the timer interrupt is
- * not nested into another interrupt.
- *
- * Checking for rcu_is_watching() here would prevent the nesting
- * interrupt to invoke ct_irq_enter(). If that nested interrupt is
- * the tick then rcu_flavor_sched_clock_irq() would wrongfully
- * assume that it is the first interrupt and eventually claim
- * quiescent state and end grace periods prematurely.
- *
- * Unconditionally invoke ct_irq_enter() so RCU state stays
- * consistent.
- *
- * TINY_RCU does not support EQS, so let the compiler eliminate
- * this part when enabled.
- */
- if (!IS_ENABLED(CONFIG_TINY_RCU) &&
- (is_idle_task(current) || arch_in_rcu_eqs())) {
- /*
- * If RCU is not watching then the same careful
- * sequence vs. lockdep and tracing is required
- * as in irqentry_enter_from_user_mode().
- */
- lockdep_hardirqs_off(CALLER_ADDR0);
- ct_irq_enter();
- instrumentation_begin();
- kmsan_unpoison_entry_regs(regs);
- trace_hardirqs_off_finish();
- instrumentation_end();
-
- ret.exit_rcu = true;
+ irqentry_enter_from_user_mode(regs);
return ret;
}
- /*
- * If RCU is watching then RCU only wants to check whether it needs
- * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
- * already contains a warning when RCU is not watching, so no point
- * in having another one here.
- */
- lockdep_hardirqs_off(CALLER_ADDR0);
- instrumentation_begin();
- kmsan_unpoison_entry_regs(regs);
- rcu_irq_enter_check_tick();
- trace_hardirqs_off_finish();
- instrumentation_end();
-
- return ret;
+ return irqentry_enter_from_kernel_mode(regs);
}
/**
@@ -212,43 +158,10 @@ void dynamic_irqentry_exit_cond_resched(void)
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
- lockdep_assert_irqs_disabled();
-
- /* Check whether this returns to user mode */
- if (user_mode(regs)) {
+ if (user_mode(regs))
irqentry_exit_to_user_mode(regs);
- } else if (!regs_irqs_disabled(regs)) {
- /*
- * If RCU was not watching on entry this needs to be done
- * carefully and needs the same ordering of lockdep/tracing
- * and RCU as the return to user mode path.
- */
- if (state.exit_rcu) {
- instrumentation_begin();
- /* Tell the tracer that IRET will enable interrupts */
- trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare();
- instrumentation_end();
- ct_irq_exit();
- lockdep_hardirqs_on(CALLER_ADDR0);
- return;
- }
-
- instrumentation_begin();
- if (IS_ENABLED(CONFIG_PREEMPTION))
- irqentry_exit_cond_resched();
-
- /* Covers both tracing and lockdep */
- trace_hardirqs_on();
- instrumentation_end();
- } else {
- /*
- * IRQ flags state is correct already. Just tell RCU if it
- * was not watching on entry.
- */
- if (state.exit_rcu)
- ct_irq_exit();
- }
+ else
+ irqentry_exit_to_kernel_mode(regs, state);
}
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
diff --git a/tools/testing/selftests/arm64/abi/hwcap.c b/tools/testing/selftests/arm64/abi/hwcap.c
index c2661a312fc9..e22703d6b97c 100644
--- a/tools/testing/selftests/arm64/abi/hwcap.c
+++ b/tools/testing/selftests/arm64/abi/hwcap.c
@@ -56,7 +56,8 @@ static void atomics_sigill(void)
static void cmpbr_sigill(void)
{
- /* Not implemented, too complicated and unreliable anyway */
+ asm volatile(".inst 0x74C00040\n" /* CBEQ w0, w0, +8 */
+ "udf #0" : : : "cc"); /* UDF #0 */
}
static void crc32_sigill(void)
diff --git a/tools/testing/selftests/kvm/arm64/set_id_regs.c b/tools/testing/selftests/kvm/arm64/set_id_regs.c
index 73de5be58bab..fa3478a6c914 100644
--- a/tools/testing/selftests/kvm/arm64/set_id_regs.c
+++ b/tools/testing/selftests/kvm/arm64/set_id_regs.c
@@ -124,6 +124,7 @@ static const struct reg_ftr_bits ftr_id_aa64isar2_el1[] = {
static const struct reg_ftr_bits ftr_id_aa64isar3_el1[] = {
REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, FPRCVT, 0),
+ REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, LSUI, 0),
REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, LSFE, 0),
REG_FTR_BITS(FTR_LOWER_SAFE, ID_AA64ISAR3_EL1, FAMINMAX, 0),
REG_FTR_END,