Diffstat (limited to 'arch')
97 files changed, 1654 insertions, 8444 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index b0adb665041f..a3308a220f86 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1518,6 +1518,14 @@ config STRICT_MODULE_RWX config ARCH_HAS_PHYS_TO_DMA bool +config ARCH_HAS_CPU_RESCTRL + bool + help + An architecture selects this option to indicate that the necessary + hooks are provided to support the common memory system usage + monitoring and control interfaces provided by the 'resctrl' + filesystem (see RESCTRL_FS). + config HAVE_ARCH_COMPILER_H bool help diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 3df5f2dd4c0f..8f1f18adcdb5 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -150,6 +150,8 @@ #define SO_RCVPRIORITY 82 +#define SO_PASSRIGHTS 83 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 25ed6f1a7c7a..3072731fe09c 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1380,8 +1380,7 @@ config CC_HAVE_STACKPROTECTOR_TLS config STACKPROTECTOR_PER_TASK bool "Use a unique stack canary value for each task" depends on STACKPROTECTOR && CURRENT_POINTER_IN_TPIDRURO && !XIP_DEFLATED_DATA - depends on GCC_PLUGINS || CC_HAVE_STACKPROTECTOR_TLS - select GCC_PLUGIN_ARM_SSP_PER_TASK if !CC_HAVE_STACKPROTECTOR_TLS + depends on CC_HAVE_STACKPROTECTOR_TLS default y help Due to the fact that GCC uses an ordinary symbol reference from diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 945b5975fce2..d61369b1eabe 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -96,7 +96,7 @@ KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING ccflags-y := -fpic $(call cc-option,-mno-single-pic-base,) -fno-builtin \ -I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \ - -I$(obj) $(DISABLE_ARM_SSP_PER_TASK_PLUGIN) + -I$(obj) ccflags-remove-$(CONFIG_FUNCTION_TRACER) += -pg asflags-y := -DZIMAGE diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index 96178acedad0..aca01ad6aafc 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -1121,25 +1121,6 @@ CONFIG_QCOM_SMSM=y CONFIG_QCOM_SOCINFO=m CONFIG_QCOM_STATS=m CONFIG_QCOM_WCNSS_CTRL=m -CONFIG_ARCH_EMEV2=y -CONFIG_ARCH_R8A7794=y -CONFIG_ARCH_R8A7779=y -CONFIG_ARCH_R8A7790=y -CONFIG_ARCH_R8A7778=y -CONFIG_ARCH_R8A7793=y -CONFIG_ARCH_R8A7791=y -CONFIG_ARCH_R8A7792=y -CONFIG_ARCH_R8A7740=y -CONFIG_ARCH_R8A73A4=y -CONFIG_ARCH_R7S72100=y -CONFIG_ARCH_R7S9210=y -CONFIG_ARCH_R8A77470=y -CONFIG_ARCH_R8A7745=y -CONFIG_ARCH_R8A7742=y -CONFIG_ARCH_R8A7743=y -CONFIG_ARCH_R8A7744=y -CONFIG_ARCH_R9A06G032=y -CONFIG_ARCH_SH73A0=y CONFIG_ROCKCHIP_IODOMAIN=y CONFIG_ARCH_TEGRA_2x_SOC=y CONFIG_ARCH_TEGRA_3x_SOC=y @@ -1203,7 +1184,7 @@ CONFIG_PWM_BCM2835=y CONFIG_PWM_BRCMSTB=m CONFIG_PWM_FSL_FTM=m CONFIG_PWM_MESON=m -CONFIG_PWM_RCAR=m +CONFIG_PWM_RENESAS_RCAR=m CONFIG_PWM_RENESAS_TPU=y CONFIG_PWM_ROCKCHIP=m CONFIG_PWM_SAMSUNG=m diff --git a/arch/arm/configs/shmobile_defconfig b/arch/arm/configs/shmobile_defconfig index 8c30ed14e52c..7c3d6a8f0038 100644 --- a/arch/arm/configs/shmobile_defconfig +++ b/arch/arm/configs/shmobile_defconfig @@ -63,6 +63,7 @@ CONFIG_SMSC_PHY=y CONFIG_CAN_RCAR=y CONFIG_INPUT_EVDEV=y CONFIG_KEYBOARD_GPIO=y +CONFIG_KEYBOARD_GPIO_POLLED=y # CONFIG_INPUT_MOUSE is not set CONFIG_INPUT_TOUCHSCREEN=y CONFIG_TOUCHSCREEN_EDT_FT5X06=y @@ -84,6 +85,7 @@ CONFIG_SERIAL_8250_EM=y CONFIG_SERIAL_SH_SCI=y CONFIG_I2C_CHARDEV=y 
CONFIG_I2C_DEMUX_PINCTRL=y +CONFIG_I2C_DESIGNWARE_CORE=y CONFIG_I2C_EMEV2=y CONFIG_I2C_GPIO=y CONFIG_I2C_RIIC=y @@ -104,7 +106,7 @@ CONFIG_GPIO_PCF857X=y CONFIG_POWER_RESET=y CONFIG_POWER_RESET_RMOBILE=y CONFIG_POWER_SUPPLY=y -# CONFIG_HWMON is not set +CONFIG_SENSORS_LM75=y CONFIG_THERMAL=y CONFIG_CPU_THERMAL=y CONFIG_RCAR_THERMAL=y @@ -174,6 +176,9 @@ CONFIG_USB_RENESAS_USBHS_UDC=y CONFIG_USB_RENESAS_USBF=y CONFIG_USB_ETH=y CONFIG_MMC=y +CONFIG_MMC_SDHCI=y +CONFIG_MMC_SDHCI_PLTFM=y +CONFIG_MMC_SDHCI_OF_ARASAN=y CONFIG_MMC_SDHI=y CONFIG_MMC_SH_MMCIF=y CONFIG_NEW_LEDS=y @@ -195,29 +200,10 @@ CONFIG_RCAR_DMAC=y CONFIG_RENESAS_USB_DMAC=y CONFIG_RZ_DMAC=y # CONFIG_IOMMU_SUPPORT is not set -CONFIG_ARCH_EMEV2=y -CONFIG_ARCH_R8A7794=y -CONFIG_ARCH_R8A7779=y -CONFIG_ARCH_R8A7790=y -CONFIG_ARCH_R8A7778=y -CONFIG_ARCH_R8A7793=y -CONFIG_ARCH_R8A7791=y -CONFIG_ARCH_R8A7792=y -CONFIG_ARCH_R8A7740=y -CONFIG_ARCH_R8A73A4=y -CONFIG_ARCH_R7S72100=y -CONFIG_ARCH_R7S9210=y -CONFIG_ARCH_R8A77470=y -CONFIG_ARCH_R8A7745=y -CONFIG_ARCH_R8A7742=y -CONFIG_ARCH_R8A7743=y -CONFIG_ARCH_R8A7744=y -CONFIG_ARCH_R9A06G032=y -CONFIG_ARCH_SH73A0=y CONFIG_IIO=y CONFIG_AK8975=y CONFIG_PWM=y -CONFIG_PWM_RCAR=y +CONFIG_PWM_RENESAS_RCAR=y CONFIG_PWM_RENESAS_TPU=y CONFIG_PHY_RCAR_GEN2=y CONFIG_PHY_RCAR_GEN3_USB2=y diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a182295e6f08..de9290d52fca 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -42,6 +42,7 @@ config ARM64 select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_NONLEAF_PMD_YOUNG if ARM64_HAFT + select ARCH_HAS_PREEMPT_LAZY select ARCH_HAS_PTDUMP select ARCH_HAS_PTE_DEVMAP select ARCH_HAS_PTE_SPECIAL @@ -134,7 +135,6 @@ config ARM64 select COMMON_CLK select CPU_PM if (SUSPEND || CPU_IDLE) select CPUMASK_OFFSTACK if NR_CPUS > 256 - select CRC32 select DCACHE_WORD_ACCESS select DYNAMIC_FTRACE if FUNCTION_TRACER select DMA_BOUNCE_UNALIGNED_KMALLOC @@ -333,9 +333,9 @@ config ARCH_MMAP_RND_BITS_MAX default 24 if ARM64_VA_BITS=39 default 27 if ARM64_VA_BITS=42 default 30 if ARM64_VA_BITS=47 - default 29 if ARM64_VA_BITS=48 && ARM64_64K_PAGES - default 31 if ARM64_VA_BITS=48 && ARM64_16K_PAGES - default 33 if ARM64_VA_BITS=48 + default 29 if (ARM64_VA_BITS=48 || ARM64_VA_BITS=52) && ARM64_64K_PAGES + default 31 if (ARM64_VA_BITS=48 || ARM64_VA_BITS=52) && ARM64_16K_PAGES + default 33 if (ARM64_VA_BITS=48 || ARM64_VA_BITS=52) default 14 if ARM64_64K_PAGES default 16 if ARM64_16K_PAGES default 18 @@ -2285,7 +2285,6 @@ config ARM64_SME bool "ARM Scalable Matrix Extension support" default y depends on ARM64_SVE - depends on BROKEN help The Scalable Matrix Extension (SME) is an extension to the AArch64 execution state which utilises a substantial subset of the SVE diff --git a/arch/arm64/boot/dts/qcom/x1e80100.dtsi b/arch/arm64/boot/dts/qcom/x1e80100.dtsi index 4936fa5b98ff..8eddf0c96098 100644 --- a/arch/arm64/boot/dts/qcom/x1e80100.dtsi +++ b/arch/arm64/boot/dts/qcom/x1e80100.dtsi @@ -3752,60 +3752,83 @@ }; gpu_opp_table: opp-table { - compatible = "operating-points-v2"; + compatible = "operating-points-v2-adreno", "operating-points-v2"; + + opp-1250000000 { + opp-hz = /bits/ 64 <1250000000>; + opp-level = <RPMH_REGULATOR_LEVEL_TURBO_L3>; + opp-peak-kBps = <16500000>; + qcom,opp-acd-level = <0xa82a5ffd>; + }; + + opp-1175000000 { + opp-hz = /bits/ 64 <1175000000>; + opp-level = <RPMH_REGULATOR_LEVEL_TURBO_L2>; + opp-peak-kBps = <14398438>; + qcom,opp-acd-level = <0xa82a5ffd>; + }; opp-1100000000 { opp-hz = /bits/ 64 
<1100000000>; opp-level = <RPMH_REGULATOR_LEVEL_TURBO_L1>; - opp-peak-kBps = <16500000>; + opp-peak-kBps = <14398438>; + qcom,opp-acd-level = <0xa82a5ffd>; }; opp-1000000000 { opp-hz = /bits/ 64 <1000000000>; opp-level = <RPMH_REGULATOR_LEVEL_TURBO>; opp-peak-kBps = <14398438>; + qcom,opp-acd-level = <0xa82b5ffd>; }; opp-925000000 { opp-hz = /bits/ 64 <925000000>; opp-level = <RPMH_REGULATOR_LEVEL_NOM_L1>; opp-peak-kBps = <14398438>; + qcom,opp-acd-level = <0xa82b5ffd>; }; opp-800000000 { opp-hz = /bits/ 64 <800000000>; opp-level = <RPMH_REGULATOR_LEVEL_NOM>; opp-peak-kBps = <12449219>; + qcom,opp-acd-level = <0xa82c5ffd>; }; opp-744000000 { opp-hz = /bits/ 64 <744000000>; opp-level = <RPMH_REGULATOR_LEVEL_SVS_L2>; opp-peak-kBps = <10687500>; + qcom,opp-acd-level = <0x882e5ffd>; }; opp-687000000 { opp-hz = /bits/ 64 <687000000>; opp-level = <RPMH_REGULATOR_LEVEL_SVS_L1>; opp-peak-kBps = <8171875>; + qcom,opp-acd-level = <0x882e5ffd>; }; opp-550000000 { opp-hz = /bits/ 64 <550000000>; opp-level = <RPMH_REGULATOR_LEVEL_SVS>; opp-peak-kBps = <6074219>; + qcom,opp-acd-level = <0xc0285ffd>; }; opp-390000000 { opp-hz = /bits/ 64 <390000000>; opp-level = <RPMH_REGULATOR_LEVEL_LOW_SVS>; opp-peak-kBps = <3000000>; + qcom,opp-acd-level = <0xc0285ffd>; }; opp-300000000 { opp-hz = /bits/ 64 <300000000>; opp-level = <RPMH_REGULATOR_LEVEL_LOW_SVS_D1>; opp-peak-kBps = <2136719>; + qcom,opp-acd-level = <0xc02b5ffd>; }; }; }; diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 370ad70b4be8..a61154545c89 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1010,6 +1010,7 @@ CONFIG_SND_SOC_ROCKCHIP_RT5645=m CONFIG_SND_SOC_RK3399_GRU_SOUND=m CONFIG_SND_SOC_SAMSUNG=y CONFIG_SND_SOC_RCAR=m +CONFIG_SND_SOC_MSIOF=m CONFIG_SND_SOC_RZ=m CONFIG_SND_SOC_SOF_TOPLEVEL=y CONFIG_SND_SOC_SOF_OF=y @@ -1474,29 +1475,6 @@ CONFIG_QCOM_WCNSS_CTRL=m CONFIG_QCOM_APR=m CONFIG_QCOM_ICC_BWMON=m CONFIG_QCOM_PBS=m -CONFIG_ARCH_R8A77995=y -CONFIG_ARCH_R8A77990=y -CONFIG_ARCH_R8A77951=y -CONFIG_ARCH_R8A77965=y -CONFIG_ARCH_R8A77960=y -CONFIG_ARCH_R8A77961=y -CONFIG_ARCH_R8A779F0=y -CONFIG_ARCH_R8A77980=y -CONFIG_ARCH_R8A77970=y -CONFIG_ARCH_R8A779A0=y -CONFIG_ARCH_R8A779G0=y -CONFIG_ARCH_R8A779H0=y -CONFIG_ARCH_R8A774C0=y -CONFIG_ARCH_R8A774E1=y -CONFIG_ARCH_R8A774A1=y -CONFIG_ARCH_R8A774B1=y -CONFIG_ARCH_R9A07G043=y -CONFIG_ARCH_R9A07G044=y -CONFIG_ARCH_R9A07G054=y -CONFIG_ARCH_R9A08G045=y -CONFIG_ARCH_R9A09G011=y -CONFIG_ARCH_R9A09G047=y -CONFIG_ARCH_R9A09G057=y CONFIG_ROCKCHIP_IODOMAIN=y CONFIG_ARCH_TEGRA_132_SOC=y CONFIG_ARCH_TEGRA_210_SOC=y @@ -1550,10 +1528,11 @@ CONFIG_PWM_IMX27=m CONFIG_PWM_MESON=m CONFIG_PWM_MTK_DISP=m CONFIG_PWM_MEDIATEK=m -CONFIG_PWM_RCAR=m +CONFIG_PWM_RENESAS_RCAR=m +CONFIG_PWM_RENESAS_RZG2L_GPT=m +CONFIG_PWM_RENESAS_RZ_MTU3=m CONFIG_PWM_RENESAS_TPU=m CONFIG_PWM_ROCKCHIP=y -CONFIG_PWM_RZ_MTU3=m CONFIG_PWM_SAMSUNG=y CONFIG_PWM_SL28CPLD=m CONFIG_PWM_SUN4I=m diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h index 81e4157f92b7..71493b760b83 100644 --- a/arch/arm64/include/asm/cpu.h +++ b/arch/arm64/include/asm/cpu.h @@ -44,6 +44,7 @@ struct cpuinfo_arm64 { u64 reg_dczid; u64 reg_midr; u64 reg_revidr; + u64 reg_aidr; u64 reg_gmid; u64 reg_smidr; u64 reg_mpamidr; diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index dffff6763812..661735616787 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -134,6 +134,7 @@ #define HISI_CPU_PART_TSV110 0xD01 #define 
HISI_CPU_PART_HIP09 0xD02 +#define HISI_CPU_PART_HIP12 0xD06 #define APPLE_CPU_PART_M1_ICESTORM 0x022 #define APPLE_CPU_PART_M1_FIRESTORM 0x023 @@ -222,6 +223,7 @@ #define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX) #define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110) #define MIDR_HISI_HIP09 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP09) +#define MIDR_HISI_HIP12 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP12) #define MIDR_APPLE_M1_ICESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM) #define MIDR_APPLE_M1_FIRESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM) #define MIDR_APPLE_M1_ICESTORM_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM_PRO) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index d40e427ddad9..f6d72ca03133 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -204,19 +204,21 @@ orr x0, x0, #(1 << 62) .Lskip_spe_fgt_\@: + +.Lset_debug_fgt_\@: msr_s SYS_HDFGRTR_EL2, x0 msr_s SYS_HDFGWTR_EL2, x0 mov x0, xzr mrs x1, id_aa64pfr1_el1 ubfx x1, x1, #ID_AA64PFR1_EL1_SME_SHIFT, #4 - cbz x1, .Lskip_debug_fgt_\@ + cbz x1, .Lskip_sme_fgt_\@ /* Disable nVHE traps of TPIDR2 and SMPRI */ orr x0, x0, #HFGxTR_EL2_nSMPRI_EL1_MASK orr x0, x0, #HFGxTR_EL2_nTPIDR2_EL0_MASK -.Lskip_debug_fgt_\@: +.Lskip_sme_fgt_\@: mrs_s x1, SYS_ID_AA64MMFR3_EL1 ubfx x1, x1, #ID_AA64MMFR3_EL1_S1PIE_SHIFT, #4 cbz x1, .Lskip_pie_fgt_\@ @@ -237,12 +239,14 @@ /* GCS depends on PIE so we don't check it if PIE is absent */ mrs_s x1, SYS_ID_AA64PFR1_EL1 ubfx x1, x1, #ID_AA64PFR1_EL1_GCS_SHIFT, #4 - cbz x1, .Lset_fgt_\@ + cbz x1, .Lskip_gce_fgt_\@ /* Disable traps of access to GCS registers at EL0 and EL1 */ orr x0, x0, #HFGxTR_EL2_nGCS_EL1_MASK orr x0, x0, #HFGxTR_EL2_nGCS_EL0_MASK +.Lskip_gce_fgt_\@: + .Lset_fgt_\@: msr_s SYS_HFGRTR_EL2, x0 msr_s SYS_HFGWTR_EL2, x0 diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index e4f77757937e..71f0cbf7b288 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -378,12 +378,14 @@ /* * ISS values for SME traps */ - -#define ESR_ELx_SME_ISS_SME_DISABLED 0 -#define ESR_ELx_SME_ISS_ILL 1 -#define ESR_ELx_SME_ISS_SM_DISABLED 2 -#define ESR_ELx_SME_ISS_ZA_DISABLED 3 -#define ESR_ELx_SME_ISS_ZT_DISABLED 4 +#define ESR_ELx_SME_ISS_SMTC_MASK GENMASK(2, 0) +#define ESR_ELx_SME_ISS_SMTC(esr) ((esr) & ESR_ELx_SME_ISS_SMTC_MASK) + +#define ESR_ELx_SME_ISS_SMTC_SME_DISABLED 0 +#define ESR_ELx_SME_ISS_SMTC_ILL 1 +#define ESR_ELx_SME_ISS_SMTC_SM_DISABLED 2 +#define ESR_ELx_SME_ISS_SMTC_ZA_DISABLED 3 +#define ESR_ELx_SME_ISS_SMTC_ZT_DISABLED 4 /* ISS field definitions for MOPS exceptions */ #define ESR_ELx_MOPS_ISS_MEM_INST (UL(1) << 24) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index 564bc09b3e06..b8cf0ea43cc0 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -6,6 +6,7 @@ #define __ASM_FP_H #include <asm/errno.h> +#include <asm/percpu.h> #include <asm/ptrace.h> #include <asm/processor.h> #include <asm/sigcontext.h> @@ -76,7 +77,6 @@ extern void fpsimd_load_state(struct user_fpsimd_state *state); extern void fpsimd_thread_switch(struct task_struct *next); extern void fpsimd_flush_thread(void); -extern void fpsimd_signal_preserve_current_state(void); extern void fpsimd_preserve_current_state(void); extern void fpsimd_restore_current_state(void); extern void 
fpsimd_update_current_state(struct user_fpsimd_state const *state); @@ -93,9 +93,12 @@ struct cpu_fp_state { enum fp_type to_save; }; +DECLARE_PER_CPU(struct cpu_fp_state, fpsimd_last_state); + extern void fpsimd_bind_state_to_cpu(struct cpu_fp_state *fp_state); extern void fpsimd_flush_task_state(struct task_struct *target); +extern void fpsimd_save_and_flush_current_state(void); extern void fpsimd_save_and_flush_cpu_state(void); static inline bool thread_sm_enabled(struct thread_struct *thread) @@ -108,6 +111,8 @@ static inline bool thread_za_enabled(struct thread_struct *thread) return system_supports_sme() && (thread->svcr & SVCR_ZA_MASK); } +extern void task_smstop_sm(struct task_struct *task); + /* Maximum VL that SVE/SME VL-agnostic software can transparently support */ #define VL_ARCH_MAX 0x100 @@ -195,10 +200,8 @@ struct vl_info { extern void sve_alloc(struct task_struct *task, bool flush); extern void fpsimd_release_task(struct task_struct *task); -extern void fpsimd_sync_to_sve(struct task_struct *task); -extern void fpsimd_force_sync_to_sve(struct task_struct *task); -extern void sve_sync_to_fpsimd(struct task_struct *task); -extern void sve_sync_from_fpsimd_zeropad(struct task_struct *task); +extern void fpsimd_sync_from_effective_state(struct task_struct *task); +extern void fpsimd_sync_to_effective_state_zeropad(struct task_struct *task); extern int vec_set_vector_length(struct task_struct *task, enum vec_type type, unsigned long vl, unsigned long flags); @@ -292,14 +295,29 @@ static inline bool sve_vq_available(unsigned int vq) return vq_available(ARM64_VEC_SVE, vq); } -size_t sve_state_size(struct task_struct const *task); +static inline size_t __sve_state_size(unsigned int sve_vl, unsigned int sme_vl) +{ + unsigned int vl = max(sve_vl, sme_vl); + return SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)); +} + +/* + * Return how many bytes of memory are required to store the full SVE + * state for task, given task's currently configured vector length. + */ +static inline size_t sve_state_size(struct task_struct const *task) +{ + unsigned int sve_vl = task_get_sve_vl(task); + unsigned int sme_vl = task_get_sme_vl(task); + return __sve_state_size(sve_vl, sme_vl); +} #else /* ! 
CONFIG_ARM64_SVE */ static inline void sve_alloc(struct task_struct *task, bool flush) { } static inline void fpsimd_release_task(struct task_struct *task) { } -static inline void sve_sync_to_fpsimd(struct task_struct *task) { } -static inline void sve_sync_from_fpsimd_zeropad(struct task_struct *task) { } +static inline void fpsimd_sync_from_effective_state(struct task_struct *task) { } +static inline void fpsimd_sync_to_effective_state_zeropad(struct task_struct *task) { } static inline int sve_max_virtualisable_vl(void) { @@ -333,6 +351,11 @@ static inline void vec_update_vq_map(enum vec_type t) { } static inline int vec_verify_vq_map(enum vec_type t) { return 0; } static inline void sve_setup(void) { } +static inline size_t __sve_state_size(unsigned int sve_vl, unsigned int sme_vl) +{ + return 0; +} + static inline size_t sve_state_size(struct task_struct const *task) { return 0; @@ -385,6 +408,16 @@ extern int sme_set_current_vl(unsigned long arg); extern int sme_get_current_vl(void); extern void sme_suspend_exit(void); +static inline size_t __sme_state_size(unsigned int sme_vl) +{ + size_t size = ZA_SIG_REGS_SIZE(sve_vq_from_vl(sme_vl)); + + if (system_supports_sme2()) + size += ZT_SIG_REG_SIZE; + + return size; +} + /* * Return how many bytes of memory are required to store the full SME * specific state for task, given task's currently configured vector @@ -392,15 +425,7 @@ extern void sme_suspend_exit(void); */ static inline size_t sme_state_size(struct task_struct const *task) { - unsigned int vl = task_get_sme_vl(task); - size_t size; - - size = ZA_SIG_REGS_SIZE(sve_vq_from_vl(vl)); - - if (system_supports_sme2()) - size += ZT_SIG_REG_SIZE; - - return size; + return __sme_state_size(task_get_sme_vl(task)); } #else @@ -421,6 +446,11 @@ static inline int sme_set_current_vl(unsigned long arg) { return -EINVAL; } static inline int sme_get_current_vl(void) { return -EINVAL; } static inline void sme_suspend_exit(void) { } +static inline size_t __sme_state_size(unsigned int sme_vl) +{ + return 0; +} + static inline size_t sme_state_size(struct task_struct const *task) { return 0; diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 07fbf5bf85a7..2a8155c4a882 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -69,29 +69,38 @@ extern void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, #include <asm-generic/hugetlb.h> -#define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE -static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, - unsigned long start, - unsigned long end) +static inline void __flush_hugetlb_tlb_range(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long stride, + bool last_level) { - unsigned long stride = huge_page_size(hstate_vma(vma)); - switch (stride) { #ifndef __PAGETABLE_PMD_FOLDED case PUD_SIZE: - __flush_tlb_range(vma, start, end, PUD_SIZE, false, 1); + __flush_tlb_range(vma, start, end, PUD_SIZE, last_level, 1); break; #endif case CONT_PMD_SIZE: case PMD_SIZE: - __flush_tlb_range(vma, start, end, PMD_SIZE, false, 2); + __flush_tlb_range(vma, start, end, PMD_SIZE, last_level, 2); break; case CONT_PTE_SIZE: - __flush_tlb_range(vma, start, end, PAGE_SIZE, false, 3); + __flush_tlb_range(vma, start, end, PAGE_SIZE, last_level, 3); break; default: - __flush_tlb_range(vma, start, end, PAGE_SIZE, false, TLBI_TTL_UNKNOWN); + __flush_tlb_range(vma, start, end, PAGE_SIZE, last_level, TLBI_TTL_UNKNOWN); } } +#define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE 
+static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, + unsigned long start, + unsigned long end) +{ + unsigned long stride = huge_page_size(hstate_vma(vma)); + + __flush_hugetlb_tlb_range(vma, start, end, stride, false); +} + #endif /* __ASM_HUGETLB_H */ diff --git a/arch/arm64/include/asm/mem_encrypt.h b/arch/arm64/include/asm/mem_encrypt.h index a2a1eeb36d4b..314b2b52025f 100644 --- a/arch/arm64/include/asm/mem_encrypt.h +++ b/arch/arm64/include/asm/mem_encrypt.h @@ -4,6 +4,8 @@ #include <asm/rsi.h> +struct device; + struct arm64_mem_crypt_ops { int (*encrypt)(unsigned long addr, int numpages); int (*decrypt)(unsigned long addr, int numpages); diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d3b538be1500..5285757ee0c1 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -40,6 +40,85 @@ #include <linux/sched.h> #include <linux/page_table_check.h> +static inline void emit_pte_barriers(void) +{ + /* + * These barriers are emitted under certain conditions after a pte entry + * was modified (see e.g. __set_pte_complete()). The dsb makes the store + * visible to the table walker. The isb ensures that any previous + * speculative "invalid translation" marker that is in the CPU's + * pipeline gets cleared, so that any access to that address after + * setting the pte to valid won't cause a spurious fault. If the thread + * gets preempted after storing to the pgtable but before emitting these + * barriers, __switch_to() emits a dsb which ensure the walker gets to + * see the store. There is no guarantee of an isb being issued though. + * This is safe because it will still get issued (albeit on a + * potentially different CPU) when the thread starts running again, + * before any access to the address. + */ + dsb(ishst); + isb(); +} + +static inline void queue_pte_barriers(void) +{ + unsigned long flags; + + if (in_interrupt()) { + emit_pte_barriers(); + return; + } + + flags = read_thread_flags(); + + if (flags & BIT(TIF_LAZY_MMU)) { + /* Avoid the atomic op if already set. */ + if (!(flags & BIT(TIF_LAZY_MMU_PENDING))) + set_thread_flag(TIF_LAZY_MMU_PENDING); + } else { + emit_pte_barriers(); + } +} + +#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE +static inline void arch_enter_lazy_mmu_mode(void) +{ + /* + * lazy_mmu_mode is not supposed to permit nesting. But in practice this + * does happen with CONFIG_DEBUG_PAGEALLOC, where a page allocation + * inside a lazy_mmu_mode section (such as zap_pte_range()) will change + * permissions on the linear map with apply_to_page_range(), which + * re-enters lazy_mmu_mode. So we tolerate nesting in our + * implementation. The first call to arch_leave_lazy_mmu_mode() will + * flush and clear the flag such that the remainder of the work in the + * outer nest behaves as if outside of lazy mmu mode. This is safe and + * keeps tracking simple. 
+ */ + + if (in_interrupt()) + return; + + set_thread_flag(TIF_LAZY_MMU); +} + +static inline void arch_flush_lazy_mmu_mode(void) +{ + if (in_interrupt()) + return; + + if (test_and_clear_thread_flag(TIF_LAZY_MMU_PENDING)) + emit_pte_barriers(); +} + +static inline void arch_leave_lazy_mmu_mode(void) +{ + if (in_interrupt()) + return; + + arch_flush_lazy_mmu_mode(); + clear_thread_flag(TIF_LAZY_MMU); +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE @@ -320,18 +399,20 @@ static inline void __set_pte_nosync(pte_t *ptep, pte_t pte) WRITE_ONCE(*ptep, pte); } -static inline void __set_pte(pte_t *ptep, pte_t pte) +static inline void __set_pte_complete(pte_t pte) { - __set_pte_nosync(ptep, pte); - /* * Only if the new pte is valid and kernel, otherwise TLB maintenance - * or update_mmu_cache() have the necessary barriers. + * has the necessary barriers. */ - if (pte_valid_not_user(pte)) { - dsb(ishst); - isb(); - } + if (pte_valid_not_user(pte)) + queue_pte_barriers(); +} + +static inline void __set_pte(pte_t *ptep, pte_t pte) +{ + __set_pte_nosync(ptep, pte); + __set_pte_complete(pte); } static inline pte_t __ptep_get(pte_t *ptep) @@ -423,23 +504,6 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) return pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte)); } -static inline void __set_ptes(struct mm_struct *mm, - unsigned long __always_unused addr, - pte_t *ptep, pte_t pte, unsigned int nr) -{ - page_table_check_ptes_set(mm, ptep, pte, nr); - __sync_cache_and_tags(pte, nr); - - for (;;) { - __check_safe_pte_update(mm, ptep, pte); - __set_pte(ptep, pte); - if (--nr == 0) - break; - ptep++; - pte = pte_advance_pfn(pte, 1); - } -} - /* * Hugetlb definitions. */ @@ -649,30 +713,64 @@ static inline pgprot_t pud_pgprot(pud_t pud) return __pgprot(pud_val(pfn_pud(pfn, __pgprot(0))) ^ pud_val(pud)); } -static inline void __set_pte_at(struct mm_struct *mm, - unsigned long __always_unused addr, - pte_t *ptep, pte_t pte, unsigned int nr) +static inline void __set_ptes_anysz(struct mm_struct *mm, pte_t *ptep, + pte_t pte, unsigned int nr, + unsigned long pgsize) { - __sync_cache_and_tags(pte, nr); - __check_safe_pte_update(mm, ptep, pte); - __set_pte(ptep, pte); + unsigned long stride = pgsize >> PAGE_SHIFT; + + switch (pgsize) { + case PAGE_SIZE: + page_table_check_ptes_set(mm, ptep, pte, nr); + break; + case PMD_SIZE: + page_table_check_pmds_set(mm, (pmd_t *)ptep, pte_pmd(pte), nr); + break; +#ifndef __PAGETABLE_PMD_FOLDED + case PUD_SIZE: + page_table_check_puds_set(mm, (pud_t *)ptep, pte_pud(pte), nr); + break; +#endif + default: + VM_WARN_ON(1); + } + + __sync_cache_and_tags(pte, nr * stride); + + for (;;) { + __check_safe_pte_update(mm, ptep, pte); + __set_pte_nosync(ptep, pte); + if (--nr == 0) + break; + ptep++; + pte = pte_advance_pfn(pte, stride); + } + + __set_pte_complete(pte); } -static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd) +static inline void __set_ptes(struct mm_struct *mm, + unsigned long __always_unused addr, + pte_t *ptep, pte_t pte, unsigned int nr) { - page_table_check_pmd_set(mm, pmdp, pmd); - return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd), - PMD_SIZE >> PAGE_SHIFT); + __set_ptes_anysz(mm, ptep, pte, nr, PAGE_SIZE); } -static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, - pud_t *pudp, pud_t pud) +static inline void __set_pmds(struct mm_struct *mm, + unsigned long __always_unused addr, + pmd_t *pmdp, pmd_t pmd, unsigned int nr) +{ + __set_ptes_anysz(mm, (pte_t 
*)pmdp, pmd_pte(pmd), nr, PMD_SIZE); +} +#define set_pmd_at(mm, addr, pmdp, pmd) __set_pmds(mm, addr, pmdp, pmd, 1) + +static inline void __set_puds(struct mm_struct *mm, + unsigned long __always_unused addr, + pud_t *pudp, pud_t pud, unsigned int nr) { - page_table_check_pud_set(mm, pudp, pud); - return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud), - PUD_SIZE >> PAGE_SHIFT); + __set_ptes_anysz(mm, (pte_t *)pudp, pud_pte(pud), nr, PUD_SIZE); } +#define set_pud_at(mm, addr, pudp, pud) __set_puds(mm, addr, pudp, pud, 1) #define __p4d_to_phys(p4d) __pte_to_phys(p4d_pte(p4d)) #define __phys_to_p4d_val(phys) __phys_to_pte_val(phys) @@ -739,8 +837,7 @@ static inline int pmd_trans_huge(pmd_t pmd) * If pmd is present-invalid, pmd_table() won't detect it * as a table, so force the valid bit for the comparison. */ - return pmd_val(pmd) && pmd_present(pmd) && - !pmd_table(__pmd(pmd_val(pmd) | PTE_VALID)); + return pmd_present(pmd) && !pmd_table(__pmd(pmd_val(pmd) | PTE_VALID)); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -754,8 +851,6 @@ static inline bool pud_table(pud_t pud) { return true; } PUD_TYPE_TABLE) #endif -extern pgd_t init_pg_dir[]; -extern pgd_t init_pg_end[]; extern pgd_t swapper_pg_dir[]; extern pgd_t idmap_pg_dir[]; extern pgd_t tramp_pg_dir[]; @@ -780,10 +875,8 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) WRITE_ONCE(*pmdp, pmd); - if (pmd_valid(pmd)) { - dsb(ishst); - isb(); - } + if (pmd_valid(pmd)) + queue_pte_barriers(); } static inline void pmd_clear(pmd_t *pmdp) @@ -848,10 +941,8 @@ static inline void set_pud(pud_t *pudp, pud_t pud) WRITE_ONCE(*pudp, pud); - if (pud_valid(pud)) { - dsb(ishst); - isb(); - } + if (pud_valid(pud)) + queue_pte_barriers(); } static inline void pud_clear(pud_t *pudp) @@ -930,8 +1021,7 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) } WRITE_ONCE(*p4dp, p4d); - dsb(ishst); - isb(); + queue_pte_barriers(); } static inline void p4d_clear(p4d_t *p4dp) @@ -1059,8 +1149,7 @@ static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) } WRITE_ONCE(*pgdp, pgd); - dsb(ishst); - isb(); + queue_pte_barriers(); } static inline void pgd_clear(pgd_t *pgdp) @@ -1301,16 +1390,37 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */ -static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, - unsigned long address, pte_t *ptep) +static inline pte_t __ptep_get_and_clear_anysz(struct mm_struct *mm, + pte_t *ptep, + unsigned long pgsize) { pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0)); - page_table_check_pte_clear(mm, pte); + switch (pgsize) { + case PAGE_SIZE: + page_table_check_pte_clear(mm, pte); + break; + case PMD_SIZE: + page_table_check_pmd_clear(mm, pte_pmd(pte)); + break; +#ifndef __PAGETABLE_PMD_FOLDED + case PUD_SIZE: + page_table_check_pud_clear(mm, pte_pud(pte)); + break; +#endif + default: + VM_WARN_ON(1); + } return pte; } +static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, + unsigned long address, pte_t *ptep) +{ + return __ptep_get_and_clear_anysz(mm, ptep, PAGE_SIZE); +} + static inline void __clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) { @@ -1347,11 +1457,7 @@ static inline pte_t __get_and_clear_full_ptes(struct mm_struct *mm, static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { - pmd_t pmd = __pmd(xchg_relaxed(&pmd_val(*pmdp), 0)); - - page_table_check_pmd_clear(mm, pmd); - - return pmd; + return 
pte_pmd(__ptep_get_and_clear_anysz(mm, (pte_t *)pmdp, PMD_SIZE)); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/arm64/include/asm/rsi_cmds.h b/arch/arm64/include/asm/rsi_cmds.h index e6a211001bd3..2c8763876dfb 100644 --- a/arch/arm64/include/asm/rsi_cmds.h +++ b/arch/arm64/include/asm/rsi_cmds.h @@ -7,6 +7,8 @@ #define __ASM_RSI_CMDS_H #include <linux/arm-smccc.h> +#include <linux/string.h> +#include <asm/memory.h> #include <asm/rsi_smc.h> diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 1114c1c3300a..1269c2487574 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -59,11 +59,12 @@ void arch_setup_new_exec(void); #define TIF_SIGPENDING 0 /* signal pending */ #define TIF_NEED_RESCHED 1 /* rescheduling necessary */ -#define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ -#define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ -#define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ -#define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ -#define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */ +#define TIF_NEED_RESCHED_LAZY 2 /* Lazy rescheduling needed */ +#define TIF_NOTIFY_RESUME 3 /* callback before returning to user */ +#define TIF_FOREIGN_FPSTATE 4 /* CPU's FP state is not current's */ +#define TIF_UPROBE 5 /* uprobe breakpoint or singlestep */ +#define TIF_MTE_ASYNC_FAULT 6 /* MTE Asynchronous Tag Check Fault */ +#define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ @@ -82,9 +83,12 @@ void arch_setup_new_exec(void); #define TIF_SME_VL_INHERIT 28 /* Inherit SME vl_onexec across exec */ #define TIF_KERNEL_FPSTATE 29 /* Task is in a kernel mode FPSIMD section */ #define TIF_TSC_SIGSEGV 30 /* SIGSEGV on counter-timer access */ +#define TIF_LAZY_MMU 31 /* Task in lazy mmu mode */ +#define TIF_LAZY_MMU_PENDING 32 /* Ops pending for lazy mmu mode exit */ #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) @@ -100,10 +104,10 @@ void arch_setup_new_exec(void); #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_TSC_SIGSEGV (1 << TIF_TSC_SIGSEGV) -#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ +#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ - _TIF_NOTIFY_SIGNAL) + _TIF_NOTIFY_SIGNAL | _TIF_SIGPENDING) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h index 3322c7047d84..da1ab8759592 100644 --- a/arch/arm64/include/asm/vdso/gettimeofday.h +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -8,6 +8,7 @@ #ifndef __ASSEMBLY__ #include <asm/alternative.h> +#include <asm/arch_timer.h> #include <asm/barrier.h> #include <asm/unistd.h> #include <asm/sysreg.h> @@ -69,8 +70,6 @@ int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, 
const struct vdso_time_data *vd) { - u64 res; - /* * Core checks for mode already, so this raced against a concurrent * update. Return something. Core will do another round and then @@ -79,24 +78,7 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, if (clock_mode == VDSO_CLOCKMODE_NONE) return 0; - /* - * If FEAT_ECV is available, use the self-synchronizing counter. - * Otherwise the isb is required to prevent that the counter value - * is speculated. - */ - asm volatile( - ALTERNATIVE("isb\n" - "mrs %0, cntvct_el0", - "nop\n" - __mrs_s("%0", SYS_CNTVCTSS_EL0), - ARM64_HAS_ECV) - : "=r" (res) - : - : "memory"); - - arch_counter_enforce_ordering(res); - - return res; + return __arch_counter_get_cntvct(); } #if IS_ENABLED(CONFIG_CC_IS_GCC) && IS_ENABLED(CONFIG_PAGE_SIZE_64KB) diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index ebf4a9f943ed..aa280f356b96 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -67,7 +67,8 @@ * __boot_cpu_mode records what mode CPUs were booted in. * A correctly-implemented bootloader must start all CPUs in the same mode: * In this case, both 32bit halves of __boot_cpu_mode will contain the - * same value (either 0 if booted in EL1, BOOT_CPU_MODE_EL2 if booted in EL2). + * same value (either BOOT_CPU_MODE_EL1 if booted in EL1, BOOT_CPU_MODE_EL2 if + * booted in EL2). * * Should the bootloader fail to do this, the two values will be different. * This allows the kernel to flag an error when the secondaries have come up. diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h index 38fafffe699f..12f534e8f3ed 100644 --- a/arch/arm64/include/asm/vmalloc.h +++ b/arch/arm64/include/asm/vmalloc.h @@ -23,6 +23,51 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot) return !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS); } +#define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size +static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, + unsigned long end, u64 pfn, + unsigned int max_page_shift) +{ + /* + * If the block is at least CONT_PTE_SIZE in size, and is naturally + * aligned in both virtual and physical space, then we can pte-map the + * block using the PTE_CONT bit for more efficient use of the TLB. + */ + if (max_page_shift < CONT_PTE_SHIFT) + return PAGE_SIZE; + + if (end - addr < CONT_PTE_SIZE) + return PAGE_SIZE; + + if (!IS_ALIGNED(addr, CONT_PTE_SIZE)) + return PAGE_SIZE; + + if (!IS_ALIGNED(PFN_PHYS(pfn), CONT_PTE_SIZE)) + return PAGE_SIZE; + + return CONT_PTE_SIZE; +} + +#define arch_vmap_pte_range_unmap_size arch_vmap_pte_range_unmap_size +static inline unsigned long arch_vmap_pte_range_unmap_size(unsigned long addr, + pte_t *ptep) +{ + /* + * The caller handles alignment so it's sufficient just to check + * PTE_CONT. + */ + return pte_valid_cont(__ptep_get(ptep)) ? 
CONT_PTE_SIZE : PAGE_SIZE; +} + +#define arch_vmap_pte_supported_shift arch_vmap_pte_supported_shift +static inline int arch_vmap_pte_supported_shift(unsigned long size) +{ + if (size >= CONT_PTE_SIZE) + return CONT_PTE_SHIFT; + + return PAGE_SHIFT; +} + #endif #define arch_vmap_pgprot_tagged arch_vmap_pgprot_tagged diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index eb1a840e4110..30d4bbe68661 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -182,5 +182,7 @@ int main(void) #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS DEFINE(FTRACE_OPS_DIRECT_CALL, offsetof(struct ftrace_ops, direct_call)); #endif + DEFINE(PIE_E0_ASM, PIE_E0); + DEFINE(PIE_E1_ASM, PIE_E1); return 0; } diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 4c46d80aa64b..379c82d22c75 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -765,17 +765,17 @@ static const struct arm64_ftr_bits ftr_raz[] = { #define ARM64_FTR_REG(id, table) \ __ARM64_FTR_REG_OVERRIDE(#id, id, table, &no_override) -struct arm64_ftr_override id_aa64mmfr0_override; -struct arm64_ftr_override id_aa64mmfr1_override; -struct arm64_ftr_override id_aa64mmfr2_override; -struct arm64_ftr_override id_aa64pfr0_override; -struct arm64_ftr_override id_aa64pfr1_override; -struct arm64_ftr_override id_aa64zfr0_override; -struct arm64_ftr_override id_aa64smfr0_override; -struct arm64_ftr_override id_aa64isar1_override; -struct arm64_ftr_override id_aa64isar2_override; - -struct arm64_ftr_override arm64_sw_feature_override; +struct arm64_ftr_override __read_mostly id_aa64mmfr0_override; +struct arm64_ftr_override __read_mostly id_aa64mmfr1_override; +struct arm64_ftr_override __read_mostly id_aa64mmfr2_override; +struct arm64_ftr_override __read_mostly id_aa64pfr0_override; +struct arm64_ftr_override __read_mostly id_aa64pfr1_override; +struct arm64_ftr_override __read_mostly id_aa64zfr0_override; +struct arm64_ftr_override __read_mostly id_aa64smfr0_override; +struct arm64_ftr_override __read_mostly id_aa64isar1_override; +struct arm64_ftr_override __read_mostly id_aa64isar2_override; + +struct arm64_ftr_override __read_mostly arm64_sw_feature_override; static const struct __ftr_reg_entry { u32 sys_id; @@ -1410,6 +1410,8 @@ void update_cpu_features(int cpu, info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2); taint |= check_update_ftr_reg(SYS_ID_AA64MMFR3_EL1, cpu, info->reg_id_aa64mmfr3, boot->reg_id_aa64mmfr3); + taint |= check_update_ftr_reg(SYS_ID_AA64MMFR4_EL1, cpu, + info->reg_id_aa64mmfr4, boot->reg_id_aa64mmfr4); taint |= check_update_ftr_reg(SYS_ID_AA64PFR0_EL1, cpu, info->reg_id_aa64pfr0, boot->reg_id_aa64pfr0); diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 285d7d538342..94525abd1c22 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -209,80 +209,79 @@ static const char *const compat_hwcap2_str[] = { static int c_show(struct seq_file *m, void *v) { - int i, j; + int j; + int cpu = m->index; bool compat = personality(current->personality) == PER_LINUX32; + struct cpuinfo_arm64 *cpuinfo = v; + u32 midr = cpuinfo->reg_midr; - for_each_online_cpu(i) { - struct cpuinfo_arm64 *cpuinfo = &per_cpu(cpu_data, i); - u32 midr = cpuinfo->reg_midr; - - /* - * glibc reads /proc/cpuinfo to determine the number of - * online processors, looking for lines beginning with - * "processor". Give glibc what it expects. 
- */ - seq_printf(m, "processor\t: %d\n", i); - if (compat) - seq_printf(m, "model name\t: ARMv8 Processor rev %d (%s)\n", - MIDR_REVISION(midr), COMPAT_ELF_PLATFORM); + /* + * glibc reads /proc/cpuinfo to determine the number of + * online processors, looking for lines beginning with + * "processor". Give glibc what it expects. + */ + seq_printf(m, "processor\t: %d\n", cpu); + if (compat) + seq_printf(m, "model name\t: ARMv8 Processor rev %d (%s)\n", + MIDR_REVISION(midr), COMPAT_ELF_PLATFORM); - seq_printf(m, "BogoMIPS\t: %lu.%02lu\n", - loops_per_jiffy / (500000UL/HZ), - loops_per_jiffy / (5000UL/HZ) % 100); + seq_printf(m, "BogoMIPS\t: %lu.%02lu\n", + loops_per_jiffy / (500000UL/HZ), + loops_per_jiffy / (5000UL/HZ) % 100); - /* - * Dump out the common processor features in a single line. - * Userspace should read the hwcaps with getauxval(AT_HWCAP) - * rather than attempting to parse this, but there's a body of - * software which does already (at least for 32-bit). - */ - seq_puts(m, "Features\t:"); - if (compat) { + /* + * Dump out the common processor features in a single line. + * Userspace should read the hwcaps with getauxval(AT_HWCAP) + * rather than attempting to parse this, but there's a body of + * software which does already (at least for 32-bit). + */ + seq_puts(m, "Features\t:"); + if (compat) { #ifdef CONFIG_COMPAT - for (j = 0; j < ARRAY_SIZE(compat_hwcap_str); j++) { - if (compat_elf_hwcap & (1 << j)) { - /* - * Warn once if any feature should not - * have been present on arm64 platform. - */ - if (WARN_ON_ONCE(!compat_hwcap_str[j])) - continue; - - seq_printf(m, " %s", compat_hwcap_str[j]); - } + for (j = 0; j < ARRAY_SIZE(compat_hwcap_str); j++) { + if (compat_elf_hwcap & (1 << j)) { + /* + * Warn once if any feature should not + * have been present on arm64 platform. + */ + if (WARN_ON_ONCE(!compat_hwcap_str[j])) + continue; + + seq_printf(m, " %s", compat_hwcap_str[j]); } + } - for (j = 0; j < ARRAY_SIZE(compat_hwcap2_str); j++) - if (compat_elf_hwcap2 & (1 << j)) - seq_printf(m, " %s", compat_hwcap2_str[j]); + for (j = 0; j < ARRAY_SIZE(compat_hwcap2_str); j++) + if (compat_elf_hwcap2 & (1 << j)) + seq_printf(m, " %s", compat_hwcap2_str[j]); #endif /* CONFIG_COMPAT */ - } else { - for (j = 0; j < ARRAY_SIZE(hwcap_str); j++) - if (cpu_have_feature(j)) - seq_printf(m, " %s", hwcap_str[j]); - } - seq_puts(m, "\n"); - - seq_printf(m, "CPU implementer\t: 0x%02x\n", - MIDR_IMPLEMENTOR(midr)); - seq_printf(m, "CPU architecture: 8\n"); - seq_printf(m, "CPU variant\t: 0x%x\n", MIDR_VARIANT(midr)); - seq_printf(m, "CPU part\t: 0x%03x\n", MIDR_PARTNUM(midr)); - seq_printf(m, "CPU revision\t: %d\n\n", MIDR_REVISION(midr)); + } else { + for (j = 0; j < ARRAY_SIZE(hwcap_str); j++) + if (cpu_have_feature(j)) + seq_printf(m, " %s", hwcap_str[j]); } + seq_puts(m, "\n"); + + seq_printf(m, "CPU implementer\t: 0x%02x\n", + MIDR_IMPLEMENTOR(midr)); + seq_puts(m, "CPU architecture: 8\n"); + seq_printf(m, "CPU variant\t: 0x%x\n", MIDR_VARIANT(midr)); + seq_printf(m, "CPU part\t: 0x%03x\n", MIDR_PARTNUM(midr)); + seq_printf(m, "CPU revision\t: %d\n\n", MIDR_REVISION(midr)); return 0; } static void *c_start(struct seq_file *m, loff_t *pos) { - return *pos < 1 ? (void *)1 : NULL; + *pos = cpumask_next(*pos - 1, cpu_online_mask); + return *pos < nr_cpu_ids ? 
&per_cpu(cpu_data, *pos) : NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { ++*pos; - return NULL; + return c_start(m, pos); } static void c_stop(struct seq_file *m, void *v) @@ -328,11 +327,13 @@ static const struct kobj_type cpuregs_kobj_type = { CPUREGS_ATTR_RO(midr_el1, midr); CPUREGS_ATTR_RO(revidr_el1, revidr); +CPUREGS_ATTR_RO(aidr_el1, aidr); CPUREGS_ATTR_RO(smidr_el1, smidr); static struct attribute *cpuregs_id_attrs[] = { &cpuregs_attr_midr_el1.attr, &cpuregs_attr_revidr_el1.attr, + &cpuregs_attr_aidr_el1.attr, NULL }; @@ -469,6 +470,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) info->reg_dczid = read_cpuid(DCZID_EL0); info->reg_midr = read_cpuid_id(); info->reg_revidr = read_cpuid(REVIDR_EL1); + info->reg_aidr = read_cpuid(AIDR_EL1); info->reg_id_aa64dfr0 = read_cpuid(ID_AA64DFR0_EL1); info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1); diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 1d25d8899dbf..250e9d7c08a7 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -169,14 +169,14 @@ static DEFINE_RAW_SPINLOCK(efi_rt_lock); void arch_efi_call_virt_setup(void) { efi_virtmap_load(); - __efi_fpsimd_begin(); raw_spin_lock(&efi_rt_lock); + __efi_fpsimd_begin(); } void arch_efi_call_virt_teardown(void) { - raw_spin_unlock(&efi_rt_lock); __efi_fpsimd_end(); + raw_spin_unlock(&efi_rt_lock); efi_virtmap_unload(); } diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index b260ddc4d3e9..7c1970b341b8 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -132,7 +132,7 @@ static void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) do { local_irq_enable(); - if (thread_flags & _TIF_NEED_RESCHED) + if (thread_flags & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) schedule(); if (thread_flags & _TIF_UPROBE) @@ -393,20 +393,16 @@ static bool cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs) * As per the ABI exit SME streaming mode and clear the SVE state not * shared with FPSIMD on syscall entry. */ -static inline void fp_user_discard(void) +static inline void fpsimd_syscall_enter(void) { - /* - * If SME is active then exit streaming mode. If ZA is active - * then flush the SVE registers but leave userspace access to - * both SVE and SME enabled, otherwise disable SME for the - * task and fall through to disabling SVE too. This means - * that after a syscall we never have any streaming mode - * register state to track, if this changes the KVM code will - * need updating. - */ + /* Ensure PSTATE.SM is clear, but leave PSTATE.ZA as-is. */ if (system_supports_sme()) sme_smstop_sm(); + /* + * The CPU is not in streaming mode. If non-streaming SVE is not + * supported, there is no SVE state that needs to be discarded. + */ if (!system_supports_sve()) return; @@ -416,6 +412,33 @@ static inline void fp_user_discard(void) sve_vq_minus_one = sve_vq_from_vl(task_get_sve_vl(current)) - 1; sve_flush_live(true, sve_vq_minus_one); } + + /* + * Any live non-FPSIMD SVE state has been zeroed. Allow + * fpsimd_save_user_state() to lazily discard SVE state until either + * the live state is unbound or fpsimd_syscall_exit() is called. + */ + __this_cpu_write(fpsimd_last_state.to_save, FP_STATE_FPSIMD); +} + +static __always_inline void fpsimd_syscall_exit(void) +{ + if (!system_supports_sve()) + return; + + /* + * The current task's user FPSIMD/SVE/SME state is now bound to this + * CPU. 
The fpsimd_last_state.to_save value is either: + * + * - FP_STATE_FPSIMD, if the state has not been reloaded on this CPU + * since fpsimd_syscall_enter(). + * + * - FP_STATE_CURRENT, if the state has been reloaded on this CPU at + * any point. + * + * Reset this to FP_STATE_CURRENT to stop lazy discarding. + */ + __this_cpu_write(fpsimd_last_state.to_save, FP_STATE_CURRENT); } UNHANDLED(el1t, 64, sync) @@ -739,10 +762,11 @@ static void noinstr el0_svc(struct pt_regs *regs) { enter_from_user_mode(regs); cortex_a76_erratum_1463225_svc_handler(); - fp_user_discard(); + fpsimd_syscall_enter(); local_daif_restore(DAIF_PROCCTX); do_el0_svc(regs); exit_to_user_mode(regs); + fpsimd_syscall_exit(); } static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 8370d55f0353..c37f02d7194e 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -119,7 +119,7 @@ * whatever is in the FPSIMD registers is not saved to memory, but discarded. */ -static DEFINE_PER_CPU(struct cpu_fp_state, fpsimd_last_state); +DEFINE_PER_CPU(struct cpu_fp_state, fpsimd_last_state); __ro_after_init struct vl_info vl_info[ARM64_VEC_MAX] = { #ifdef CONFIG_ARM64_SVE @@ -180,12 +180,12 @@ static inline void set_sve_default_vl(int val) set_default_vl(ARM64_VEC_SVE, val); } -static void __percpu *efi_sve_state; +static u8 *efi_sve_state; #else /* ! CONFIG_ARM64_SVE */ /* Dummy declaration for code that will be optimised out: */ -extern void __percpu *efi_sve_state; +extern u8 *efi_sve_state; #endif /* ! CONFIG_ARM64_SVE */ @@ -359,20 +359,15 @@ static void task_fpsimd_load(void) WARN_ON(preemptible()); WARN_ON(test_thread_flag(TIF_KERNEL_FPSTATE)); - if (system_supports_fpmr()) - write_sysreg_s(current->thread.uw.fpmr, SYS_FPMR); - if (system_supports_sve() || system_supports_sme()) { switch (current->thread.fp_type) { case FP_STATE_FPSIMD: /* Stop tracking SVE for this task until next use. */ - if (test_and_clear_thread_flag(TIF_SVE)) - sve_user_disable(); + clear_thread_flag(TIF_SVE); break; case FP_STATE_SVE: - if (!thread_sm_enabled(&current->thread) && - !WARN_ON_ONCE(!test_and_set_thread_flag(TIF_SVE))) - sve_user_enable(); + if (!thread_sm_enabled(&current->thread)) + WARN_ON_ONCE(!test_and_set_thread_flag(TIF_SVE)); if (test_thread_flag(TIF_SVE)) sve_set_vq(sve_vq_from_vl(task_get_sve_vl(current)) - 1); @@ -413,6 +408,9 @@ static void task_fpsimd_load(void) restore_ffr = system_supports_fa64(); } + if (system_supports_fpmr()) + write_sysreg_s(current->thread.uw.fpmr, SYS_FPMR); + if (restore_sve_regs) { WARN_ON_ONCE(current->thread.fp_type != FP_STATE_SVE); sve_load_state(sve_pffr(&current->thread), @@ -453,12 +451,15 @@ static void fpsimd_save_user_state(void) *(last->fpmr) = read_sysreg_s(SYS_FPMR); /* - * If a task is in a syscall the ABI allows us to only - * preserve the state shared with FPSIMD so don't bother - * saving the full SVE state in that case. + * Save SVE state if it is live. + * + * The syscall ABI discards live SVE state at syscall entry. When + * entering a syscall, fpsimd_syscall_enter() sets to_save to + * FP_STATE_FPSIMD to allow the SVE state to be lazily discarded until + * either new SVE state is loaded+bound or fpsimd_syscall_exit() is + * called prior to a return to userspace.
*/ - if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE) && - !in_syscall(current_pt_regs())) || + if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE)) || last->to_save == FP_STATE_SVE) { save_sve_regs = true; save_ffr = true; @@ -651,7 +652,7 @@ static void __fpsimd_to_sve(void *sst, struct user_fpsimd_state const *fst, * task->thread.uw.fpsimd_state must be up to date before calling this * function. */ -static void fpsimd_to_sve(struct task_struct *task) +static inline void fpsimd_to_sve(struct task_struct *task) { unsigned int vq; void *sst = task->thread.sve_state; @@ -675,7 +676,7 @@ static void fpsimd_to_sve(struct task_struct *task) * bytes of allocated kernel memory. * task->thread.sve_state must be up to date before calling this function. */ -static void sve_to_fpsimd(struct task_struct *task) +static inline void sve_to_fpsimd(struct task_struct *task) { unsigned int vq, vl; void const *sst = task->thread.sve_state; @@ -694,44 +695,39 @@ static void sve_to_fpsimd(struct task_struct *task) } } -void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__always_unused p) +static inline void __fpsimd_zero_vregs(struct user_fpsimd_state *fpsimd) { - write_sysreg_s(read_sysreg_s(SYS_SCTLR_EL1) | SCTLR_EL1_EnFPM_MASK, - SYS_SCTLR_EL1); + memset(&fpsimd->vregs, 0, sizeof(fpsimd->vregs)); } -#ifdef CONFIG_ARM64_SVE /* - * Call __sve_free() directly only if you know task can't be scheduled - * or preempted. + * Simulate the effects of an SMSTOP SM instruction. */ -static void __sve_free(struct task_struct *task) +void task_smstop_sm(struct task_struct *task) { - kfree(task->thread.sve_state); - task->thread.sve_state = NULL; -} + if (!thread_sm_enabled(&task->thread)) + return; -static void sve_free(struct task_struct *task) -{ - WARN_ON(test_tsk_thread_flag(task, TIF_SVE)); + __fpsimd_zero_vregs(&task->thread.uw.fpsimd_state); + task->thread.uw.fpsimd_state.fpsr = 0x0800009f; + if (system_supports_fpmr()) + task->thread.uw.fpmr = 0; - __sve_free(task); + task->thread.svcr &= ~SVCR_SM_MASK; + task->thread.fp_type = FP_STATE_FPSIMD; } -/* - * Return how many bytes of memory are required to store the full SVE - * state for task, given task's currently configured vector length. - */ -size_t sve_state_size(struct task_struct const *task) +void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__always_unused p) { - unsigned int vl = 0; - - if (system_supports_sve()) - vl = task_get_sve_vl(task); - if (system_supports_sme()) - vl = max(vl, task_get_sme_vl(task)); + write_sysreg_s(read_sysreg_s(SYS_SCTLR_EL1) | SCTLR_EL1_EnFPM_MASK, + SYS_SCTLR_EL1); +} - return SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)); +#ifdef CONFIG_ARM64_SVE +static void sve_free(struct task_struct *task) +{ + kfree(task->thread.sve_state); + task->thread.sve_state = NULL; } /* @@ -758,69 +754,34 @@ void sve_alloc(struct task_struct *task, bool flush) kzalloc(sve_state_size(task), GFP_KERNEL); } - -/* - * Force the FPSIMD state shared with SVE to be updated in the SVE state - * even if the SVE state is the current active state. - * - * This should only be called by ptrace. task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. - */ -void fpsimd_force_sync_to_sve(struct task_struct *task) -{ - fpsimd_to_sve(task); -} - -/* - * Ensure that task->thread.sve_state is up to date with respect to - * the user task, irrespective of when SVE is in use or not. - * - * This should only be called by ptrace. 
task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. - */ -void fpsimd_sync_to_sve(struct task_struct *task) -{ - if (!test_tsk_thread_flag(task, TIF_SVE) && - !thread_sm_enabled(&task->thread)) - fpsimd_to_sve(task); -} - /* - * Ensure that task->thread.uw.fpsimd_state is up to date with respect to - * the user task, irrespective of whether SVE is in use or not. + * Ensure that task->thread.uw.fpsimd_state is up to date with respect to the + * task's currently effective FPSIMD/SVE state. * - * This should only be called by ptrace. task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. + * The task's FPSIMD/SVE/SME state must not be subject to concurrent + * manipulation. */ -void sve_sync_to_fpsimd(struct task_struct *task) +void fpsimd_sync_from_effective_state(struct task_struct *task) { if (task->thread.fp_type == FP_STATE_SVE) sve_to_fpsimd(task); } /* - * Ensure that task->thread.sve_state is up to date with respect to - * the task->thread.uw.fpsimd_state. + * Ensure that the task's currently effective FPSIMD/SVE state is up to date + * with respect to task->thread.uw.fpsimd_state, zeroing any effective + * non-FPSIMD (S)SVE state. * - * This should only be called by ptrace to merge new FPSIMD register - * values into a task for which SVE is currently active. - * task must be non-runnable. - * task->thread.sve_state must point to at least sve_state_size(task) - * bytes of allocated kernel memory. - * task->thread.uw.fpsimd_state must already have been initialised with - * the new FPSIMD register values to be merged in. + * The task's FPSIMD/SVE/SME state must not be subject to concurrent + * manipulation. */ -void sve_sync_from_fpsimd_zeropad(struct task_struct *task) +void fpsimd_sync_to_effective_state_zeropad(struct task_struct *task) { unsigned int vq; void *sst = task->thread.sve_state; struct user_fpsimd_state const *fst = &task->thread.uw.fpsimd_state; - if (!test_tsk_thread_flag(task, TIF_SVE) && - !thread_sm_enabled(&task->thread)) + if (task->thread.fp_type != FP_STATE_SVE) return; vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread)); @@ -829,10 +790,73 @@ void sve_sync_from_fpsimd_zeropad(struct task_struct *task) __fpsimd_to_sve(sst, fst, vq); } +static int change_live_vector_length(struct task_struct *task, + enum vec_type type, + unsigned long vl) +{ + unsigned int sve_vl = task_get_sve_vl(task); + unsigned int sme_vl = task_get_sme_vl(task); + void *sve_state = NULL, *sme_state = NULL; + + if (type == ARM64_VEC_SME) + sme_vl = vl; + else + sve_vl = vl; + + /* + * Allocate the new sve_state and sme_state before freeing the old + * copies so that allocation failure can be handled without needing to + * mutate the task's state in any way. + * + * Changes to the SVE vector length must not discard live ZA state or + * clear PSTATE.ZA, as userspace code which is unaware of the AAPCS64 + * ZA lazy saving scheme may attempt to change the SVE vector length + * while unsaved/dormant ZA state exists. 
+ */ + sve_state = kzalloc(__sve_state_size(sve_vl, sme_vl), GFP_KERNEL); + if (!sve_state) + goto out_mem; + + if (type == ARM64_VEC_SME) { + sme_state = kzalloc(__sme_state_size(sme_vl), GFP_KERNEL); + if (!sme_state) + goto out_mem; + } + + if (task == current) + fpsimd_save_and_flush_current_state(); + else + fpsimd_flush_task_state(task); + + /* + * Always preserve PSTATE.SM and the effective FPSIMD state, zeroing + * other SVE state. + */ + fpsimd_sync_from_effective_state(task); + task_set_vl(task, type, vl); + kfree(task->thread.sve_state); + task->thread.sve_state = sve_state; + fpsimd_sync_to_effective_state_zeropad(task); + + if (type == ARM64_VEC_SME) { + task->thread.svcr &= ~SVCR_ZA_MASK; + kfree(task->thread.sme_state); + task->thread.sme_state = sme_state; + } + + return 0; + +out_mem: + kfree(sve_state); + kfree(sme_state); + return -ENOMEM; +} + int vec_set_vector_length(struct task_struct *task, enum vec_type type, unsigned long vl, unsigned long flags) { - bool free_sme = false; + bool onexec = flags & PR_SVE_SET_VL_ONEXEC; + bool inherit = flags & PR_SVE_VL_INHERIT; if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT | PR_SVE_SET_VL_ONEXEC)) @@ -852,71 +876,17 @@ int vec_set_vector_length(struct task_struct *task, enum vec_type type, vl = find_supported_vector_length(type, vl); - if (flags & (PR_SVE_VL_INHERIT | - PR_SVE_SET_VL_ONEXEC)) + if (!onexec && vl != task_get_vl(task, type)) { + if (change_live_vector_length(task, type, vl)) + return -ENOMEM; + } + + if (onexec || inherit) task_set_vl_onexec(task, type, vl); else /* Reset VL to system default on next exec: */ task_set_vl_onexec(task, type, 0); - /* Only actually set the VL if not deferred: */ - if (flags & PR_SVE_SET_VL_ONEXEC) - goto out; - - if (vl == task_get_vl(task, type)) - goto out; - - /* - * To ensure the FPSIMD bits of the SVE vector registers are preserved, - * write any live register state back to task_struct, and convert to a - * regular FPSIMD thread. - */ - if (task == current) { - get_cpu_fpsimd_context(); - - fpsimd_save_user_state(); - } - - fpsimd_flush_task_state(task); - if (test_and_clear_tsk_thread_flag(task, TIF_SVE) || - thread_sm_enabled(&task->thread)) { - sve_to_fpsimd(task); - task->thread.fp_type = FP_STATE_FPSIMD; - } - - if (system_supports_sme()) { - if (type == ARM64_VEC_SME || - !(task->thread.svcr & (SVCR_SM_MASK | SVCR_ZA_MASK))) { - /* - * We are changing the SME VL or weren't using - * SME anyway, discard the state and force a - * reallocation. - */ - task->thread.svcr &= ~(SVCR_SM_MASK | - SVCR_ZA_MASK); - clear_tsk_thread_flag(task, TIF_SME); - free_sme = true; - } - } - - if (task == current) - put_cpu_fpsimd_context(); - - task_set_vl(task, type, vl); - - /* - * Free the changed states if they are not in use, SME will be - * reallocated to the correct size on next use and we just - * allocate SVE now in case it is needed for use in streaming - * mode. 
- */ - sve_free(task); - sve_alloc(task, true); - - if (free_sme) - sme_free(task); - -out: update_tsk_thread_flag(task, vec_vl_inherit_flag(type), flags & PR_SVE_VL_INHERIT); @@ -1131,15 +1101,15 @@ static void __init sve_efi_setup(void) if (!sve_vl_valid(max_vl)) goto fail; - efi_sve_state = __alloc_percpu( - SVE_SIG_REGS_SIZE(sve_vq_from_vl(max_vl)), SVE_VQ_BYTES); + efi_sve_state = kmalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(max_vl)), + GFP_KERNEL); if (!efi_sve_state) goto fail; return; fail: - panic("Cannot allocate percpu memory for EFI SVE save/restore"); + panic("Cannot allocate memory for EFI SVE save/restore"); } void cpu_enable_sve(const struct arm64_cpu_capabilities *__always_unused p) @@ -1212,7 +1182,7 @@ void __init sve_setup(void) */ void fpsimd_release_task(struct task_struct *dead_task) { - __sve_free(dead_task); + sve_free(dead_task); sme_free(dead_task); } @@ -1436,7 +1406,7 @@ void do_sme_acc(unsigned long esr, struct pt_regs *regs) * If this not a trap due to SME being disabled then something * is being used in the wrong mode, report as SIGILL. */ - if (ESR_ELx_ISS(esr) != ESR_ELx_SME_ISS_SME_DISABLED) { + if (ESR_ELx_SME_ISS_SMTC(esr) != ESR_ELx_SME_ISS_SMTC_SME_DISABLED) { force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0); return; } @@ -1460,6 +1430,8 @@ void do_sme_acc(unsigned long esr, struct pt_regs *regs) sme_set_vq(vq_minus_one); fpsimd_bind_task_to_cpu(); + } else { + fpsimd_flush_task_state(current); } put_cpu_fpsimd_context(); @@ -1573,8 +1545,8 @@ void fpsimd_thread_switch(struct task_struct *next) fpsimd_save_user_state(); if (test_tsk_thread_flag(next, TIF_KERNEL_FPSTATE)) { - fpsimd_load_kernel_state(next); fpsimd_flush_cpu_state(); + fpsimd_load_kernel_state(next); } else { /* * Fix up TIF_FOREIGN_FPSTATE to correctly describe next's @@ -1661,6 +1633,9 @@ void fpsimd_flush_thread(void) current->thread.svcr = 0; } + if (system_supports_fpmr()) + current->thread.uw.fpmr = 0; + current->thread.fp_type = FP_STATE_FPSIMD; put_cpu_fpsimd_context(); @@ -1683,18 +1658,6 @@ void fpsimd_preserve_current_state(void) } /* - * Like fpsimd_preserve_current_state(), but ensure that - * current->thread.uw.fpsimd_state is updated so that it can be copied to - * the signal frame. - */ -void fpsimd_signal_preserve_current_state(void) -{ - fpsimd_preserve_current_state(); - if (current->thread.fp_type == FP_STATE_SVE) - sve_to_fpsimd(current); -} - -/* * Associate current's FPSIMD context with this cpu * The caller must have ownership of the cpu FPSIMD context before calling * this function. @@ -1786,30 +1749,14 @@ void fpsimd_restore_current_state(void) put_cpu_fpsimd_context(); } -/* - * Load an updated userland FPSIMD state for 'current' from memory and set the - * flag that indicates that the FPSIMD register contents are the most recent - * FPSIMD state of 'current'. This is used by the signal code to restore the - * register state when returning from a signal handler in FPSIMD only cases, - * any SVE context will be discarded. 
- */ void fpsimd_update_current_state(struct user_fpsimd_state const *state) { if (WARN_ON(!system_supports_fpsimd())) return; - get_cpu_fpsimd_context(); - current->thread.uw.fpsimd_state = *state; - if (test_thread_flag(TIF_SVE)) + if (current->thread.fp_type == FP_STATE_SVE) fpsimd_to_sve(current); - - task_fpsimd_load(); - fpsimd_bind_task_to_cpu(); - - clear_thread_flag(TIF_FOREIGN_FPSTATE); - - put_cpu_fpsimd_context(); } /* @@ -1839,6 +1786,17 @@ void fpsimd_flush_task_state(struct task_struct *t) barrier(); } +void fpsimd_save_and_flush_current_state(void) +{ + if (!system_supports_fpsimd()) + return; + + get_cpu_fpsimd_context(); + fpsimd_save_user_state(); + fpsimd_flush_task_state(current); + put_cpu_fpsimd_context(); +} + /* * Save the FPSIMD state to memory and invalidate cpu view. * This function must be called with preemption disabled. @@ -1948,10 +1906,10 @@ EXPORT_SYMBOL_GPL(kernel_neon_end); #ifdef CONFIG_EFI -static DEFINE_PER_CPU(struct user_fpsimd_state, efi_fpsimd_state); -static DEFINE_PER_CPU(bool, efi_fpsimd_state_used); -static DEFINE_PER_CPU(bool, efi_sve_state_used); -static DEFINE_PER_CPU(bool, efi_sm_state); +static struct user_fpsimd_state efi_fpsimd_state; +static bool efi_fpsimd_state_used; +static bool efi_sve_state_used; +static bool efi_sm_state; /* * EFI runtime services support functions @@ -1984,18 +1942,16 @@ void __efi_fpsimd_begin(void) * If !efi_sve_state, SVE can't be in use yet and doesn't need * preserving: */ - if (system_supports_sve() && likely(efi_sve_state)) { - char *sve_state = this_cpu_ptr(efi_sve_state); + if (system_supports_sve() && efi_sve_state != NULL) { bool ffr = true; u64 svcr; - __this_cpu_write(efi_sve_state_used, true); + efi_sve_state_used = true; if (system_supports_sme()) { svcr = read_sysreg_s(SYS_SVCR); - __this_cpu_write(efi_sm_state, - svcr & SVCR_SM_MASK); + efi_sm_state = svcr & SVCR_SM_MASK; /* * Unless we have FA64 FFR does not @@ -2005,19 +1961,18 @@ void __efi_fpsimd_begin(void) ffr = !(svcr & SVCR_SM_MASK); } - sve_save_state(sve_state + sve_ffr_offset(sve_max_vl()), - &this_cpu_ptr(&efi_fpsimd_state)->fpsr, - ffr); + sve_save_state(efi_sve_state + sve_ffr_offset(sve_max_vl()), + &efi_fpsimd_state.fpsr, ffr); if (system_supports_sme()) sysreg_clear_set_s(SYS_SVCR, SVCR_SM_MASK, 0); } else { - fpsimd_save_state(this_cpu_ptr(&efi_fpsimd_state)); + fpsimd_save_state(&efi_fpsimd_state); } - __this_cpu_write(efi_fpsimd_state_used, true); + efi_fpsimd_state_used = true; } } @@ -2029,12 +1984,10 @@ void __efi_fpsimd_end(void) if (!system_supports_fpsimd()) return; - if (!__this_cpu_xchg(efi_fpsimd_state_used, false)) { + if (!efi_fpsimd_state_used) { kernel_neon_end(); } else { - if (system_supports_sve() && - likely(__this_cpu_read(efi_sve_state_used))) { - char const *sve_state = this_cpu_ptr(efi_sve_state); + if (system_supports_sve() && efi_sve_state_used) { bool ffr = true; /* @@ -2043,7 +1996,7 @@ void __efi_fpsimd_end(void) * streaming mode. 
*/ if (system_supports_sme()) { - if (__this_cpu_read(efi_sm_state)) { + if (efi_sm_state) { sysreg_clear_set_s(SYS_SVCR, 0, SVCR_SM_MASK); @@ -2057,14 +2010,15 @@ void __efi_fpsimd_end(void) } } - sve_load_state(sve_state + sve_ffr_offset(sve_max_vl()), - &this_cpu_ptr(&efi_fpsimd_state)->fpsr, - ffr); + sve_load_state(efi_sve_state + sve_ffr_offset(sve_max_vl()), + &efi_fpsimd_state.fpsr, ffr); - __this_cpu_write(efi_sve_state_used, false); + efi_sve_state_used = false; } else { - fpsimd_load_state(this_cpu_ptr(&efi_fpsimd_state)); + fpsimd_load_state(&efi_fpsimd_state); } + + efi_fpsimd_state_used = false; } } diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 2ce73525de2c..ca04b338cb0d 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -89,7 +89,7 @@ SYM_CODE_START(primary_entry) adrp x1, early_init_stack mov sp, x1 mov x29, xzr - adrp x0, init_idmap_pg_dir + adrp x0, __pi_init_idmap_pg_dir mov x1, xzr bl __pi_create_init_idmap @@ -101,7 +101,7 @@ SYM_CODE_START(primary_entry) cbnz x19, 0f dmb sy mov x1, x0 // end of used region - adrp x0, init_idmap_pg_dir + adrp x0, __pi_init_idmap_pg_dir adr_l x2, dcache_inval_poc blr x2 b 1f @@ -507,7 +507,7 @@ SYM_FUNC_END(__no_granule_support) SYM_FUNC_START_LOCAL(__primary_switch) adrp x1, reserved_pg_dir - adrp x2, init_idmap_pg_dir + adrp x2, __pi_init_idmap_pg_dir bl __enable_mmu adrp x1, early_init_stack diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 2004b4f41ade..2bc390d94331 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -10,6 +10,12 @@ #error This file should only be included in vmlinux.lds.S #endif +#define PI_EXPORT_SYM(sym) \ + __PI_EXPORT_SYM(sym, __pi_ ## sym, Cannot export BSS symbol sym to startup code) +#define __PI_EXPORT_SYM(sym, pisym, msg)\ + PROVIDE(pisym = sym); \ + ASSERT((sym - KIMAGE_VADDR) < (__bss_start - KIMAGE_VADDR), #msg) + PROVIDE(__efistub_primary_entry = primary_entry); /* @@ -36,37 +42,30 @@ PROVIDE(__pi___memcpy = __pi_memcpy); PROVIDE(__pi___memmove = __pi_memmove); PROVIDE(__pi___memset = __pi_memset); -PROVIDE(__pi_id_aa64isar1_override = id_aa64isar1_override); -PROVIDE(__pi_id_aa64isar2_override = id_aa64isar2_override); -PROVIDE(__pi_id_aa64mmfr0_override = id_aa64mmfr0_override); -PROVIDE(__pi_id_aa64mmfr1_override = id_aa64mmfr1_override); -PROVIDE(__pi_id_aa64mmfr2_override = id_aa64mmfr2_override); -PROVIDE(__pi_id_aa64pfr0_override = id_aa64pfr0_override); -PROVIDE(__pi_id_aa64pfr1_override = id_aa64pfr1_override); -PROVIDE(__pi_id_aa64smfr0_override = id_aa64smfr0_override); -PROVIDE(__pi_id_aa64zfr0_override = id_aa64zfr0_override); -PROVIDE(__pi_arm64_sw_feature_override = arm64_sw_feature_override); -PROVIDE(__pi_arm64_use_ng_mappings = arm64_use_ng_mappings); -PROVIDE(__pi__ctype = _ctype); -PROVIDE(__pi_memstart_offset_seed = memstart_offset_seed); - -PROVIDE(__pi_init_idmap_pg_dir = init_idmap_pg_dir); -PROVIDE(__pi_init_idmap_pg_end = init_idmap_pg_end); -PROVIDE(__pi_init_pg_dir = init_pg_dir); -PROVIDE(__pi_init_pg_end = init_pg_end); -PROVIDE(__pi_swapper_pg_dir = swapper_pg_dir); - -PROVIDE(__pi__text = _text); -PROVIDE(__pi__stext = _stext); -PROVIDE(__pi__etext = _etext); -PROVIDE(__pi___start_rodata = __start_rodata); -PROVIDE(__pi___inittext_begin = __inittext_begin); -PROVIDE(__pi___inittext_end = __inittext_end); -PROVIDE(__pi___initdata_begin = __initdata_begin); -PROVIDE(__pi___initdata_end = __initdata_end); -PROVIDE(__pi__data = _data); -PROVIDE(__pi___bss_start = 
__bss_start); -PROVIDE(__pi__end = _end); +PI_EXPORT_SYM(id_aa64isar1_override); +PI_EXPORT_SYM(id_aa64isar2_override); +PI_EXPORT_SYM(id_aa64mmfr0_override); +PI_EXPORT_SYM(id_aa64mmfr1_override); +PI_EXPORT_SYM(id_aa64mmfr2_override); +PI_EXPORT_SYM(id_aa64pfr0_override); +PI_EXPORT_SYM(id_aa64pfr1_override); +PI_EXPORT_SYM(id_aa64smfr0_override); +PI_EXPORT_SYM(id_aa64zfr0_override); +PI_EXPORT_SYM(arm64_sw_feature_override); +PI_EXPORT_SYM(arm64_use_ng_mappings); +PI_EXPORT_SYM(_ctype); + +PI_EXPORT_SYM(swapper_pg_dir); + +PI_EXPORT_SYM(_text); +PI_EXPORT_SYM(_stext); +PI_EXPORT_SYM(_etext); +PI_EXPORT_SYM(__start_rodata); +PI_EXPORT_SYM(__inittext_begin); +PI_EXPORT_SYM(__inittext_end); +PI_EXPORT_SYM(__initdata_begin); +PI_EXPORT_SYM(__initdata_end); +PI_EXPORT_SYM(_data); #ifdef CONFIG_KVM diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 1da3e25f9d9e..c9503ed45a6c 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -10,8 +10,6 @@ #include <asm/cpufeature.h> #include <asm/memory.h> -u16 __initdata memstart_offset_seed; - bool __ro_after_init __kaslr_is_enabled = false; void __init kaslr_init(void) diff --git a/arch/arm64/kernel/pi/kaslr_early.c b/arch/arm64/kernel/pi/kaslr_early.c index 0257b43819db..e0e018046a46 100644 --- a/arch/arm64/kernel/pi/kaslr_early.c +++ b/arch/arm64/kernel/pi/kaslr_early.c @@ -18,8 +18,6 @@ #include "pi.h" -extern u16 memstart_offset_seed; - static u64 __init get_kaslr_seed(void *fdt, int node) { static char const seed_str[] __initconst = "kaslr-seed"; @@ -53,8 +51,6 @@ u64 __init kaslr_early_init(void *fdt, int chosen) return 0; } - memstart_offset_seed = seed & U16_MAX; - /* * OK, so we are proceeding with KASLR enabled. Calculate a suitable * kernel image offset from the seed. Let's place the kernel in the diff --git a/arch/arm64/kernel/pi/pi.h b/arch/arm64/kernel/pi/pi.h index c91e5e965cd3..1f4731a4e17e 100644 --- a/arch/arm64/kernel/pi/pi.h +++ b/arch/arm64/kernel/pi/pi.h @@ -22,6 +22,7 @@ static inline void *prel64_to_pointer(const prel64_t *offset) extern bool dynamic_scs_is_enabled; extern pgd_t init_idmap_pg_dir[], init_idmap_pg_end[]; +extern pgd_t init_pg_dir[], init_pg_end[]; void init_feature_override(u64 boot_status, const void *fdt, int chosen); u64 kaslr_early_init(void *fdt, int chosen); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 42faebb7b712..a5ca15daeb8a 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -344,50 +344,34 @@ void arch_release_task_struct(struct task_struct *tsk) int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - if (current->mm) - fpsimd_preserve_current_state(); + /* + * The current/src task's FPSIMD state may or may not be live, and may + * have been altered by ptrace after entry to the kernel. Save the + * effective FPSIMD state so that this will be copied into dst. + */ + fpsimd_save_and_flush_current_state(); + fpsimd_sync_from_effective_state(src); + *dst = *src; /* - * Detach src's sve_state (if any) from dst so that it does not - * get erroneously used or freed prematurely. dst's copies - * will be allocated on demand later on if dst uses SVE. - * For consistency, also clear TIF_SVE here: this could be done - * later in copy_process(), but to avoid tripping up future - * maintainers it is best not to leave TIF flags and buffers in - * an inconsistent state, even temporarily. + * Drop stale reference to src's sve_state and convert dst to + * non-streaming FPSIMD mode. 
*/ + dst->thread.fp_type = FP_STATE_FPSIMD; dst->thread.sve_state = NULL; clear_tsk_thread_flag(dst, TIF_SVE); + task_smstop_sm(dst); /* - * In the unlikely event that we create a new thread with ZA - * enabled we should retain the ZA and ZT state so duplicate - * it here. This may be shortly freed if we exec() or if - * CLONE_SETTLS but it's simpler to do it here. To avoid - * confusing the rest of the code ensure that we have a - * sve_state allocated whenever sme_state is allocated. + * Drop stale reference to src's sme_state and ensure dst has ZA + * disabled. + * + * When necessary, ZA will be inherited later in copy_thread_za(). */ - if (thread_za_enabled(&src->thread)) { - dst->thread.sve_state = kzalloc(sve_state_size(src), - GFP_KERNEL); - if (!dst->thread.sve_state) - return -ENOMEM; - - dst->thread.sme_state = kmemdup(src->thread.sme_state, - sme_state_size(src), - GFP_KERNEL); - if (!dst->thread.sme_state) { - kfree(dst->thread.sve_state); - dst->thread.sve_state = NULL; - return -ENOMEM; - } - } else { - dst->thread.sme_state = NULL; - clear_tsk_thread_flag(dst, TIF_SME); - } - - dst->thread.fp_type = FP_STATE_FPSIMD; + dst->thread.sme_state = NULL; + clear_tsk_thread_flag(dst, TIF_SME); + dst->thread.svcr &= ~SVCR_ZA_MASK; /* clear any pending asynchronous tag fault raised by the parent */ clear_tsk_thread_flag(dst, TIF_MTE_ASYNC_FAULT); @@ -395,6 +379,31 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) return 0; } +static int copy_thread_za(struct task_struct *dst, struct task_struct *src) +{ + if (!thread_za_enabled(&src->thread)) + return 0; + + dst->thread.sve_state = kzalloc(sve_state_size(src), + GFP_KERNEL); + if (!dst->thread.sve_state) + return -ENOMEM; + + dst->thread.sme_state = kmemdup(src->thread.sme_state, + sme_state_size(src), + GFP_KERNEL); + if (!dst->thread.sme_state) { + kfree(dst->thread.sve_state); + dst->thread.sve_state = NULL; + return -ENOMEM; + } + + set_tsk_thread_flag(dst, TIF_SME); + dst->thread.svcr |= SVCR_ZA_MASK; + + return 0; +} + asmlinkage void ret_from_fork(void) asm("ret_from_fork"); int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) @@ -427,8 +436,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) * out-of-sync with the saved value. */ *task_user_tls(p) = read_sysreg(tpidr_el0); - if (system_supports_tpidr2()) - p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); if (system_supports_poe()) p->thread.por_el0 = read_sysreg_s(SYS_POR_EL0); @@ -441,13 +448,39 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) } /* + * Due to the AAPCS64 "ZA lazy saving scheme", PSTATE.ZA and + * TPIDR2 need to be manipulated as a pair, and either both + * need to be inherited or both need to be reset. + * + * Within a process, child threads must not inherit their + * parent's TPIDR2 value or they may clobber their parent's + * stack at some later point. + * + * When a process is fork()'d, the child must inherit ZA and + * TPIDR2 from its parent in case there was dormant ZA state. + * + * Use CLONE_VM to determine when the child will share the + * address space with the parent, and cannot safely inherit the + * state. 
+ */ + if (system_supports_sme()) { + if (!(clone_flags & CLONE_VM)) { + p->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); + ret = copy_thread_za(p, current); + if (ret) + return ret; + } else { + p->thread.tpidr2_el0 = 0; + WARN_ON_ONCE(p->thread.svcr & SVCR_ZA_MASK); + } + } + + /* * If a TLS pointer was passed to clone, use it for the new - * thread. We also reset TPIDR2 if it's in use. + * thread. */ - if (clone_flags & CLONE_SETTLS) { + if (clone_flags & CLONE_SETTLS) p->thread.uw.tp_value = tls; - p->thread.tpidr2_el0 = 0; - } ret = copy_thread_gcs(p, args); if (ret != 0) @@ -680,10 +713,11 @@ struct task_struct *__switch_to(struct task_struct *prev, gcs_thread_switch(next); /* - * Complete any pending TLB or cache maintenance on this CPU in case - * the thread migrates to a different CPU. - * This full barrier is also required by the membarrier system - * call. + * Complete any pending TLB or cache maintenance on this CPU in case the + * thread migrates to a different CPU. This full barrier is also + * required by the membarrier system call. Additionally it makes any + * in-progress pgtable writes visible to the table walker; See + * emit_pte_barriers(). */ dsb(ish); diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index f79b0d5f71ac..a360e52db02f 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -594,7 +594,7 @@ static int __fpr_get(struct task_struct *target, { struct user_fpsimd_state *uregs; - sve_sync_to_fpsimd(target); + fpsimd_sync_from_effective_state(target); uregs = &target->thread.uw.fpsimd_state; @@ -626,7 +626,7 @@ static int __fpr_set(struct task_struct *target, * Ensure target->thread.uw.fpsimd_state is up to date, so that a * short copyin can't resurrect stale data. */ - sve_sync_to_fpsimd(target); + fpsimd_sync_from_effective_state(target); newstate = target->thread.uw.fpsimd_state; @@ -653,7 +653,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - sve_sync_from_fpsimd_zeropad(target); + fpsimd_sync_to_effective_state_zeropad(target); fpsimd_flush_task_state(target); return ret; @@ -775,6 +775,11 @@ static void sve_init_header_from_task(struct user_sve_header *header, task_type = ARM64_VEC_SVE; active = (task_type == type); + if (active && target->thread.fp_type == FP_STATE_SVE) + header->flags = SVE_PT_REGS_SVE; + else + header->flags = SVE_PT_REGS_FPSIMD; + switch (type) { case ARM64_VEC_SVE: if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT)) @@ -789,19 +794,14 @@ static void sve_init_header_from_task(struct user_sve_header *header, return; } - if (active) { - if (target->thread.fp_type == FP_STATE_FPSIMD) { - header->flags |= SVE_PT_REGS_FPSIMD; - } else { - header->flags |= SVE_PT_REGS_SVE; - } - } - header->vl = task_get_vl(target, type); vq = sve_vq_from_vl(header->vl); header->max_vl = vec_max_vl(type); - header->size = SVE_PT_SIZE(vq, header->flags); + if (active) + header->size = SVE_PT_SIZE(vq, header->flags); + else + header->size = sizeof(header); header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl), SVE_PT_REGS_SVE); } @@ -820,18 +820,25 @@ static int sve_get_common(struct task_struct *target, unsigned int vq; unsigned long start, end; + if (target == current) + fpsimd_preserve_current_state(); + /* Header */ sve_init_header_from_task(&header, target, type); vq = sve_vq_from_vl(header.vl); membuf_write(&to, &header, sizeof(header)); - if (target == current) - fpsimd_preserve_current_state(); - BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != 
sizeof(header)); BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); + /* + * When the requested vector type is not active, do not present data + * from the other mode to userspace. + */ + if (header.size == sizeof(header)) + return 0; + switch ((header.flags & SVE_PT_REGS_MASK)) { case SVE_PT_REGS_FPSIMD: return __fpr_get(target, regset, to); @@ -859,7 +866,7 @@ static int sve_get_common(struct task_struct *target, return membuf_zero(&to, end - start); default: - return 0; + BUILD_BUG(); } } @@ -883,6 +890,9 @@ static int sve_set_common(struct task_struct *target, struct user_sve_header header; unsigned int vq; unsigned long start, end; + bool fpsimd; + + fpsimd_flush_task_state(target); /* Header */ if (count < sizeof(header)) @@ -890,7 +900,16 @@ static int sve_set_common(struct task_struct *target, ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header, 0, sizeof(header)); if (ret) - goto out; + return ret; + + /* + * Streaming SVE data is always stored and presented in SVE format. + * Require the user to provide SVE formatted data for consistency, and + * to avoid the risk that we configure the task into an invalid state. + */ + fpsimd = (header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD; + if (fpsimd && type == ARM64_VEC_SME) + return -EINVAL; /* * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are consumed by @@ -899,7 +918,21 @@ static int sve_set_common(struct task_struct *target, ret = vec_set_vector_length(target, type, header.vl, ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16); if (ret) - goto out; + return ret; + + /* Allocate SME storage if necessary, preserving any existing ZA/ZT state */ + if (type == ARM64_VEC_SME) { + sme_alloc(target, false); + if (!target->thread.sme_state) + return -ENOMEM; + } + + /* Allocate SVE storage if necessary, zeroing any existing SVE state */ + if (!fpsimd) { + sve_alloc(target, true); + if (!target->thread.sve_state) + return -ENOMEM; + } /* * Actual VL set may be different from what the user asked @@ -910,81 +943,47 @@ static int sve_set_common(struct task_struct *target, /* Enter/exit streaming mode */ if (system_supports_sme()) { - u64 old_svcr = target->thread.svcr; - switch (type) { case ARM64_VEC_SVE: target->thread.svcr &= ~SVCR_SM_MASK; + set_tsk_thread_flag(target, TIF_SVE); break; case ARM64_VEC_SME: target->thread.svcr |= SVCR_SM_MASK; - - /* - * Disable traps and ensure there is SME storage but - * preserve any currently set values in ZA/ZT. - */ - sme_alloc(target, false); set_tsk_thread_flag(target, TIF_SME); break; default: WARN_ON_ONCE(1); - ret = -EINVAL; - goto out; + return -EINVAL; } - - /* - * If we switched then invalidate any existing SVE - * state and ensure there's storage. - */ - if (target->thread.svcr != old_svcr) - sve_alloc(target, true); } + /* Always zero V regs, FPSR, and FPCR */ + memset(¤t->thread.uw.fpsimd_state, 0, + sizeof(current->thread.uw.fpsimd_state)); + /* Registers: FPSIMD-only case */ BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header)); - if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) { - ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, - SVE_PT_FPSIMD_OFFSET); + if (fpsimd) { clear_tsk_thread_flag(target, TIF_SVE); target->thread.fp_type = FP_STATE_FPSIMD; - goto out; + ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, + SVE_PT_FPSIMD_OFFSET); + return ret; } - /* - * Otherwise: no registers or full SVE case. For backwards - * compatibility reasons we treat empty flags as SVE registers. - */ + /* Otherwise: no registers or full SVE case. 
*/ + + target->thread.fp_type = FP_STATE_SVE; /* * If setting a different VL from the requested VL and there is * register data, the data layout will be wrong: don't even * try to set the registers in this case. */ - if (count && vq != sve_vq_from_vl(header.vl)) { - ret = -EIO; - goto out; - } - - sve_alloc(target, true); - if (!target->thread.sve_state) { - ret = -ENOMEM; - clear_tsk_thread_flag(target, TIF_SVE); - target->thread.fp_type = FP_STATE_FPSIMD; - goto out; - } - - /* - * Ensure target->thread.sve_state is up to date with target's - * FPSIMD regs, so that a short copyin leaves trailing - * registers unmodified. Only enable SVE if we are - * configuring normal SVE, a system with streaming SVE may not - * have normal SVE. - */ - fpsimd_sync_to_sve(target); - if (type == ARM64_VEC_SVE) - set_tsk_thread_flag(target, TIF_SVE); - target->thread.fp_type = FP_STATE_SVE; + if (count && vq != sve_vq_from_vl(header.vl)) + return -EIO; BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header)); start = SVE_PT_SVE_OFFSET; @@ -993,7 +992,7 @@ static int sve_set_common(struct task_struct *target, target->thread.sve_state, start, end); if (ret) - goto out; + return ret; start = end; end = SVE_PT_SVE_FPSR_OFFSET(vq); @@ -1009,8 +1008,6 @@ static int sve_set_common(struct task_struct *target, &target->thread.uw.fpsimd_state.fpsr, start, end); -out: - fpsimd_flush_task_state(target); return ret; } diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 85104587f849..77c7926a4df6 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -169,7 +169,7 @@ static void __init smp_build_mpidr_hash(void) static void __init setup_machine_fdt(phys_addr_t dt_phys) { - int size; + int size = 0; void *dt_virt = fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL); const char *name; @@ -182,10 +182,10 @@ static void __init setup_machine_fdt(phys_addr_t dt_phys) */ if (!early_init_dt_scan(dt_virt, dt_phys)) { pr_crit("\n" - "Error: invalid device tree blob at physical address %pa (virtual address 0x%px)\n" - "The dtb must be 8-byte aligned and must not exceed 2 MB in size\n" - "\nPlease check your bootloader.", - &dt_phys, dt_virt); + "Error: invalid device tree blob: PA=%pa, VA=%px, size=%d bytes\n" + "The dtb must be 8-byte aligned and must not exceed 2 MB in size.\n" + "\nPlease check your bootloader.\n", + &dt_phys, dt_virt, size); /* * Note that in this _really_ early stage we cannot even BUG() diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index a7c37afb4ebe..417140cd399b 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -250,6 +250,8 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) ¤t->thread.uw.fpsimd_state; int err; + fpsimd_sync_from_effective_state(current); + /* copy the FP and status/control registers */ err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); __put_user_error(fpsimd->fpsr, &ctx->fpsr, err); @@ -262,37 +264,46 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) return err ? 
-EFAULT : 0; } -static int restore_fpsimd_context(struct user_ctxs *user) +static int read_fpsimd_context(struct user_fpsimd_state *fpsimd, + struct user_ctxs *user) { - struct user_fpsimd_state fpsimd; - int err = 0; + int err; /* check the size information */ if (user->fpsimd_size != sizeof(struct fpsimd_context)) return -EINVAL; /* copy the FP and status/control registers */ - err = __copy_from_user(fpsimd.vregs, &(user->fpsimd->vregs), - sizeof(fpsimd.vregs)); - __get_user_error(fpsimd.fpsr, &(user->fpsimd->fpsr), err); - __get_user_error(fpsimd.fpcr, &(user->fpsimd->fpcr), err); + err = __copy_from_user(fpsimd->vregs, &(user->fpsimd->vregs), + sizeof(fpsimd->vregs)); + __get_user_error(fpsimd->fpsr, &(user->fpsimd->fpsr), err); + __get_user_error(fpsimd->fpcr, &(user->fpsimd->fpcr), err); + + return err ? -EFAULT : 0; +} + +static int restore_fpsimd_context(struct user_ctxs *user) +{ + struct user_fpsimd_state fpsimd; + int err; + + err = read_fpsimd_context(&fpsimd, user); + if (err) + return err; clear_thread_flag(TIF_SVE); + current->thread.svcr &= ~SVCR_SM_MASK; current->thread.fp_type = FP_STATE_FPSIMD; /* load the hardware registers from the fpsimd_state structure */ - if (!err) - fpsimd_update_current_state(&fpsimd); - - return err ? -EFAULT : 0; + fpsimd_update_current_state(&fpsimd); + return 0; } static int preserve_fpmr_context(struct fpmr_context __user *ctx) { int err = 0; - current->thread.uw.fpmr = read_sysreg_s(SYS_FPMR); - __put_user_error(FPMR_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); __put_user_error(current->thread.uw.fpmr, &ctx->fpmr, err); @@ -310,7 +321,7 @@ static int restore_fpmr_context(struct user_ctxs *user) __get_user_error(fpmr, &user->fpmr->fpmr, err); if (!err) - write_sysreg_s(fpmr, SYS_FPMR); + current->thread.uw.fpmr = fpmr; return err; } @@ -372,11 +383,6 @@ static int preserve_sve_context(struct sve_context __user *ctx) err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); if (vq) { - /* - * This assumes that the SVE state has already been saved to - * the task struct by calling the function - * fpsimd_signal_preserve_current_state(). - */ err |= __copy_to_user((char __user *)ctx + SVE_SIG_REGS_OFFSET, current->thread.sve_state, SVE_SIG_REGS_SIZE(vq)); @@ -391,6 +397,7 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) unsigned int vl, vq; struct user_fpsimd_state fpsimd; u16 user_vl, flags; + bool sm; if (user->sve_size < sizeof(*user->sve)) return -EINVAL; @@ -400,7 +407,8 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) if (err) return err; - if (flags & SVE_SIG_FLAG_SM) { + sm = flags & SVE_SIG_FLAG_SM; + if (sm) { if (!system_supports_sme()) return -EINVAL; @@ -420,28 +428,23 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) if (user_vl != vl) return -EINVAL; - if (user->sve_size == sizeof(*user->sve)) { - clear_thread_flag(TIF_SVE); - current->thread.svcr &= ~SVCR_SM_MASK; - current->thread.fp_type = FP_STATE_FPSIMD; - goto fpsimd_only; - } + /* + * Non-streaming SVE state may be preserved without an SVE payload, in + * which case the SVE context only has a header with VL==0, and all + * state can be restored from the FPSIMD context. + * + * Streaming SVE state is always preserved with an SVE payload. For + * consistency and robustness, reject restoring streaming SVE state + * without an SVE payload. 
+ */ + if (!sm && user->sve_size == sizeof(*user->sve)) + return restore_fpsimd_context(user); vq = sve_vq_from_vl(vl); if (user->sve_size < SVE_SIG_CONTEXT_SIZE(vq)) return -EINVAL; - /* - * Careful: we are about __copy_from_user() directly into - * thread.sve_state with preemption enabled, so protection is - * needed to prevent a racing context switch from writing stale - * registers back over the new data. - */ - - fpsimd_flush_task_state(current); - /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ - sve_alloc(current, true); if (!current->thread.sve_state) { clear_thread_flag(TIF_SVE); @@ -461,19 +464,14 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) set_thread_flag(TIF_SVE); current->thread.fp_type = FP_STATE_SVE; -fpsimd_only: - /* copy the FP and status/control registers */ - /* restore_sigframe() already checked that user->fpsimd != NULL. */ - err = __copy_from_user(fpsimd.vregs, user->fpsimd->vregs, - sizeof(fpsimd.vregs)); - __get_user_error(fpsimd.fpsr, &user->fpsimd->fpsr, err); - __get_user_error(fpsimd.fpcr, &user->fpsimd->fpcr, err); + err = read_fpsimd_context(&fpsimd, user); + if (err) + return err; - /* load the hardware registers from the fpsimd_state structure */ - if (!err) - fpsimd_update_current_state(&fpsimd); + /* Merge the FPSIMD registers into the SVE state */ + fpsimd_update_current_state(&fpsimd); - return err ? -EFAULT : 0; + return 0; } #else /* ! CONFIG_ARM64_SVE */ @@ -493,13 +491,12 @@ extern int preserve_sve_context(void __user *ctx); static int preserve_tpidr2_context(struct tpidr2_context __user *ctx) { + u64 tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); int err = 0; - current->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0); - __put_user_error(TPIDR2_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); - __put_user_error(current->thread.tpidr2_el0, &ctx->tpidr2, err); + __put_user_error(tpidr2_el0, &ctx->tpidr2, err); return err; } @@ -541,11 +538,6 @@ static int preserve_za_context(struct za_context __user *ctx) err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); if (vq) { - /* - * This assumes that the ZA state has already been saved to - * the task struct by calling the function - * fpsimd_signal_preserve_current_state(). - */ err |= __copy_to_user((char __user *)ctx + ZA_SIG_REGS_OFFSET, current->thread.sme_state, ZA_SIG_REGS_SIZE(vq)); @@ -580,16 +572,6 @@ static int restore_za_context(struct user_ctxs *user) if (user->za_size < ZA_SIG_CONTEXT_SIZE(vq)) return -EINVAL; - /* - * Careful: we are about __copy_from_user() directly into - * thread.sme_state with preemption enabled, so protection is - * needed to prevent a racing context switch from writing stale - * registers back over the new data. - */ - - fpsimd_flush_task_state(current); - /* From now, fpsimd_thread_switch() won't touch thread.sve_state */ - sme_alloc(current, true); if (!current->thread.sme_state) { current->thread.svcr &= ~SVCR_ZA_MASK; @@ -627,11 +609,6 @@ static int preserve_zt_context(struct zt_context __user *ctx) BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(reserved)); err |= __copy_to_user(&ctx->__reserved, reserved, sizeof(reserved)); - /* - * This assumes that the ZT state has already been saved to - * the task struct by calling the function - * fpsimd_signal_preserve_current_state(). 
- */ err |= __copy_to_user((char __user *)ctx + ZT_SIG_REGS_OFFSET, thread_zt_state(¤t->thread), ZT_SIG_REGS_SIZE(1)); @@ -657,16 +634,6 @@ static int restore_zt_context(struct user_ctxs *user) if (nregs != 1) return -EINVAL; - /* - * Careful: we are about __copy_from_user() directly into - * thread.zt_state with preemption enabled, so protection is - * needed to prevent a racing context switch from writing stale - * registers back over the new data. - */ - - fpsimd_flush_task_state(current); - /* From now, fpsimd_thread_switch() won't touch ZT in thread state */ - err = __copy_from_user(thread_zt_state(¤t->thread), (char __user const *)user->zt + ZT_SIG_REGS_OFFSET, @@ -1017,6 +984,8 @@ static int restore_sigframe(struct pt_regs *regs, */ forget_syscall(regs); + fpsimd_save_and_flush_current_state(); + err |= !valid_user_regs(®s->user_regs, current); if (err == 0) err = parse_user_sigframe(&user, sf); @@ -1507,21 +1476,9 @@ static int setup_return(struct pt_regs *regs, struct ksignal *ksig, /* Signal handlers are invoked with ZA and streaming mode disabled */ if (system_supports_sme()) { - /* - * If we were in streaming mode the saved register - * state was SVE but we will exit SM and use the - * FPSIMD register state - flush the saved FPSIMD - * register state in case it gets loaded. - */ - if (current->thread.svcr & SVCR_SM_MASK) { - memset(¤t->thread.uw.fpsimd_state, 0, - sizeof(current->thread.uw.fpsimd_state)); - current->thread.fp_type = FP_STATE_FPSIMD; - } - - current->thread.svcr &= ~(SVCR_ZA_MASK | - SVCR_SM_MASK); - sme_smstop(); + task_smstop_sm(current); + current->thread.svcr &= ~SVCR_ZA_MASK; + write_sysreg_s(0, SYS_TPIDR2_EL0); } return 0; @@ -1535,7 +1492,7 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, struct user_access_state ua_state; int err = 0; - fpsimd_signal_preserve_current_state(); + fpsimd_save_and_flush_current_state(); if (get_sigframe(&user, ksig, regs)) return 1; diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 81e798b6dada..bb3b526ff43f 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -103,7 +103,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) * Note that this also saves V16-31, which aren't visible * in AArch32. */ - fpsimd_signal_preserve_current_state(); + fpsimd_save_and_flush_current_state(); /* Place structure header on the stack */ __put_user_error(magic, &frame->magic, err); @@ -169,14 +169,17 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame) fpsimd.fpsr = fpscr & VFP_FPSCR_STAT_MASK; fpsimd.fpcr = fpscr & VFP_FPSCR_CTRL_MASK; + if (err) + return -EFAULT; + /* * We don't need to touch the exception register, so * reload the hardware state. */ - if (!err) - fpsimd_update_current_state(&fpsimd); + fpsimd_save_and_flush_current_state(); + current->thread.uw.fpsimd_state = fpsimd; - return err ? -EFAULT : 0; + return 0; } static int compat_restore_sigframe(struct pt_regs *regs, diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index e73326bd3ff7..e4a525a865c1 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -249,9 +249,9 @@ SECTIONS __inittext_end = .; __initdata_begin = .; - init_idmap_pg_dir = .; + __pi_init_idmap_pg_dir = .; . 
+= INIT_IDMAP_DIR_SIZE; - init_idmap_pg_end = .; + __pi_init_idmap_pg_end = .; .init.data : { INIT_DATA @@ -319,11 +319,12 @@ SECTIONS /* start of zero-init region */ BSS_SECTION(SBSS_ALIGN, 0, 0) + __pi___bss_start = __bss_start; . = ALIGN(PAGE_SIZE); - init_pg_dir = .; + __pi_init_pg_dir = .; . += INIT_DIR_SIZE; - init_pg_end = .; + __pi_init_pg_end = .; /* end of zero-init region */ . += SZ_4K; /* stack for the early C runtime */ @@ -332,6 +333,7 @@ SECTIONS . = ALIGN(SEGMENT_ALIGN); __pecoff_data_size = ABSOLUTE(. - __initdata_begin); _end = .; + __pi__end = .; STABS_DEBUG DWARF_DEBUG diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index cfe8cb8ba1cc..0c8737f4f2ce 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -129,7 +129,7 @@ pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) if (!pte_present(orig_pte) || !pte_cont(orig_pte)) return orig_pte; - ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize); + ncontig = find_num_contig(mm, addr, ptep, &pgsize); for (i = 0; i < ncontig; i++, ptep++) { pte_t pte = __ptep_get(ptep); @@ -159,12 +159,11 @@ static pte_t get_clear_contig(struct mm_struct *mm, pte_t pte, tmp_pte; bool present; - pte = __ptep_get_and_clear(mm, addr, ptep); + pte = __ptep_get_and_clear_anysz(mm, ptep, pgsize); present = pte_present(pte); while (--ncontig) { ptep++; - addr += pgsize; - tmp_pte = __ptep_get_and_clear(mm, addr, ptep); + tmp_pte = __ptep_get_and_clear_anysz(mm, ptep, pgsize); if (present) { if (pte_dirty(tmp_pte)) pte = pte_mkdirty(pte); @@ -183,8 +182,9 @@ static pte_t get_clear_contig_flush(struct mm_struct *mm, { pte_t orig_pte = get_clear_contig(mm, addr, ptep, pgsize, ncontig); struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); + unsigned long end = addr + (pgsize * ncontig); - flush_tlb_range(&vma, addr, addr + (pgsize * ncontig)); + __flush_hugetlb_tlb_range(&vma, addr, end, pgsize, true); return orig_pte; } @@ -207,9 +207,12 @@ static void clear_flush(struct mm_struct *mm, unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - __ptep_get_and_clear(mm, addr, ptep); + __ptep_get_and_clear_anysz(mm, ptep, pgsize); - flush_tlb_range(&vma, saddr, addr); + if (mm == &init_mm) + flush_tlb_kernel_range(saddr, addr); + else + __flush_hugetlb_tlb_range(&vma, saddr, addr, pgsize, true); } void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, @@ -218,30 +221,20 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, size_t pgsize; int i; int ncontig; - unsigned long pfn, dpfn; - pgprot_t hugeprot; ncontig = num_contig_ptes(sz, &pgsize); if (!pte_present(pte)) { for (i = 0; i < ncontig; i++, ptep++, addr += pgsize) - __set_ptes(mm, addr, ptep, pte, 1); - return; - } - - if (!pte_cont(pte)) { - __set_ptes(mm, addr, ptep, pte, 1); + __set_ptes_anysz(mm, ptep, pte, 1, pgsize); return; } - pfn = pte_pfn(pte); - dpfn = pgsize >> PAGE_SHIFT; - hugeprot = pte_pgprot(pte); - - clear_flush(mm, addr, ptep, pgsize, ncontig); + /* Only need to "break" if transitioning valid -> valid. 
*/ + if (pte_cont(pte) && pte_valid(__ptep_get(ptep))) + clear_flush(mm, addr, ptep, pgsize, ncontig); - for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); + __set_ptes_anysz(mm, ptep, pte, ncontig, pgsize); } pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, @@ -431,23 +424,23 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { - int ncontig, i; + int ncontig; size_t pgsize = 0; - unsigned long pfn = pte_pfn(pte), dpfn; struct mm_struct *mm = vma->vm_mm; - pgprot_t hugeprot; pte_t orig_pte; + VM_WARN_ON(!pte_present(pte)); + if (!pte_cont(pte)) return __ptep_set_access_flags(vma, addr, ptep, pte, dirty); - ncontig = find_num_contig(mm, addr, ptep, &pgsize); - dpfn = pgsize >> PAGE_SHIFT; + ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize); if (!__cont_access_flags_changed(ptep, pte, ncontig)) return 0; orig_pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig); + VM_WARN_ON(!pte_present(orig_pte)); /* Make sure we don't lose the dirty or young state */ if (pte_dirty(orig_pte)) @@ -456,38 +449,31 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, if (pte_young(orig_pte)) pte = pte_mkyoung(pte); - hugeprot = pte_pgprot(pte); - for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); - + __set_ptes_anysz(mm, ptep, pte, ncontig, pgsize); return 1; } void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - unsigned long pfn, dpfn; - pgprot_t hugeprot; - int ncontig, i; + int ncontig; size_t pgsize; pte_t pte; - if (!pte_cont(__ptep_get(ptep))) { + pte = __ptep_get(ptep); + VM_WARN_ON(!pte_present(pte)); + + if (!pte_cont(pte)) { __ptep_set_wrprotect(mm, addr, ptep); return; } ncontig = find_num_contig(mm, addr, ptep, &pgsize); - dpfn = pgsize >> PAGE_SHIFT; pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig); pte = pte_wrprotect(pte); - hugeprot = pte_pgprot(pte); - pfn = pte_pfn(pte); - - for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); + __set_ptes_anysz(mm, ptep, pte, ncontig, pgsize); } pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, @@ -497,10 +483,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, size_t pgsize; int ncontig; - if (!pte_cont(__ptep_get(ptep))) - return ptep_clear_flush(vma, addr, ptep); - - ncontig = find_num_contig(mm, addr, ptep, &pgsize); + ncontig = num_contig_ptes(huge_page_size(hstate_vma(vma)), &pgsize); return get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig); } diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index b99bf3980fc6..0c8c35dd645e 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -275,26 +275,6 @@ void __init arm64_memblock_init(void) } } - if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { - extern u16 memstart_offset_seed; - u64 mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); - int parange = cpuid_feature_extract_unsigned_field( - mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); - s64 range = linear_region_size - - BIT(id_aa64mmfr0_parange_to_phys_shift(parange)); - - /* - * If the size of the linear region exceeds, by a sufficient - * margin, the size of the region that the physical memory can - * span, randomize the linear region as well. 
- */ - if (memstart_offset_seed > 0 && range >= (s64)ARM64_MEMSTART_ALIGN) { - range /= ARM64_MEMSTART_ALIGN; - memstart_addr -= ARM64_MEMSTART_ALIGN * - ((range * memstart_offset_seed) >> 16); - } - } - /* * Register the kernel text, kernel data, initrd, and initial * pagetables with memblock. diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 39fd1f7ff02a..04d4a8f676db 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -96,8 +96,8 @@ static int change_memory_common(unsigned long addr, int numpages, * we are operating on does not result in such splitting. * * Let's restrict ourselves to mappings created by vmalloc (or vmap). - * Those are guaranteed to consist entirely of page mappings, and - * splitting is never needed. + * Disallow VM_ALLOW_HUGE_VMAP mappings to guarantee that only page + * mappings are updated and splitting is never needed. * * So check whether the [addr, addr + size) interval is entirely * covered by precisely one VM area that has the VM_ALLOC flag set. @@ -105,7 +105,7 @@ static int change_memory_common(unsigned long addr, int numpages, area = find_vm_area((void *)addr); if (!area || end > (unsigned long)kasan_reset_tag(area->addr) + area->size || - !(area->flags & VM_ALLOC)) + ((area->flags & (VM_ALLOC | VM_ALLOW_HUGE_VMAP)) != VM_ALLOC)) return -EINVAL; if (!numpages) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index fb30c8804f87..80d470aa469d 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -512,26 +512,11 @@ alternative_else_nop_endif ubfx x1, x1, #ID_AA64MMFR3_EL1_S1PIE_SHIFT, #4 cbz x1, .Lskip_indirection - /* - * The PROT_* macros describing the various memory types may resolve to - * C expressions if they include the PTE_MAYBE_* macros, and so they - * can only be used from C code. The PIE_E* constants below are also - * defined in terms of those macros, but will mask out those - * PTE_MAYBE_* constants, whether they are set or not. So #define them - * as 0x0 here so we can evaluate the PIE_E* constants in asm context. 
- */ - -#define PTE_MAYBE_NG 0 -#define PTE_MAYBE_SHARED 0 - - mov_q x0, PIE_E0 + mov_q x0, PIE_E0_ASM msr REG_PIRE0_EL1, x0 - mov_q x0, PIE_E1 + mov_q x0, PIE_E1_ASM msr REG_PIR_EL1, x0 -#undef PTE_MAYBE_NG -#undef PTE_MAYBE_SHARED - orr tcr2, tcr2, TCR2_EL1_PIE msr REG_TCR2_EL1, x0 diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 634d78422adb..da8b89dd2910 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2113,7 +2113,7 @@ bool bpf_jit_supports_subprog_tailcalls(void) } static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l, - int args_off, int retval_off, int run_ctx_off, + int bargs_off, int retval_off, int run_ctx_off, bool save_ret) { __le32 *branch; @@ -2155,7 +2155,7 @@ static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l, branch = ctx->image + ctx->idx; emit(A64_NOP, ctx); - emit(A64_ADD_I(1, A64_R(0), A64_SP, args_off), ctx); + emit(A64_ADD_I(1, A64_R(0), A64_SP, bargs_off), ctx); if (!p->jited) emit_addr_mov_i64(A64_R(1), (const u64)p->insnsi, ctx); @@ -2180,7 +2180,7 @@ static void invoke_bpf_prog(struct jit_ctx *ctx, struct bpf_tramp_link *l, } static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl, - int args_off, int retval_off, int run_ctx_off, + int bargs_off, int retval_off, int run_ctx_off, __le32 **branches) { int i; @@ -2190,7 +2190,7 @@ static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl, */ emit(A64_STR64I(A64_ZR, A64_SP, retval_off), ctx); for (i = 0; i < tl->nr_links; i++) { - invoke_bpf_prog(ctx, tl->links[i], args_off, retval_off, + invoke_bpf_prog(ctx, tl->links[i], bargs_off, retval_off, run_ctx_off, true); /* if (*(u64 *)(sp + retval_off) != 0) * goto do_fexit; @@ -2204,23 +2204,125 @@ static void invoke_bpf_mod_ret(struct jit_ctx *ctx, struct bpf_tramp_links *tl, } } -static void save_args(struct jit_ctx *ctx, int args_off, int nregs) +struct arg_aux { + /* how many args are passed through registers, the rest of the args are + * passed through stack + */ + int args_in_regs; + /* how many registers are used to pass arguments */ + int regs_for_args; + /* how much stack is used for additional args passed to bpf program + * that did not fit in original function registers + */ + int bstack_for_args; + /* home much stack is used for additional args passed to the + * original function when called from trampoline (this one needs + * arguments to be properly aligned) + */ + int ostack_for_args; +}; + +static int calc_arg_aux(const struct btf_func_model *m, + struct arg_aux *a) { - int i; + int stack_slots, nregs, slots, i; + + /* verifier ensures m->nr_args <= MAX_BPF_FUNC_ARGS */ + for (i = 0, nregs = 0; i < m->nr_args; i++) { + slots = (m->arg_size[i] + 7) / 8; + if (nregs + slots <= 8) /* passed through register ? 
*/ + nregs += slots; + else + break; + } + + a->args_in_regs = i; + a->regs_for_args = nregs; + a->ostack_for_args = 0; + a->bstack_for_args = 0; + + /* the rest arguments are passed through stack */ + for (; i < m->nr_args; i++) { + /* We can not know for sure about exact alignment needs for + * struct passed on stack, so deny those + */ + if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) + return -ENOTSUPP; + stack_slots = (m->arg_size[i] + 7) / 8; + a->bstack_for_args += stack_slots * 8; + a->ostack_for_args = a->ostack_for_args + stack_slots * 8; + } + + return 0; +} - for (i = 0; i < nregs; i++) { - emit(A64_STR64I(i, A64_SP, args_off), ctx); - args_off += 8; +static void clear_garbage(struct jit_ctx *ctx, int reg, int effective_bytes) +{ + if (effective_bytes) { + int garbage_bits = 64 - 8 * effective_bytes; +#ifdef CONFIG_CPU_BIG_ENDIAN + /* garbage bits are at the right end */ + emit(A64_LSR(1, reg, reg, garbage_bits), ctx); + emit(A64_LSL(1, reg, reg, garbage_bits), ctx); +#else + /* garbage bits are at the left end */ + emit(A64_LSL(1, reg, reg, garbage_bits), ctx); + emit(A64_LSR(1, reg, reg, garbage_bits), ctx); +#endif } } -static void restore_args(struct jit_ctx *ctx, int args_off, int nregs) +static void save_args(struct jit_ctx *ctx, int bargs_off, int oargs_off, + const struct btf_func_model *m, + const struct arg_aux *a, + bool for_call_origin) { int i; + int reg; + int doff; + int soff; + int slots; + u8 tmp = bpf2a64[TMP_REG_1]; + + /* store arguments to the stack for the bpf program, or restore + * arguments from stack for the original function + */ + for (reg = 0; reg < a->regs_for_args; reg++) { + emit(for_call_origin ? + A64_LDR64I(reg, A64_SP, bargs_off) : + A64_STR64I(reg, A64_SP, bargs_off), + ctx); + bargs_off += 8; + } + + soff = 32; /* on stack arguments start from FP + 32 */ + doff = (for_call_origin ? oargs_off : bargs_off); + + /* save on stack arguments */ + for (i = a->args_in_regs; i < m->nr_args; i++) { + slots = (m->arg_size[i] + 7) / 8; + /* verifier ensures arg_size <= 16, so slots equals 1 or 2 */ + while (slots-- > 0) { + emit(A64_LDR64I(tmp, A64_FP, soff), ctx); + /* if there is unused space in the last slot, clear + * the garbage contained in the space. 
+ */ + if (slots == 0 && !for_call_origin) + clear_garbage(ctx, tmp, m->arg_size[i] % 8); + emit(A64_STR64I(tmp, A64_SP, doff), ctx); + soff += 8; + doff += 8; + } + } +} - for (i = 0; i < nregs; i++) { - emit(A64_LDR64I(i, A64_SP, args_off), ctx); - args_off += 8; +static void restore_args(struct jit_ctx *ctx, int bargs_off, int nregs) +{ + int reg; + + for (reg = 0; reg < nregs; reg++) { + emit(A64_LDR64I(reg, A64_SP, bargs_off), ctx); + bargs_off += 8; } } @@ -2243,17 +2345,21 @@ static bool is_struct_ops_tramp(const struct bpf_tramp_links *fentry_links) */ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, struct bpf_tramp_links *tlinks, void *func_addr, - int nregs, u32 flags) + const struct btf_func_model *m, + const struct arg_aux *a, + u32 flags) { int i; int stack_size; int retaddr_off; int regs_off; int retval_off; - int args_off; - int nregs_off; + int bargs_off; + int nfuncargs_off; int ip_off; int run_ctx_off; + int oargs_off; + int nfuncargs; struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; @@ -2262,31 +2368,38 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, bool is_struct_ops = is_struct_ops_tramp(fentry); /* trampoline stack layout: - * [ parent ip ] - * [ FP ] - * SP + retaddr_off [ self ip ] - * [ FP ] + * [ parent ip ] + * [ FP ] + * SP + retaddr_off [ self ip ] + * [ FP ] * - * [ padding ] align SP to multiples of 16 + * [ padding ] align SP to multiples of 16 * - * [ x20 ] callee saved reg x20 - * SP + regs_off [ x19 ] callee saved reg x19 + * [ x20 ] callee saved reg x20 + * SP + regs_off [ x19 ] callee saved reg x19 * - * SP + retval_off [ return value ] BPF_TRAMP_F_CALL_ORIG or - * BPF_TRAMP_F_RET_FENTRY_RET + * SP + retval_off [ return value ] BPF_TRAMP_F_CALL_ORIG or + * BPF_TRAMP_F_RET_FENTRY_RET + * [ arg reg N ] + * [ ... ] + * SP + bargs_off [ arg reg 1 ] for bpf * - * [ arg reg N ] - * [ ... ] - * SP + args_off [ arg reg 1 ] + * SP + nfuncargs_off [ arg regs count ] * - * SP + nregs_off [ arg regs count ] + * SP + ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag * - * SP + ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag + * SP + run_ctx_off [ bpf_tramp_run_ctx ] * - * SP + run_ctx_off [ bpf_tramp_run_ctx ] + * [ stack arg N ] + * [ ... 
] + * SP + oargs_off [ stack arg 1 ] for original func */ stack_size = 0; + oargs_off = stack_size; + if (flags & BPF_TRAMP_F_CALL_ORIG) + stack_size += a->ostack_for_args; + run_ctx_off = stack_size; /* room for bpf_tramp_run_ctx */ stack_size += round_up(sizeof(struct bpf_tramp_run_ctx), 8); @@ -2296,13 +2409,14 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, if (flags & BPF_TRAMP_F_IP_ARG) stack_size += 8; - nregs_off = stack_size; + nfuncargs_off = stack_size; /* room for args count */ stack_size += 8; - args_off = stack_size; + bargs_off = stack_size; /* room for args */ - stack_size += nregs * 8; + nfuncargs = a->regs_for_args + a->bstack_for_args / 8; + stack_size += 8 * nfuncargs; /* room for return value */ retval_off = stack_size; @@ -2349,11 +2463,11 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, } /* save arg regs count*/ - emit(A64_MOVZ(1, A64_R(10), nregs, 0), ctx); - emit(A64_STR64I(A64_R(10), A64_SP, nregs_off), ctx); + emit(A64_MOVZ(1, A64_R(10), nfuncargs, 0), ctx); + emit(A64_STR64I(A64_R(10), A64_SP, nfuncargs_off), ctx); - /* save arg regs */ - save_args(ctx, args_off, nregs); + /* save args for bpf */ + save_args(ctx, bargs_off, oargs_off, m, a, false); /* save callee saved registers */ emit(A64_STR64I(A64_R(19), A64_SP, regs_off), ctx); @@ -2369,7 +2483,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, } for (i = 0; i < fentry->nr_links; i++) - invoke_bpf_prog(ctx, fentry->links[i], args_off, + invoke_bpf_prog(ctx, fentry->links[i], bargs_off, retval_off, run_ctx_off, flags & BPF_TRAMP_F_RET_FENTRY_RET); @@ -2379,12 +2493,13 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, if (!branches) return -ENOMEM; - invoke_bpf_mod_ret(ctx, fmod_ret, args_off, retval_off, + invoke_bpf_mod_ret(ctx, fmod_ret, bargs_off, retval_off, run_ctx_off, branches); } if (flags & BPF_TRAMP_F_CALL_ORIG) { - restore_args(ctx, args_off, nregs); + /* save args for original func */ + save_args(ctx, bargs_off, oargs_off, m, a, true); /* call original func */ emit(A64_LDR64I(A64_R(10), A64_SP, retaddr_off), ctx); emit(A64_ADR(A64_LR, AARCH64_INSN_SIZE * 2), ctx); @@ -2403,7 +2518,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, } for (i = 0; i < fexit->nr_links; i++) - invoke_bpf_prog(ctx, fexit->links[i], args_off, retval_off, + invoke_bpf_prog(ctx, fexit->links[i], bargs_off, retval_off, run_ctx_off, false); if (flags & BPF_TRAMP_F_CALL_ORIG) { @@ -2417,7 +2532,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, } if (flags & BPF_TRAMP_F_RESTORE_REGS) - restore_args(ctx, args_off, nregs); + restore_args(ctx, bargs_off, a->regs_for_args); /* restore callee saved register x19 and x20 */ emit(A64_LDR64I(A64_R(19), A64_SP, regs_off), ctx); @@ -2454,21 +2569,6 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, return ctx->idx; } -static int btf_func_model_nregs(const struct btf_func_model *m) -{ - int nregs = m->nr_args; - int i; - - /* extra registers needed for struct argument */ - for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) { - /* The arg_size is at most 16 bytes, enforced by the verifier. 
*/ - if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) - nregs += (m->arg_size[i] + 7) / 8 - 1; - } - - return nregs; -} - int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr) { @@ -2477,14 +2577,14 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, .idx = 0, }; struct bpf_tramp_image im; - int nregs, ret; + struct arg_aux aaux; + int ret; - nregs = btf_func_model_nregs(m); - /* the first 8 registers are used for arguments */ - if (nregs > 8) - return -ENOTSUPP; + ret = calc_arg_aux(m, &aaux); + if (ret < 0) + return ret; - ret = prepare_trampoline(&ctx, &im, tlinks, func_addr, nregs, flags); + ret = prepare_trampoline(&ctx, &im, tlinks, func_addr, m, &aaux, flags); if (ret < 0) return ret; @@ -2511,9 +2611,10 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr) { - int ret, nregs; - void *image, *tmp; u32 size = ro_image_end - ro_image; + struct arg_aux aaux; + void *image, *tmp; + int ret; /* image doesn't need to be in module memory range, so we can * use kvmalloc. @@ -2529,13 +2630,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image, .write = true, }; - nregs = btf_func_model_nregs(m); - /* the first 8 registers are used for arguments */ - if (nregs > 8) - return -ENOTSUPP; jit_fill_hole(image, (unsigned int)(ro_image_end - ro_image)); - ret = prepare_trampoline(&ctx, im, tlinks, func_addr, nregs, flags); + ret = calc_arg_aux(m, &aaux); + if (ret) + goto out; + ret = prepare_trampoline(&ctx, im, tlinks, func_addr, m, &aaux, flags); if (ret > 0 && validate_code(&ctx) < 0) { ret = -EINVAL; diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index 9d01361696a1..ae551b857137 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -83,7 +83,26 @@ HYPERCALL3(vcpu_op); HYPERCALL1(platform_op_raw); HYPERCALL2(multicall); HYPERCALL2(vm_assist); -HYPERCALL3(dm_op); + +SYM_FUNC_START(HYPERVISOR_dm_op) + mov x16, #__HYPERVISOR_dm_op; \ + /* + * dm_op hypercalls are issued by the userspace. The kernel needs to + * enable access to TTBR0_EL1 as the hypervisor would issue stage 1 + * translations to user memory via AT instructions. Since AT + * instructions are not affected by the PAN bit (ARMv8.1), we only + * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation + * is enabled (it implies that hardware UAO and PAN disabled). + */ + uaccess_ttbr0_enable x6, x7, x8 + hvc XEN_IMM + + /* + * Disable userspace access from kernel once the hyp call completed. 
+ */ + uaccess_ttbr0_disable x6, x7 + ret +SYM_FUNC_END(HYPERVISOR_dm_op); SYM_FUNC_START(privcmd_call) mov x16, x0 diff --git a/arch/m68k/coldfire/m5272.c b/arch/m68k/coldfire/m5272.c index 734dab657fe3..5b70dfdab368 100644 --- a/arch/m68k/coldfire/m5272.c +++ b/arch/m68k/coldfire/m5272.c @@ -119,7 +119,7 @@ static struct fixed_phy_status nettel_fixed_phy_status __initdata = { static int __init init_BSP(void) { m5272_uarts_init(); - fixed_phy_add(PHY_POLL, 0, &nettel_fixed_phy_status); + fixed_phy_add(0, &nettel_fixed_phy_status); clkdev_add_table(m5272_clk_lookup, ARRAY_SIZE(m5272_clk_lookup)); return 0; } diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 77f78d326a32..d05690289e33 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -267,8 +267,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m @@ -602,7 +600,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set -CONFIG_PRIME_NUMBERS=m CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -621,8 +618,6 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m -CONFIG_TEST_PRINTF=m -CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m @@ -632,7 +627,6 @@ CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_BPF=m -CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index f4031aa5d37f..a1747fbe23fb 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -263,8 +263,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m @@ -559,7 +557,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set -CONFIG_PRIME_NUMBERS=m CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -578,8 +575,6 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m -CONFIG_TEST_PRINTF=m -CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m @@ -589,7 +584,6 @@ CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_BPF=m -CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index fa92131cf4b3..74293551f66b 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -270,8 +270,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m @@ -579,7 +577,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set -CONFIG_PRIME_NUMBERS=m CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not 
set @@ -598,8 +595,6 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m -CONFIG_TEST_PRINTF=m -CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m @@ -609,7 +604,6 @@ CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_BPF=m -CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 9c2afc477061..419b13ae950a 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -260,8 +260,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m @@ -551,7 +549,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set -CONFIG_PRIME_NUMBERS=m CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -570,8 +567,6 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m -CONFIG_TEST_PRINTF=m -CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m @@ -581,7 +576,6 @@ CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_BPF=m -CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index e7cdab059d96..4c81d756587c 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -262,8 +262,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m @@ -561,7 +559,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set -CONFIG_PRIME_NUMBERS=m CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -580,8 +577,6 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m -CONFIG_TEST_PRINTF=m -CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m @@ -591,7 +586,6 @@ CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_BPF=m -CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 0a79751c20a5..daa01d7fb462 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -261,8 +261,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m @@ -578,7 +576,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set -CONFIG_PRIME_NUMBERS=m CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -597,8 +594,6 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m -CONFIG_TEST_PRINTF=m -CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m 
CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m @@ -608,7 +603,6 @@ CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_BPF=m -CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index f8ca490ee65a..641ca22eb3b2 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -281,8 +281,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m @@ -665,7 +663,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set -CONFIG_PRIME_NUMBERS=m CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -684,8 +681,6 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m -CONFIG_TEST_PRINTF=m -CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m @@ -695,7 +690,6 @@ CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_BPF=m -CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 88fdcea906f3..f98ffa7a1640 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -259,8 +259,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m @@ -551,7 +549,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set -CONFIG_PRIME_NUMBERS=m CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -570,8 +567,6 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m -CONFIG_TEST_PRINTF=m -CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m @@ -581,7 +576,6 @@ CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_BPF=m -CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 8acbe83dac72..2bfc3f4b48f9 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -260,8 +260,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m @@ -552,7 +550,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set -CONFIG_PRIME_NUMBERS=m CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -571,8 +568,6 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m -CONFIG_TEST_PRINTF=m -CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m @@ -582,7 +577,6 @@ CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_BPF=m -CONFIG_TEST_BLACKHOLE_DEV=m 
CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index e3095301f3c5..2bd46cbcca2a 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -261,8 +261,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m @@ -568,7 +566,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set -CONFIG_PRIME_NUMBERS=m CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -587,8 +584,6 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m -CONFIG_TEST_PRINTF=m -CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m @@ -598,7 +593,6 @@ CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_BPF=m -CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 948e48ddd128..dc7fc94fc669 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -256,8 +256,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m @@ -549,7 +547,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set -CONFIG_PRIME_NUMBERS=m CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -567,8 +564,6 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m -CONFIG_TEST_PRINTF=m -CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m @@ -578,7 +573,6 @@ CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_BPF=m -CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index 5bcf9181c37c..b026a54867f5 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -257,8 +257,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m -# CONFIG_IP_DCCP_CCID3 is not set CONFIG_SCTP_COOKIE_HMAC_SHA1=y CONFIG_RDS=m CONFIG_RDS_TCP=m @@ -549,7 +547,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_HW is not set -CONFIG_PRIME_NUMBERS=m CONFIG_XZ_DEC_TEST=m CONFIG_GLOB_SELFTEST=m # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set @@ -568,8 +565,6 @@ CONFIG_ATOMIC64_SELFTEST=m CONFIG_ASYNC_RAID6_TEST=m CONFIG_TEST_HEXDUMP=m CONFIG_TEST_KSTRTOX=m -CONFIG_TEST_PRINTF=m -CONFIG_TEST_SCANF=m CONFIG_TEST_BITMAP=m CONFIG_TEST_UUID=m CONFIG_TEST_XARRAY=m @@ -579,7 +574,6 @@ CONFIG_TEST_IDA=m CONFIG_TEST_BITOPS=m CONFIG_TEST_VMALLOC=m CONFIG_TEST_BPF=m -CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_FIND_BIT_BENCHMARK=m CONFIG_TEST_FIRMWARE=m CONFIG_TEST_SYSCTL=m diff --git a/arch/m68k/kernel/setup_mm.c b/arch/m68k/kernel/setup_mm.c index 0fba32552836..c7e8de0d34bb 100644 --- 
a/arch/m68k/kernel/setup_mm.c +++ b/arch/m68k/kernel/setup_mm.c @@ -484,7 +484,7 @@ static int hardware_proc_show(struct seq_file *m, void *v) if (mach_get_model) mach_get_model(model); else - strcpy(model, "Unknown m68k"); + strscpy(model, "Unknown m68k"); seq_printf(m, "Model:\t\t%s\n", model); for (mem = 0, i = 0; i < m68k_num_memory; i++) diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c index e324410ef239..d26c7f4f8c36 100644 --- a/arch/m68k/mac/config.c +++ b/arch/m68k/mac/config.c @@ -793,7 +793,7 @@ static void __init mac_identify(void) } macintosh_config = mac_data_table; - for (m = macintosh_config; m->ident != -1; m++) { + for (m = &mac_data_table[1]; m->ident != -1; m++) { if (m->ident == model) { macintosh_config = m; break; diff --git a/arch/mips/bcm47xx/setup.c b/arch/mips/bcm47xx/setup.c index 247be207f293..de426a474b5b 100644 --- a/arch/mips/bcm47xx/setup.c +++ b/arch/mips/bcm47xx/setup.c @@ -282,7 +282,7 @@ static int __init bcm47xx_register_bus_complete(void) bcm47xx_leds_register(); bcm47xx_workarounds(); - fixed_phy_add(PHY_POLL, 0, &bcm47xx_fixed_phy_status); + fixed_phy_add(0, &bcm47xx_fixed_phy_status); return 0; } device_initcall(bcm47xx_register_bus_complete); diff --git a/arch/mips/configs/bigsur_defconfig b/arch/mips/configs/bigsur_defconfig index 8f7c36868204..97d2cd997285 100644 --- a/arch/mips/configs/bigsur_defconfig +++ b/arch/mips/configs/bigsur_defconfig @@ -81,7 +81,6 @@ CONFIG_IP_VS_SH=m CONFIG_IP_VS_SED=m CONFIG_IP_VS_NQ=m CONFIG_IP_VS_FTP=m -CONFIG_IP_DCCP=m CONFIG_BRIDGE=m CONFIG_VLAN_8021Q=m CONFIG_VLAN_8021Q_GVRP=y diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig index 48c8feec958f..437ef6dc0b4c 100644 --- a/arch/mips/configs/gpr_defconfig +++ b/arch/mips/configs/gpr_defconfig @@ -84,7 +84,6 @@ CONFIG_BRIDGE_EBT_MARK_T=m CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m -CONFIG_IP_DCCP=m CONFIG_IP_SCTP=m CONFIG_TIPC=m CONFIG_ATM=y diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig index cbf9c35a6177..e4bcdb64df6c 100644 --- a/arch/mips/configs/mtx1_defconfig +++ b/arch/mips/configs/mtx1_defconfig @@ -130,7 +130,6 @@ CONFIG_BRIDGE_EBT_MARK_T=m CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m -CONFIG_IP_DCCP=m CONFIG_IP_SCTP=m CONFIG_TIPC=m CONFIG_ATM=y diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 22fa8f19924a..31ac655b7837 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -161,6 +161,8 @@ #define SO_RCVPRIORITY 82 +#define SO_PASSRIGHTS 83 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index eab87c6beacb..e5d64c84aadf 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -291,4 +291,20 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, #define update_mmu_cache(vma, addr, ptep) \ update_mmu_cache_range(NULL, vma, addr, ptep, 1) +static inline int pte_same(pte_t pte_a, pte_t pte_b); + +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +static inline int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + if (!pte_same(*ptep, entry)) + set_ptes(vma->vm_mm, address, ptep, entry, 1); + /* + * update_mmu_cache will unconditionally execute, handling both + * the case that the PTE changed and the spurious fault case. 
+ */ + return true; +} + #endif /* _ASM_NIOS2_PGTABLE_H */ diff --git a/arch/nios2/kernel/cpuinfo.c b/arch/nios2/kernel/cpuinfo.c index 7b1e8f9128e9..55882feb6249 100644 --- a/arch/nios2/kernel/cpuinfo.c +++ b/arch/nios2/kernel/cpuinfo.c @@ -46,10 +46,7 @@ void __init setup_cpuinfo(void) cpuinfo.cpu_clock_freq = fcpu(cpu, "clock-frequency"); str = of_get_property(cpu, "altr,implementation", &len); - if (str) - strscpy(cpuinfo.cpu_impl, str, sizeof(cpuinfo.cpu_impl)); - else - strcpy(cpuinfo.cpu_impl, "<unknown>"); + strscpy(cpuinfo.cpu_impl, str ?: "<unknown>"); cpuinfo.has_div = of_property_read_bool(cpu, "altr,has-div"); cpuinfo.has_mul = of_property_read_bool(cpu, "altr,has-mul"); diff --git a/arch/nios2/mm/tlb.c b/arch/nios2/mm/tlb.c index f90ac35f05f3..a9cbe20f9e79 100644 --- a/arch/nios2/mm/tlb.c +++ b/arch/nios2/mm/tlb.c @@ -144,10 +144,11 @@ static void flush_tlb_one(unsigned long addr) if (((pteaddr >> 2) & 0xfffff) != (addr >> PAGE_SHIFT)) continue; + tlbmisc = RDCTL(CTL_TLBMISC); pr_debug("Flush entry by writing way=%dl pid=%ld\n", - way, (pid_misc >> TLBMISC_PID_SHIFT)); + way, ((tlbmisc >> TLBMISC_PID_SHIFT) & TLBMISC_PID_MASK)); - tlbmisc = TLBMISC_WE | (way << TLBMISC_WAY_SHIFT); + tlbmisc = TLBMISC_WE | (way << TLBMISC_WAY_SHIFT) | (tlbmisc & TLBMISC_PID); WRCTL(CTL_TLBMISC, tlbmisc); WRCTL(CTL_PTEADDR, pteaddr_invalid(addr)); WRCTL(CTL_TLBACC, 0); @@ -237,7 +238,8 @@ void flush_tlb_pid(unsigned long mmu_pid) if (pid != mmu_pid) continue; - tlbmisc = TLBMISC_WE | (way << TLBMISC_WAY_SHIFT); + tlbmisc = TLBMISC_WE | (way << TLBMISC_WAY_SHIFT) | + (pid << TLBMISC_PID_SHIFT); WRCTL(CTL_TLBMISC, tlbmisc); WRCTL(CTL_TLBACC, 0); } @@ -272,15 +274,17 @@ void flush_tlb_all(void) /* remember pid/way until we return */ get_misc_and_pid(&org_misc, &pid_misc); - /* Start at way 0, way is auto-incremented after each TLBACC write */ - WRCTL(CTL_TLBMISC, TLBMISC_WE); - /* Map each TLB entry to physcal address 0 with no-access and a bad ptbase */ for (line = 0; line < cpuinfo.tlb_num_lines; line++) { WRCTL(CTL_PTEADDR, pteaddr_invalid(addr)); - for (way = 0; way < cpuinfo.tlb_num_ways; way++) + for (way = 0; way < cpuinfo.tlb_num_ways; way++) { + // Code such as replace_tlb_one_pid assumes that no duplicate entries exist + // for a single address across ways, so also use way as a dummy PID + WRCTL(CTL_TLBMISC, TLBMISC_WE | (way << TLBMISC_WAY_SHIFT) | + (way << TLBMISC_PID_SHIFT)); WRCTL(CTL_TLBACC, 0); + } addr += PAGE_SIZE; } diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 96831c988606..1f2d5b7a7f5d 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -142,6 +142,8 @@ #define SCM_DEVMEM_DMABUF SO_DEVMEM_DMABUF #define SO_DEVMEM_DONTNEED 0x4050 +#define SO_PASSRIGHTS 0x4051 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/powerpc/configs/pmac32_defconfig b/arch/powerpc/configs/pmac32_defconfig index 1bc3466bc909..ae45f70b29f0 100644 --- a/arch/powerpc/configs/pmac32_defconfig +++ b/arch/powerpc/configs/pmac32_defconfig @@ -87,7 +87,6 @@ CONFIG_IP_NF_RAW=m CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m -CONFIG_IP_DCCP=m CONFIG_BT=m CONFIG_BT_RFCOMM=m CONFIG_BT_RFCOMM_TTY=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index 242c1fab9d46..f96f8ed9856c 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -225,7 +225,6 @@ CONFIG_BRIDGE_EBT_REDIRECT=m 
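The socket.h hunks above (alpha, mips, parisc, and later sparc) only add the SO_PASSRIGHTS constant, so the sketch below is an assumption by analogy with SO_PASSCRED: a boolean SOL_SOCKET option that lets a process refuse SCM_RIGHTS file descriptors on an AF_UNIX socket. The numeric fallback is the alpha/mips value 83 from the hunks above; parisc (0x4051) and sparc (0x005c) use their own values, so prefer the libc header when it already has the definition.

#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_PASSRIGHTS
#define SO_PASSRIGHTS 83	/* alpha/mips value above; parisc and sparc differ */
#endif

int main(void)
{
	int sk = socket(AF_UNIX, SOCK_STREAM, 0);
	int off = 0;

	/* assumed semantics: 0 = do not accept SCM_RIGHTS descriptors on this socket */
	if (sk < 0 || setsockopt(sk, SOL_SOCKET, SO_PASSRIGHTS, &off, sizeof(off)))
		perror("SO_PASSRIGHTS");
	return 0;
}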
CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m CONFIG_BRIDGE_EBT_NFLOG=m -CONFIG_IP_DCCP=m CONFIG_TIPC=m CONFIG_ATM=m CONFIG_ATM_CLIP=m diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 6a4805968966..791d1942a058 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -572,7 +572,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, /* * Return the number of jiffies until the next timeout. If the timeout is - * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA + * longer than the TIMER_NEXT_MAX_DELTA, then return TIMER_NEXT_MAX_DELTA * because the larger value can break the timer APIs. */ static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) @@ -598,7 +598,7 @@ static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) if (do_div(nr_jiffies, tb_ticks_per_jiffy)) nr_jiffies++; - return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA); + return min_t(unsigned long long, nr_jiffies, TIMER_NEXT_MAX_DELTA); } static void arm_next_watchdog(struct kvm_vcpu *vcpu) @@ -616,10 +616,10 @@ static void arm_next_watchdog(struct kvm_vcpu *vcpu) spin_lock_irqsave(&vcpu->arch.wdt_lock, flags); nr_jiffies = watchdog_next_timeout(vcpu); /* - * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA + * If the number of jiffies of watchdog timer >= TIMER_NEXT_MAX_DELTA * then do not run the watchdog timer as this can break timer APIs. */ - if (nr_jiffies < NEXT_TIMER_MAX_DELTA) + if (nr_jiffies < TIMER_NEXT_MAX_DELTA) mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies); else timer_delete(&vcpu->arch.wdt_timer); diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h index 1d1c78d4cff1..e7b032dfd17f 100644 --- a/arch/riscv/net/bpf_jit.h +++ b/arch/riscv/net/bpf_jit.h @@ -608,6 +608,21 @@ static inline u32 rv_fence(u8 pred, u8 succ) return rv_i_insn(imm11_0, 0, 0, 0, 0xf); } +static inline void emit_fence_r_rw(struct rv_jit_context *ctx) +{ + emit(rv_fence(0x2, 0x3), ctx); +} + +static inline void emit_fence_rw_w(struct rv_jit_context *ctx) +{ + emit(rv_fence(0x3, 0x1), ctx); +} + +static inline void emit_fence_rw_rw(struct rv_jit_context *ctx) +{ + emit(rv_fence(0x3, 0x3), ctx); +} + static inline u32 rv_nop(void) { return rv_i_insn(0, 0, 0, 0, 0x13); diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index ca60db75199d..10e01ff06312 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -473,11 +473,212 @@ static inline void emit_kcfi(u32 hash, struct rv_jit_context *ctx) emit(hash, ctx); } -static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64, - struct rv_jit_context *ctx) +static int emit_load_8(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +{ + int insns_start; + + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + if (sign_ext) + emit(rv_lb(rd, off, rs), ctx); + else + emit(rv_lbu(rd, off, rs), ctx); + return ctx->ninsns - insns_start; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); + insns_start = ctx->ninsns; + if (sign_ext) + emit(rv_lb(rd, 0, RV_REG_T1), ctx); + else + emit(rv_lbu(rd, 0, RV_REG_T1), ctx); + return ctx->ninsns - insns_start; +} + +static int emit_load_16(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +{ + int insns_start; + + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + if (sign_ext) + emit(rv_lh(rd, off, rs), ctx); + else + emit(rv_lhu(rd, off, rs), ctx); + return ctx->ninsns - insns_start; + } + + 
emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); + insns_start = ctx->ninsns; + if (sign_ext) + emit(rv_lh(rd, 0, RV_REG_T1), ctx); + else + emit(rv_lhu(rd, 0, RV_REG_T1), ctx); + return ctx->ninsns - insns_start; +} + +static int emit_load_32(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +{ + int insns_start; + + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + if (sign_ext) + emit(rv_lw(rd, off, rs), ctx); + else + emit(rv_lwu(rd, off, rs), ctx); + return ctx->ninsns - insns_start; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); + insns_start = ctx->ninsns; + if (sign_ext) + emit(rv_lw(rd, 0, RV_REG_T1), ctx); + else + emit(rv_lwu(rd, 0, RV_REG_T1), ctx); + return ctx->ninsns - insns_start; +} + +static int emit_load_64(bool sign_ext, u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +{ + int insns_start; + + if (is_12b_int(off)) { + insns_start = ctx->ninsns; + emit_ld(rd, off, rs, ctx); + return ctx->ninsns - insns_start; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); + insns_start = ctx->ninsns; + emit_ld(rd, 0, RV_REG_T1, ctx); + return ctx->ninsns - insns_start; +} + +static void emit_store_8(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +{ + if (is_12b_int(off)) { + emit(rv_sb(rd, off, rs), ctx); + return; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + emit(rv_sb(RV_REG_T1, 0, rs), ctx); +} + +static void emit_store_16(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +{ + if (is_12b_int(off)) { + emit(rv_sh(rd, off, rs), ctx); + return; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + emit(rv_sh(RV_REG_T1, 0, rs), ctx); +} + +static void emit_store_32(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +{ + if (is_12b_int(off)) { + emit_sw(rd, off, rs, ctx); + return; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + emit_sw(RV_REG_T1, 0, rs, ctx); +} + +static void emit_store_64(u8 rd, s32 off, u8 rs, struct rv_jit_context *ctx) +{ + if (is_12b_int(off)) { + emit_sd(rd, off, rs, ctx); + return; + } + + emit_imm(RV_REG_T1, off, ctx); + emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); + emit_sd(RV_REG_T1, 0, rs, ctx); +} + +static int emit_atomic_ld_st(u8 rd, u8 rs, const struct bpf_insn *insn, + struct rv_jit_context *ctx) +{ + u8 code = insn->code; + s32 imm = insn->imm; + s16 off = insn->off; + + switch (imm) { + /* dst_reg = load_acquire(src_reg + off16) */ + case BPF_LOAD_ACQ: + switch (BPF_SIZE(code)) { + case BPF_B: + emit_load_8(false, rd, off, rs, ctx); + break; + case BPF_H: + emit_load_16(false, rd, off, rs, ctx); + break; + case BPF_W: + emit_load_32(false, rd, off, rs, ctx); + break; + case BPF_DW: + emit_load_64(false, rd, off, rs, ctx); + break; + } + emit_fence_r_rw(ctx); + + /* If our next insn is a redundant zext, return 1 to tell + * build_body() to skip it. 
+ */ + if (BPF_SIZE(code) != BPF_DW && insn_is_zext(&insn[1])) + return 1; + break; + /* store_release(dst_reg + off16, src_reg) */ + case BPF_STORE_REL: + emit_fence_rw_w(ctx); + switch (BPF_SIZE(code)) { + case BPF_B: + emit_store_8(rd, off, rs, ctx); + break; + case BPF_H: + emit_store_16(rd, off, rs, ctx); + break; + case BPF_W: + emit_store_32(rd, off, rs, ctx); + break; + case BPF_DW: + emit_store_64(rd, off, rs, ctx); + break; + } + break; + default: + pr_err_once("bpf-jit: invalid atomic load/store opcode %02x\n", imm); + return -EINVAL; + } + + return 0; +} + +static int emit_atomic_rmw(u8 rd, u8 rs, const struct bpf_insn *insn, + struct rv_jit_context *ctx) { - u8 r0; + u8 r0, code = insn->code; + s16 off = insn->off; + s32 imm = insn->imm; int jmp_offset; + bool is64; + + if (BPF_SIZE(code) != BPF_W && BPF_SIZE(code) != BPF_DW) { + pr_err_once("bpf-jit: 1- and 2-byte RMW atomics are not supported\n"); + return -EINVAL; + } + is64 = BPF_SIZE(code) == BPF_DW; if (off) { if (is_12b_int(off)) { @@ -554,9 +755,14 @@ static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64, rv_sc_w(RV_REG_T3, rs, rd, 0, 1), ctx); jmp_offset = ninsns_rvoff(-6); emit(rv_bne(RV_REG_T3, 0, jmp_offset >> 1), ctx); - emit(rv_fence(0x3, 0x3), ctx); + emit_fence_rw_rw(ctx); break; + default: + pr_err_once("bpf-jit: invalid atomic RMW opcode %02x\n", imm); + return -EINVAL; } + + return 0; } #define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0) @@ -1650,8 +1856,8 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_LDX | BPF_PROBE_MEM32 | BPF_W: case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW: { - int insn_len, insns_start; bool sign_ext; + int insn_len; sign_ext = BPF_MODE(insn->code) == BPF_MEMSX || BPF_MODE(insn->code) == BPF_PROBE_MEMSX; @@ -1663,78 +1869,16 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, switch (BPF_SIZE(code)) { case BPF_B: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lb(rd, off, rs), ctx); - else - emit(rv_lbu(rd, off, rs), ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lb(rd, 0, RV_REG_T1), ctx); - else - emit(rv_lbu(rd, 0, RV_REG_T1), ctx); - insn_len = ctx->ninsns - insns_start; + insn_len = emit_load_8(sign_ext, rd, off, rs, ctx); break; case BPF_H: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lh(rd, off, rs), ctx); - else - emit(rv_lhu(rd, off, rs), ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lh(rd, 0, RV_REG_T1), ctx); - else - emit(rv_lhu(rd, 0, RV_REG_T1), ctx); - insn_len = ctx->ninsns - insns_start; + insn_len = emit_load_16(sign_ext, rd, off, rs, ctx); break; case BPF_W: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lw(rd, off, rs), ctx); - else - emit(rv_lwu(rd, off, rs), ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - if (sign_ext) - emit(rv_lw(rd, 0, RV_REG_T1), ctx); - else - emit(rv_lwu(rd, 0, RV_REG_T1), ctx); - insn_len = ctx->ninsns - insns_start; + insn_len = emit_load_32(sign_ext, rd, off, rs, ctx); break; case BPF_DW: - if (is_12b_int(off)) { - insns_start = ctx->ninsns; - emit_ld(rd, off, rs, 
ctx); - insn_len = ctx->ninsns - insns_start; - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rs, ctx); - insns_start = ctx->ninsns; - emit_ld(rd, 0, RV_REG_T1, ctx); - insn_len = ctx->ninsns - insns_start; + insn_len = emit_load_64(sign_ext, rd, off, rs, ctx); break; } @@ -1879,49 +2023,27 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, /* STX: *(size *)(dst + off) = src */ case BPF_STX | BPF_MEM | BPF_B: - if (is_12b_int(off)) { - emit(rv_sb(rd, off, rs), ctx); - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit(rv_sb(RV_REG_T1, 0, rs), ctx); + emit_store_8(rd, off, rs, ctx); break; case BPF_STX | BPF_MEM | BPF_H: - if (is_12b_int(off)) { - emit(rv_sh(rd, off, rs), ctx); - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit(rv_sh(RV_REG_T1, 0, rs), ctx); + emit_store_16(rd, off, rs, ctx); break; case BPF_STX | BPF_MEM | BPF_W: - if (is_12b_int(off)) { - emit_sw(rd, off, rs, ctx); - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit_sw(RV_REG_T1, 0, rs, ctx); + emit_store_32(rd, off, rs, ctx); break; case BPF_STX | BPF_MEM | BPF_DW: - if (is_12b_int(off)) { - emit_sd(rd, off, rs, ctx); - break; - } - - emit_imm(RV_REG_T1, off, ctx); - emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); - emit_sd(RV_REG_T1, 0, rs, ctx); + emit_store_64(rd, off, rs, ctx); break; + case BPF_STX | BPF_ATOMIC | BPF_B: + case BPF_STX | BPF_ATOMIC | BPF_H: case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: - emit_atomic(rd, rs, off, imm, - BPF_SIZE(code) == BPF_DW, ctx); + if (bpf_atomic_is_load_store(insn)) + ret = emit_atomic_ld_st(rd, rs, insn, ctx); + else + ret = emit_atomic_rmw(rd, rs, insn, ctx); + if (ret) + return ret; break; case BPF_STX | BPF_PROBE_MEM32 | BPF_B: diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index f8cd2f70a7fb..f6ca5cfa6b2f 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -26,9 +26,8 @@ static int build_body(struct rv_jit_context *ctx, bool extra_pass, int *offset) int ret; ret = bpf_jit_emit_insn(insn, ctx, extra_pass); - /* BPF_LD | BPF_IMM | BPF_DW: skip the next instruction. 
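The emit_atomic_ld_st() helper added in the RISC-V JIT above lowers BPF_LOAD_ACQ to a plain load followed by "fence r,rw" and BPF_STORE_REL to "fence rw,w" followed by a plain store, via the new emit_fence_* helpers. As a reference point only, the C11 sketch below has the same acquire/release semantics; the fence placement mirrors the JIT, and compilers targeting RV64 conventionally use the same mapping for plain atomic loads and stores.

#include <stdatomic.h>

long load_acquire(const _Atomic long *p)
{
	/* corresponds to: ld rd, off(rs); fence r,rw */
	return atomic_load_explicit(p, memory_order_acquire);
}

void store_release(_Atomic long *p, long v)
{
	/* corresponds to: fence rw,w; sd rs, off(rd) */
	atomic_store_explicit(p, v, memory_order_release);
}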
*/ if (ret > 0) - i++; + i++; /* skip the next instruction */ if (offset) offset[i] = ctx->ninsns; if (ret < 0) diff --git a/arch/s390/include/asm/nospec-branch.h b/arch/s390/include/asm/nospec-branch.h index 192835a3e24d..c7c96282f011 100644 --- a/arch/s390/include/asm/nospec-branch.h +++ b/arch/s390/include/asm/nospec-branch.h @@ -26,8 +26,6 @@ static inline bool nospec_uses_trampoline(void) return __is_defined(CC_USING_EXPOLINE) && !nospec_disable; } -#ifdef CONFIG_EXPOLINE_EXTERN - void __s390_indirect_jump_r1(void); void __s390_indirect_jump_r2(void); void __s390_indirect_jump_r3(void); @@ -44,8 +42,6 @@ void __s390_indirect_jump_r13(void); void __s390_indirect_jump_r14(void); void __s390_indirect_jump_r15(void); -#endif - #endif /* __ASSEMBLY__ */ #endif /* _ASM_S390_EXPOLINE_H */ diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 0776dfde2dba..c7f8313ba449 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -48,8 +48,6 @@ struct bpf_jit { int lit64; /* Current position in 64-bit literal pool */ int base_ip; /* Base address for literal pool */ int exit_ip; /* Address of exit */ - int r1_thunk_ip; /* Address of expoline thunk for 'br %r1' */ - int r14_thunk_ip; /* Address of expoline thunk for 'br %r14' */ int tail_call_start; /* Tail call start offset */ int excnt; /* Number of exception table entries */ int prologue_plt_ret; /* Return address for prologue hotpatch PLT */ @@ -127,6 +125,18 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) jit->seen_regs |= (1 << r1); } +static s32 off_to_pcrel(struct bpf_jit *jit, u32 off) +{ + return off - jit->prg; +} + +static s64 ptr_to_pcrel(struct bpf_jit *jit, const void *ptr) +{ + if (jit->prg_buf) + return (const u8 *)ptr - ((const u8 *)jit->prg_buf + jit->prg); + return 0; +} + #define REG_SET_SEEN(b1) \ ({ \ reg_set_seen(jit, b1); \ @@ -201,7 +211,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT4_PCREL_RIC(op, mask, target) \ ({ \ - int __rel = ((target) - jit->prg) / 2; \ + int __rel = off_to_pcrel(jit, target) / 2; \ _EMIT4((op) | (mask) << 20 | (__rel & 0xffff)); \ }) @@ -239,7 +249,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT6_PCREL_RIEB(op1, op2, b1, b2, mask, target) \ ({ \ - unsigned int rel = (int)((target) - jit->prg) / 2; \ + unsigned int rel = off_to_pcrel(jit, target) / 2; \ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), \ (op2) | (mask) << 12); \ REG_SET_SEEN(b1); \ @@ -248,7 +258,7 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT6_PCREL_RIEC(op1, op2, b1, imm, mask, target) \ ({ \ - unsigned int rel = (int)((target) - jit->prg) / 2; \ + unsigned int rel = off_to_pcrel(jit, target) / 2; \ _EMIT6((op1) | (reg_high(b1) | (mask)) << 16 | \ (rel & 0xffff), (op2) | ((imm) & 0xff) << 8); \ REG_SET_SEEN(b1); \ @@ -257,29 +267,41 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) #define EMIT6_PCREL(op1, op2, b1, b2, i, off, mask) \ ({ \ - int rel = (addrs[(i) + (off) + 1] - jit->prg) / 2; \ + int rel = off_to_pcrel(jit, addrs[(i) + (off) + 1]) / 2;\ _EMIT6((op1) | reg(b1, b2) << 16 | (rel & 0xffff), (op2) | (mask));\ REG_SET_SEEN(b1); \ REG_SET_SEEN(b2); \ }) +static void emit6_pcrel_ril(struct bpf_jit *jit, u32 op, s64 pcrel) +{ + u32 pc32dbl = (s32)(pcrel / 2); + + _EMIT6(op | pc32dbl >> 16, pc32dbl & 0xffff); +} + +static void emit6_pcrel_rilb(struct bpf_jit *jit, u32 op, u8 b, s64 pcrel) +{ + emit6_pcrel_ril(jit, op | reg_high(b) << 16, pcrel); + 
REG_SET_SEEN(b); +} + #define EMIT6_PCREL_RILB(op, b, target) \ -({ \ - unsigned int rel = (int)((target) - jit->prg) / 2; \ - _EMIT6((op) | reg_high(b) << 16 | rel >> 16, rel & 0xffff);\ - REG_SET_SEEN(b); \ -}) + emit6_pcrel_rilb(jit, op, b, off_to_pcrel(jit, target)) -#define EMIT6_PCREL_RIL(op, target) \ -({ \ - unsigned int rel = (int)((target) - jit->prg) / 2; \ - _EMIT6((op) | rel >> 16, rel & 0xffff); \ -}) +#define EMIT6_PCREL_RILB_PTR(op, b, target_ptr) \ + emit6_pcrel_rilb(jit, op, b, ptr_to_pcrel(jit, target_ptr)) + +static void emit6_pcrel_rilc(struct bpf_jit *jit, u32 op, u8 mask, s64 pcrel) +{ + emit6_pcrel_ril(jit, op | mask << 20, pcrel); +} #define EMIT6_PCREL_RILC(op, mask, target) \ -({ \ - EMIT6_PCREL_RIL((op) | (mask) << 20, (target)); \ -}) + emit6_pcrel_rilc(jit, op, mask, off_to_pcrel(jit, target)) + +#define EMIT6_PCREL_RILC_PTR(op, mask, target_ptr) \ + emit6_pcrel_rilc(jit, op, mask, ptr_to_pcrel(jit, target_ptr)) #define _EMIT6_IMM(op, imm) \ ({ \ @@ -503,7 +525,7 @@ static void bpf_skip(struct bpf_jit *jit, int size) { if (size >= 6 && !is_valid_rel(size)) { /* brcl 0xf,size */ - EMIT6_PCREL_RIL(0xc0f4000000, size); + EMIT6_PCREL_RILC(0xc0040000, 0xf, size); size -= 6; } else if (size >= 4 && is_valid_rel(size)) { /* brc 0xf,size */ @@ -605,43 +627,30 @@ static void bpf_jit_prologue(struct bpf_jit *jit, struct bpf_prog *fp, } /* Setup stack and backchain */ if (is_first_pass(jit) || (jit->seen & SEEN_STACK)) { - if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) - /* lgr %w1,%r15 (backchain) */ - EMIT4(0xb9040000, REG_W1, REG_15); + /* lgr %w1,%r15 (backchain) */ + EMIT4(0xb9040000, REG_W1, REG_15); /* la %bfp,STK_160_UNUSED(%r15) (BPF frame pointer) */ EMIT4_DISP(0x41000000, BPF_REG_FP, REG_15, STK_160_UNUSED); /* aghi %r15,-STK_OFF */ EMIT4_IMM(0xa70b0000, REG_15, -(STK_OFF + stack_depth)); - if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) - /* stg %w1,152(%r15) (backchain) */ - EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, - REG_15, 152); + /* stg %w1,152(%r15) (backchain) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, + REG_15, 152); } } /* - * Emit an expoline for a jump that follows + * Jump using a register either directly or via an expoline thunk */ -static void emit_expoline(struct bpf_jit *jit) -{ - /* exrl %r0,.+10 */ - EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10); - /* j . 
*/ - EMIT4_PCREL(0xa7f40000, 0); -} - -/* - * Emit __s390_indirect_jump_r1 thunk if necessary - */ -static void emit_r1_thunk(struct bpf_jit *jit) -{ - if (nospec_uses_trampoline()) { - jit->r1_thunk_ip = jit->prg; - emit_expoline(jit); - /* br %r1 */ - _EMIT2(0x07f1); - } -} +#define EMIT_JUMP_REG(reg) do { \ + if (nospec_uses_trampoline()) \ + /* brcl 0xf,__s390_indirect_jump_rN */ \ + EMIT6_PCREL_RILC_PTR(0xc0040000, 0x0f, \ + __s390_indirect_jump_r ## reg); \ + else \ + /* br %rN */ \ + _EMIT2(0x07f0 | reg); \ +} while (0) /* * Call r1 either directly or via __s390_indirect_jump_r1 thunk @@ -650,7 +659,8 @@ static void call_r1(struct bpf_jit *jit) { if (nospec_uses_trampoline()) /* brasl %r14,__s390_indirect_jump_r1 */ - EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip); + EMIT6_PCREL_RILB_PTR(0xc0050000, REG_14, + __s390_indirect_jump_r1); else /* basr %r14,%r1 */ EMIT2(0x0d00, REG_14, REG_1); @@ -666,16 +676,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) EMIT4(0xb9040000, REG_2, BPF_REG_0); /* Restore registers */ save_restore_regs(jit, REGS_RESTORE, stack_depth, 0); - if (nospec_uses_trampoline()) { - jit->r14_thunk_ip = jit->prg; - /* Generate __s390_indirect_jump_r14 thunk */ - emit_expoline(jit); - } - /* br %r14 */ - _EMIT2(0x07fe); - - if (is_first_pass(jit) || (jit->seen & SEEN_FUNC)) - emit_r1_thunk(jit); + EMIT_JUMP_REG(14); jit->prg = ALIGN(jit->prg, 8); jit->prologue_plt = jit->prg; @@ -1877,7 +1878,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, /* aghi %r1,tail_call_start */ EMIT4_IMM(0xa70b0000, REG_1, jit->tail_call_start); /* brcl 0xf,__s390_indirect_jump_r1 */ - EMIT6_PCREL_RILC(0xc0040000, 0xf, jit->r1_thunk_ip); + EMIT6_PCREL_RILC_PTR(0xc0040000, 0xf, + __s390_indirect_jump_r1); } else { /* bc 0xf,tail_call_start(%r1) */ _EMIT4(0x47f01000 + jit->tail_call_start); @@ -2585,9 +2587,8 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, if (nr_stack_args > MAX_NR_STACK_ARGS) return -ENOTSUPP; - /* Return to %r14, since func_addr and %r0 are not available. */ - if ((!func_addr && !(flags & BPF_TRAMP_F_ORIG_STACK)) || - (flags & BPF_TRAMP_F_INDIRECT)) + /* Return to %r14 in the struct_ops case. */ + if (flags & BPF_TRAMP_F_INDIRECT) flags |= BPF_TRAMP_F_SKIP_FRAME; /* @@ -2847,17 +2848,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, 0xf000 | tjit->tccnt_off); /* aghi %r15,stack_size */ EMIT4_IMM(0xa70b0000, REG_15, tjit->stack_size); - /* Emit an expoline for the following indirect jump. 
*/ - if (nospec_uses_trampoline()) - emit_expoline(jit); if (flags & BPF_TRAMP_F_SKIP_FRAME) - /* br %r14 */ - _EMIT2(0x07fe); + EMIT_JUMP_REG(14); else - /* br %r1 */ - _EMIT2(0x07f1); - - emit_r1_thunk(jit); + EMIT_JUMP_REG(1); return 0; } diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 5b464a568664..adcba7329386 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -143,6 +143,8 @@ #define SO_RCVPRIORITY 0x005b +#define SO_PASSRIGHTS 0x005c + #if !defined(__KERNEL__) diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile index 58ea4ef9b622..3453f330e363 100644 --- a/arch/sparc/kernel/Makefile +++ b/arch/sparc/kernel/Makefile @@ -35,6 +35,7 @@ obj-y += process.o obj-y += signal_$(BITS).o obj-y += sigutil_$(BITS).o obj-$(CONFIG_SPARC32) += ioport.o +obj-y += setup.o obj-y += setup_$(BITS).o obj-y += idprom.o obj-y += sys_sparc_$(BITS).o diff --git a/arch/sparc/kernel/setup.c b/arch/sparc/kernel/setup.c new file mode 100644 index 000000000000..4975867d9001 --- /dev/null +++ b/arch/sparc/kernel/setup.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <asm/setup.h> +#include <linux/sysctl.h> + +static const struct ctl_table sparc_sysctl_table[] = { + { + .procname = "reboot-cmd", + .data = reboot_command, + .maxlen = 256, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "stop-a", + .data = &stop_a_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "scons-poweroff", + .data = &scons_pwroff, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_SPARC64 + { + .procname = "tsb-ratio", + .data = &sysctl_tsb_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +}; + + +static int __init init_sparc_sysctls(void) +{ + register_sysctl_init("kernel", sparc_sysctl_table); + return 0; +} + +arch_initcall(init_sparc_sysctls); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 121f9f03bd5c..0be4937203c7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -38,6 +38,7 @@ config X86_64 select ARCH_HAS_ELFCORE_COMPAT select ZONE_DMA32 select EXECMEM if DYNAMIC_FTRACE + select ACPI_MRRM if ACPI config FORCE_DYNAMIC_FTRACE def_bool y @@ -507,8 +508,9 @@ config X86_MPPARSE config X86_CPU_RESCTRL bool "x86 CPU resource control support" depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD) - select KERNFS - select PROC_CPU_RESCTRL if PROC_FS + depends on MISC_FILESYSTEMS + select ARCH_HAS_CPU_RESCTRL + select RESCTRL_FS select RESCTRL_FS_PSEUDO_LOCK help Enable x86 CPU resource control support. @@ -526,12 +528,6 @@ config X86_CPU_RESCTRL Say N if unsure. -config RESCTRL_FS_PSEUDO_LOCK - bool - help - Software mechanism to pin data in a cache portion using - micro-architecture specific knowledge. - config X86_FRED bool "Flexible Return and Event Delivery" depends on X86_64 diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index b2569257acd3..fbc1215d2746 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1462,11 +1462,74 @@ e_restore_irq: return ret; } +/** + * snp_svsm_vtpm_probe() - Probe if SVSM provides a vTPM device + * + * Check that there is SVSM and that it supports at least TPM_SEND_COMMAND + * which is the only request used so far. + * + * Return: true if the platform provides a vTPM SVSM device, false otherwise. 
+ */ +static bool snp_svsm_vtpm_probe(void) +{ + struct svsm_call call = {}; + + /* The vTPM device is available only if a SVSM is present */ + if (!snp_vmpl) + return false; + + call.caa = svsm_get_caa(); + call.rax = SVSM_VTPM_CALL(SVSM_VTPM_QUERY); + + if (svsm_perform_call_protocol(&call)) + return false; + + /* Check platform commands contains TPM_SEND_COMMAND - platform command 8 */ + return call.rcx_out & BIT_ULL(8); +} + +/** + * snp_svsm_vtpm_send_command() - Execute a vTPM operation on SVSM + * @buffer: A buffer used to both send the command and receive the response. + * + * Execute a SVSM_VTPM_CMD call as defined by + * "Secure VM Service Module for SEV-SNP Guests" Publication # 58019 Revision: 1.00 + * + * All command request/response buffers have a common structure as specified by + * the following table: + * Byte Size    In/Out    Description + * Offset    (Bytes) + * 0x000     4          In        Platform command + *                        Out       Platform command response size + * + * Each command can build upon this common request/response structure to create + * a structure specific to the command. See include/linux/tpm_svsm.h for more + * details. + * + * Return: 0 on success, -errno on failure + */ +int snp_svsm_vtpm_send_command(u8 *buffer) +{ + struct svsm_call call = {}; + + call.caa = svsm_get_caa(); + call.rax = SVSM_VTPM_CALL(SVSM_VTPM_CMD); + call.rcx = __pa(buffer); + + return svsm_perform_call_protocol(&call); +} +EXPORT_SYMBOL_GPL(snp_svsm_vtpm_send_command); + static struct platform_device sev_guest_device = { .name = "sev-guest", .id = -1, }; +static struct platform_device tpm_svsm_device = { + .name = "tpm-svsm", + .id = -1, +}; + static int __init snp_init_platform_device(void) { if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) @@ -1475,7 +1538,11 @@ static int __init snp_init_platform_device(void) if (platform_device_register(&sev_guest_device)) return -ENODEV; - pr_info("SNP guest platform device initialized.\n"); + if (snp_svsm_vtpm_probe() && + platform_device_register(&tpm_svsm_device)) + return -ENODEV; + + pr_info("SNP guest platform devices initialized.\n"); return 0; } device_initcall(snp_init_platform_device); diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index bd6afe805cf6..feb93b50e990 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -177,7 +177,7 @@ static inline bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 ignored, return READ_ONCE(tsk->rmid) == rmid; } -static inline void resctrl_sched_in(struct task_struct *tsk) +static inline void resctrl_arch_sched_in(struct task_struct *tsk) { if (static_branch_likely(&rdt_enable_key)) __resctrl_sched_in(tsk); @@ -196,25 +196,22 @@ static inline u32 resctrl_arch_rmid_idx_encode(u32 ignored, u32 rmid) /* x86 can always read an rmid, nothing needs allocating */ struct rdt_resource; -static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, int evtid) +static inline void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r, + enum resctrl_event_id evtid) { might_sleep(); return NULL; -}; +} -static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, int evtid, - void *ctx) { }; +static inline void resctrl_arch_mon_ctx_free(struct rdt_resource *r, + enum resctrl_event_id evtid, + void *ctx) { } -u64 resctrl_arch_get_prefetch_disable_bits(void); -int resctrl_arch_pseudo_lock_fn(void *_plr); -int resctrl_arch_measure_cycles_lat_fn(void *_plr); -int resctrl_arch_measure_l2_residency(void *_plr); -int 
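The kerneldoc for snp_svsm_vtpm_send_command() above specifies only the common framing: a 4-byte platform command at offset 0 on input, overwritten with the response size on output, with the rest of the buffer command-specific. A minimal, hypothetical caller sketch under those assumptions follows; the struct name and helper are made up, and the value 8 is the TPM_SEND_COMMAND platform command that snp_svsm_vtpm_probe() checks for via BIT_ULL(8). The buffer is passed by physical address (__pa), so it must be physically contiguous, e.g. from kmalloc().

#include <linux/slab.h>
#include <linux/string.h>
#include <linux/types.h>
#include <asm/sev.h>

struct vtpm_buf {			/* illustrative layout only */
	u32 platform_cmd;		/* in: platform command, out: response size */
	u8 data[];			/* command-specific request/response body */
} __packed;

static int vtpm_send_example(const void *payload, size_t len)
{
	struct vtpm_buf *buf;
	int ret;

	buf = kzalloc(sizeof(*buf) + len, GFP_KERNEL);	/* physically contiguous */
	if (!buf)
		return -ENOMEM;

	buf->platform_cmd = 8;		/* TPM_SEND_COMMAND, per the probe above */
	memcpy(buf->data, payload, len);
	ret = snp_svsm_vtpm_send_command((u8 *)buf);	/* response lands in the same buffer */
	kfree(buf);
	return ret;
}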
resctrl_arch_measure_l3_residency(void *_plr); void resctrl_cpu_detect(struct cpuinfo_x86 *c); #else -static inline void resctrl_sched_in(struct task_struct *tsk) {} +static inline void resctrl_arch_sched_in(struct task_struct *tsk) {} static inline void resctrl_cpu_detect(struct cpuinfo_x86 *c) {} #endif /* CONFIG_X86_CPU_RESCTRL */ diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 6158893786d6..58e028d42e41 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -415,6 +415,10 @@ struct svsm_call { #define SVSM_ATTEST_SERVICES 0 #define SVSM_ATTEST_SINGLE_SERVICE 1 +#define SVSM_VTPM_CALL(x) ((2ULL << 32) | (x)) +#define SVSM_VTPM_QUERY 0 +#define SVSM_VTPM_CMD 1 + #ifdef CONFIG_AMD_MEM_ENCRYPT extern u8 snp_vmpl; @@ -512,6 +516,8 @@ void snp_msg_free(struct snp_msg_desc *mdesc); int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, struct snp_guest_request_ioctl *rio); +int snp_svsm_vtpm_send_command(u8 *buffer); + void __init snp_secure_tsc_prepare(void); void __init snp_secure_tsc_init(void); @@ -583,6 +589,7 @@ static inline struct snp_msg_desc *snp_msg_alloc(void) { return NULL; } static inline void snp_msg_free(struct snp_msg_desc *mdesc) { } static inline int snp_send_guest_request(struct snp_msg_desc *mdesc, struct snp_guest_req *req, struct snp_guest_request_ioctl *rio) { return -ENODEV; } +static inline int snp_svsm_vtpm_send_command(u8 *buffer) { return -ENODEV; } static inline void __init snp_secure_tsc_prepare(void) { } static inline void __init snp_secure_tsc_init(void) { } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index fee42a73d64a..93069b13d3af 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -864,7 +864,7 @@ void lapic_offline(void) __vector_cleanup(cl, false); irq_matrix_offline(vector_matrix); - WARN_ON_ONCE(try_to_del_timer_sync(&cl->timer) < 0); + WARN_ON_ONCE(timer_delete_sync_try(&cl->timer) < 0); WARN_ON_ONCE(!hlist_empty(&cl->head)); unlock_vector_lock(); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index e2c6b471d230..8c18327eb10b 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -593,7 +593,7 @@ static void get_fixed_ranges(mtrr_type *frs) void mtrr_save_fixed_ranges(void *info) { - if (boot_cpu_has(X86_FEATURE_MTRR)) + if (mtrr_state.have_fixed) get_fixed_ranges(mtrr_state.fixed_ranges); } diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile index 0c13b0befd8a..d8a04b195da2 100644 --- a/arch/x86/kernel/cpu/resctrl/Makefile +++ b/arch/x86/kernel/cpu/resctrl/Makefile @@ -2,4 +2,6 @@ obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o + +# To allow define_trace.h's recursive include: CFLAGS_pseudo_lock.o = -I$(src) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index d987b11c168c..7109cbfcad4f 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -61,7 +61,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { [RDT_RESOURCE_L3] = { .r_resctrl = { - .rid = RDT_RESOURCE_L3, .name = "L3", .ctrl_scope = RESCTRL_L3_CACHE, .mon_scope = RESCTRL_L3_CACHE, @@ -75,7 +74,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { [RDT_RESOURCE_L2] = { .r_resctrl = { - .rid = RDT_RESOURCE_L2, .name = "L2", 
.ctrl_scope = RESCTRL_L2_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2), @@ -87,7 +85,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { [RDT_RESOURCE_MBA] = { .r_resctrl = { - .rid = RDT_RESOURCE_MBA, .name = "MB", .ctrl_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA), @@ -97,7 +94,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { [RDT_RESOURCE_SMBA] = { .r_resctrl = { - .rid = RDT_RESOURCE_SMBA, .name = "SMBA", .ctrl_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA), @@ -165,21 +161,6 @@ static inline void cache_alloc_hsw_probe(void) rdt_alloc_capable = true; } -bool is_mba_sc(struct rdt_resource *r) -{ - if (!r) - r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - - /* - * The software controller support is only applicable to MBA resource. - * Make sure to check for resource type. - */ - if (r->rid != RDT_RESOURCE_MBA) - return false; - - return r->membw.mba_sc; -} - /* * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values * exposed to user interface and the h/w understandable delay values. @@ -738,7 +719,7 @@ struct rdt_options { bool force_off, force_on; }; -static struct rdt_options rdt_options[] __initdata = { +static struct rdt_options rdt_options[] __ro_after_init = { RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), @@ -778,7 +759,7 @@ static int __init set_rdt_options(char *str) } __setup("rdt", set_rdt_options); -bool __init rdt_cpu_has(int flag) +bool rdt_cpu_has(int flag) { bool ret = boot_cpu_has(flag); struct rdt_options *o; @@ -798,7 +779,7 @@ bool __init rdt_cpu_has(int flag) return ret; } -__init bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) +bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) { if (!rdt_cpu_has(X86_FEATURE_BMEC)) return false; @@ -1012,7 +993,11 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) static int __init resctrl_arch_late_init(void) { struct rdt_resource *r; - int state, ret; + int state, ret, i; + + /* for_each_rdt_resource() requires all rid to be initialised. */ + for (i = 0; i < RDT_NUM_RESOURCES; i++) + rdt_resources_all[i].r_resctrl.rid = i; /* * Initialize functions(or definitions) that are different diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 0a0ac5f6112e..1189c0df4ad7 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -16,277 +16,9 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cpu.h> -#include <linux/kernfs.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include <linux/tick.h> #include "internal.h" -struct rdt_parse_data { - struct rdtgroup *rdtgrp; - char *buf; -}; - -typedef int (ctrlval_parser_t)(struct rdt_parse_data *data, - struct resctrl_schema *s, - struct rdt_ctrl_domain *d); - -/* - * Check whether MBA bandwidth percentage value is correct. The value is - * checked against the minimum and max bandwidth values specified by the - * hardware. The allocated bandwidth percentage is rounded to the next - * control step available on the hardware. - */ -static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) -{ - int ret; - u32 bw; - - /* - * Only linear delay values is supported for current Intel SKUs. 
- */ - if (!r->membw.delay_linear && r->membw.arch_needs_linear) { - rdt_last_cmd_puts("No support for non-linear MB domains\n"); - return false; - } - - ret = kstrtou32(buf, 10, &bw); - if (ret) { - rdt_last_cmd_printf("Invalid MB value %s\n", buf); - return false; - } - - /* Nothing else to do if software controller is enabled. */ - if (is_mba_sc(r)) { - *data = bw; - return true; - } - - if (bw < r->membw.min_bw || bw > r->membw.max_bw) { - rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", - bw, r->membw.min_bw, r->membw.max_bw); - return false; - } - - *data = roundup(bw, (unsigned long)r->membw.bw_gran); - return true; -} - -static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_ctrl_domain *d) -{ - struct resctrl_staged_config *cfg; - u32 closid = data->rdtgrp->closid; - struct rdt_resource *r = s->res; - u32 bw_val; - - cfg = &d->staged_config[s->conf_type]; - if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); - return -EINVAL; - } - - if (!bw_validate(data->buf, &bw_val, r)) - return -EINVAL; - - if (is_mba_sc(r)) { - d->mbps_val[closid] = bw_val; - return 0; - } - - cfg->new_ctrl = bw_val; - cfg->have_new_ctrl = true; - - return 0; -} - -/* - * Check whether a cache bit mask is valid. - * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: - * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1 - * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1 - * - * Haswell does not support a non-contiguous 1s value and additionally - * requires at least two bits set. - * AMD allows non-contiguous bitmasks. - */ -static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) -{ - u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1; - unsigned int cbm_len = r->cache.cbm_len; - unsigned long first_bit, zero_bit, val; - int ret; - - ret = kstrtoul(buf, 16, &val); - if (ret) { - rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); - return false; - } - - if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) { - rdt_last_cmd_puts("Mask out of range\n"); - return false; - } - - first_bit = find_first_bit(&val, cbm_len); - zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - - /* Are non-contiguous bitmasks allowed? */ - if (!r->cache.arch_has_sparse_bitmasks && - (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { - rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); - return false; - } - - if ((zero_bit - first_bit) < r->cache.min_cbm_bits) { - rdt_last_cmd_printf("Need at least %d bits in the mask\n", - r->cache.min_cbm_bits); - return false; - } - - *data = val; - return true; -} - -/* - * Read one cache bit mask (hex). Check that it is valid for the current - * resource type. - */ -static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_ctrl_domain *d) -{ - struct rdtgroup *rdtgrp = data->rdtgrp; - struct resctrl_staged_config *cfg; - struct rdt_resource *r = s->res; - u32 cbm_val; - - cfg = &d->staged_config[s->conf_type]; - if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); - return -EINVAL; - } - - /* - * Cannot set up more than one pseudo-locked region in a cache - * hierarchy. 
- */ - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && - rdtgroup_pseudo_locked_in_hierarchy(d)) { - rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n"); - return -EINVAL; - } - - if (!cbm_validate(data->buf, &cbm_val, r)) - return -EINVAL; - - if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || - rdtgrp->mode == RDT_MODE_SHAREABLE) && - rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { - rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n"); - return -EINVAL; - } - - /* - * The CBM may not overlap with the CBM of another closid if - * either is exclusive. - */ - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { - rdt_last_cmd_puts("Overlaps with exclusive group\n"); - return -EINVAL; - } - - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { - if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - rdt_last_cmd_puts("Overlaps with other group\n"); - return -EINVAL; - } - } - - cfg->new_ctrl = cbm_val; - cfg->have_new_ctrl = true; - - return 0; -} - -/* - * For each domain in this resource we expect to find a series of: - * id=mask - * separated by ";". The "id" is in decimal, and must match one of - * the "id"s for this resource. - */ -static int parse_line(char *line, struct resctrl_schema *s, - struct rdtgroup *rdtgrp) -{ - enum resctrl_conf_type t = s->conf_type; - ctrlval_parser_t *parse_ctrlval = NULL; - struct resctrl_staged_config *cfg; - struct rdt_resource *r = s->res; - struct rdt_parse_data data; - struct rdt_ctrl_domain *d; - char *dom = NULL, *id; - unsigned long dom_id; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - switch (r->schema_fmt) { - case RESCTRL_SCHEMA_BITMAP: - parse_ctrlval = &parse_cbm; - break; - case RESCTRL_SCHEMA_RANGE: - parse_ctrlval = &parse_bw; - break; - } - - if (WARN_ON_ONCE(!parse_ctrlval)) - return -EINVAL; - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && - (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { - rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); - return -EINVAL; - } - -next: - if (!line || line[0] == '\0') - return 0; - dom = strsep(&line, ";"); - id = strsep(&dom, "="); - if (!dom || kstrtoul(id, 10, &dom_id)) { - rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); - return -EINVAL; - } - dom = strim(dom); - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - if (d->hdr.id == dom_id) { - data.buf = dom; - data.rdtgrp = rdtgrp; - if (parse_ctrlval(&data, s, d)) - return -EINVAL; - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - cfg = &d->staged_config[t]; - /* - * In pseudo-locking setup mode and just - * parsed a valid CBM that should be - * pseudo-locked. Only one locked region per - * resource group and domain so just do - * the required initialization for single - * region and return. 
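cbm_validate(), removed just above, checks that a capacity bitmask is in range, forms a single contiguous run of 1s (unless sparse bitmasks are supported) and is at least the minimum width. A self-contained version of that check, with an open-coded stand-in for the kernel's bit-search helpers:

#include <stdbool.h>
#include <stdio.h>

/* Find the next bit with value @set in @v, starting at @from; returns @len if none. */
static unsigned int find_next(unsigned long v, unsigned int len,
			      unsigned int from, int set)
{
	for (unsigned int i = from; i < len; i++)
		if (!!(v & (1UL << i)) == set)
			return i;
	return len;
}

static bool cbm_ok(unsigned long val, unsigned int cbm_len, unsigned int min_bits)
{
	unsigned int first, zero;

	if (!val || val > (1UL << cbm_len) - 1)
		return false;

	first = find_next(val, cbm_len, 0, 1);		/* first set bit   */
	zero  = find_next(val, cbm_len, first, 0);	/* end of that run */

	/* Any further set bit means a second, non-contiguous run of 1s. */
	if (find_next(val, cbm_len, zero, 1) < cbm_len)
		return false;

	return (zero - first) >= min_bits;
}

int main(void)
{
	printf("%d %d\n", cbm_ok(0xf0, 12, 2), cbm_ok(0xf0f, 12, 2));	/* 1 0 */
	return 0;
}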
- */ - rdtgrp->plr->s = s; - rdtgrp->plr->d = d; - rdtgrp->plr->cbm = cfg->new_ctrl; - d->plr = rdtgrp->plr; - return 0; - } - goto next; - } - } - return -EINVAL; -} - int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val) { @@ -351,100 +83,6 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) return 0; } -static int rdtgroup_parse_resource(char *resname, char *tok, - struct rdtgroup *rdtgrp) -{ - struct resctrl_schema *s; - - list_for_each_entry(s, &resctrl_schema_all, list) { - if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid) - return parse_line(tok, s, rdtgrp); - } - rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); - return -EINVAL; -} - -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct resctrl_schema *s; - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - char *tok, *resname; - int ret = 0; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - buf[nbytes - 1] = '\0'; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - rdt_last_cmd_clear(); - - /* - * No changes to pseudo-locked region allowed. It has to be removed - * and re-created instead. - */ - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - ret = -EINVAL; - rdt_last_cmd_puts("Resource group is pseudo-locked\n"); - goto out; - } - - rdt_staged_configs_clear(); - - while ((tok = strsep(&buf, "\n")) != NULL) { - resname = strim(strsep(&tok, ":")); - if (!tok) { - rdt_last_cmd_puts("Missing ':'\n"); - ret = -EINVAL; - goto out; - } - if (tok[0] == '\0') { - rdt_last_cmd_printf("Missing '%s' value\n", resname); - ret = -EINVAL; - goto out; - } - ret = rdtgroup_parse_resource(resname, tok, rdtgrp); - if (ret) - goto out; - } - - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - - /* - * Writes to mba_sc resources update the software controller, - * not the control MSR. - */ - if (is_mba_sc(r)) - continue; - - ret = resctrl_arch_update_domains(r, rdtgrp->closid); - if (ret) - goto out; - } - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - /* - * If pseudo-locking fails we keep the resource group in - * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service - * active and updated for just the domain the pseudo-locked - * region was requested for. 
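The parser removed here tokenises a schemata line of the form "id=mask;id=mask". A minimal userspace sketch of that strsep()-based tokenisation, with the error handling trimmed down:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char line[] = "0=f;1=f0;2=ff";
	char *rest = line, *dom, *id;

	while ((dom = strsep(&rest, ";")) && *dom) {
		id = strsep(&dom, "=");
		if (!dom) {
			fprintf(stderr, "Missing '=' in '%s'\n", id);
			return 1;
		}
		/* "id" selects the domain, "dom" now points at the mask/value. */
		printf("domain %ld -> value %s\n", strtol(id, NULL, 10), dom);
	}
	return 0;
}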
- */ - ret = rdtgroup_pseudo_lock_create(rdtgrp); - } - -out: - rdt_staged_configs_clear(); - rdtgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type) { @@ -453,276 +91,3 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, return hw_dom->ctrl_val[idx]; } - -static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) -{ - struct rdt_resource *r = schema->res; - struct rdt_ctrl_domain *dom; - bool sep = false; - u32 ctrl_val; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - seq_printf(s, "%*s:", max_name_width, schema->name); - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { - if (sep) - seq_puts(s, ";"); - - if (is_mba_sc(r)) - ctrl_val = dom->mbps_val[closid]; - else - ctrl_val = resctrl_arch_get_config(r, dom, closid, - schema->conf_type); - - seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val); - sep = true; - } - seq_puts(s, "\n"); -} - -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct resctrl_schema *schema; - struct rdtgroup *rdtgrp; - int ret = 0; - u32 closid; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - list_for_each_entry(schema, &resctrl_schema_all, list) { - seq_printf(s, "%s:uninitialized\n", schema->name); - } - } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - if (!rdtgrp->plr->d) { - rdt_last_cmd_clear(); - rdt_last_cmd_puts("Cache domain offline\n"); - ret = -ENODEV; - } else { - seq_printf(s, "%s:%d=%x\n", - rdtgrp->plr->s->res->name, - rdtgrp->plr->d->hdr.id, - rdtgrp->plr->cbm); - } - } else { - closid = rdtgrp->closid; - list_for_each_entry(schema, &resctrl_schema_all, list) { - if (closid < schema->num_closid) - show_doms(s, schema, closid); - } - } - } else { - ret = -ENOENT; - } - rdtgroup_kn_unlock(of->kn); - return ret; -} - -static int smp_mon_event_count(void *arg) -{ - mon_event_count(arg); - - return 0; -} - -ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - buf[nbytes - 1] = '\0'; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - rdt_last_cmd_clear(); - - if (!strcmp(buf, "mbm_local_bytes")) { - if (resctrl_arch_is_mbm_local_enabled()) - rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; - else - ret = -EINVAL; - } else if (!strcmp(buf, "mbm_total_bytes")) { - if (resctrl_arch_is_mbm_total_enabled()) - rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; - else - ret = -EINVAL; - } else { - ret = -EINVAL; - } - - if (ret) - rdt_last_cmd_printf("Unsupported event id '%s'\n", buf); - - rdtgroup_kn_unlock(of->kn); - - return ret ?: nbytes; -} - -int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - - if (rdtgrp) { - switch (rdtgrp->mba_mbps_event) { - case QOS_L3_MBM_LOCAL_EVENT_ID: - seq_puts(s, "mbm_local_bytes\n"); - break; - case QOS_L3_MBM_TOTAL_EVENT_ID: - seq_puts(s, "mbm_total_bytes\n"); - break; - default: - pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event); - ret = -EINVAL; - break; - } - } else 
{ - ret = -ENOENT; - } - - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, - struct list_head **pos) -{ - struct rdt_domain_hdr *d; - struct list_head *l; - - list_for_each(l, h) { - d = list_entry(l, struct rdt_domain_hdr, list); - /* When id is found, return its domain. */ - if (id == d->id) - return d; - /* Stop searching when finding id's position in sorted list. */ - if (id < d->id) - break; - } - - if (pos) - *pos = l; - - return NULL; -} - -void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first) -{ - int cpu; - - /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - /* - * Setup the parameters to pass to mon_event_count() to read the data. - */ - rr->rgrp = rdtgrp; - rr->evtid = evtid; - rr->r = r; - rr->d = d; - rr->first = first; - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); - if (IS_ERR(rr->arch_mon_ctx)) { - rr->err = -EINVAL; - return; - } - - cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); - - /* - * cpumask_any_housekeeping() prefers housekeeping CPUs, but - * are all the CPUs nohz_full? If yes, pick a CPU to IPI. - * MPAM's resctrl_arch_rmid_read() is unable to read the - * counters on some platforms if its called in IRQ context. - */ - if (tick_nohz_full_cpu(cpu)) - smp_call_function_any(cpumask, mon_event_count, rr, 1); - else - smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); - - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); -} - -int rdtgroup_mondata_show(struct seq_file *m, void *arg) -{ - struct kernfs_open_file *of = m->private; - struct rdt_domain_hdr *hdr; - struct rmid_read rr = {0}; - struct rdt_mon_domain *d; - u32 resid, evtid, domid; - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - union mon_data_bits md; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - ret = -ENOENT; - goto out; - } - - md.priv = of->kn->priv; - resid = md.u.rid; - domid = md.u.domid; - evtid = md.u.evtid; - r = resctrl_arch_get_resource(resid); - - if (md.u.sum) { - /* - * This file requires summing across all domains that share - * the L3 cache id that was provided in the "domid" field of the - * mon_data_bits union. Search all domains in the resource for - * one that matches this cache id. - */ - list_for_each_entry(d, &r->mon_domains, hdr.list) { - if (d->ci->id == domid) { - rr.ci = d->ci; - mon_event_read(&rr, r, NULL, rdtgrp, - &d->ci->shared_cpu_map, evtid, false); - goto checkresult; - } - } - ret = -ENOENT; - goto out; - } else { - /* - * This file provides data from a single domain. Search - * the resource to find the domain with "domid". 
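resctrl_find_domain(), shown above, depends on the domain list staying sorted by id: the walk can stop early, and the break point doubles as the insertion position for callers adding a new domain. A simplified singly-linked variant of the lookup (the kernel version uses struct list_head and also reports that position):

#include <stddef.h>
#include <stdio.h>

struct dom {
	int id;
	struct dom *next;	/* list kept sorted by id */
};

/* Return the domain with @id, or NULL. Stops early thanks to the sort order. */
static struct dom *find_domain(struct dom *head, int id)
{
	for (struct dom *d = head; d; d = d->next) {
		if (d->id == id)
			return d;
		if (id < d->id)
			break;	/* id would have been here: it does not exist */
	}
	return NULL;
}

int main(void)
{
	struct dom d4 = { 4, NULL }, d2 = { 2, &d4 }, d0 = { 0, &d2 };

	printf("id 2: %s\n", find_domain(&d0, 2) ? "found" : "missing");
	printf("id 3: %s\n", find_domain(&d0, 3) ? "found" : "missing");
	return 0;
}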
- */ - hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); - if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { - ret = -ENOENT; - goto out; - } - d = container_of(hdr, struct rdt_mon_domain, hdr); - mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); - } - -checkresult: - - if (rr.err == -EIO) - seq_puts(m, "Error\n"); - else if (rr.err == -EINVAL) - seq_puts(m, "Unavailable\n"); - else - seq_printf(m, "%llu\n", rr.val); - -out: - rdtgroup_kn_unlock(of->kn); - return ret; -} diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index eaae99602b61..5e3c41b36437 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -3,28 +3,21 @@ #define _ASM_X86_RESCTRL_INTERNAL_H #include <linux/resctrl.h> -#include <linux/sched.h> -#include <linux/kernfs.h> -#include <linux/fs_context.h> -#include <linux/jump_label.h> -#include <linux/tick.h> - -#include <asm/resctrl.h> #define L3_QOS_CDP_ENABLE 0x01ULL #define L2_QOS_CDP_ENABLE 0x01ULL -#define CQM_LIMBOCHECK_INTERVAL 1000 - #define MBM_CNTR_WIDTH_BASE 24 -#define MBM_OVERFLOW_INTERVAL 1000 -#define MAX_MBA_BW 100u + #define MBA_IS_LINEAR 0x4 + #define MBM_CNTR_WIDTH_OFFSET_AMD 20 #define RMID_VAL_ERROR BIT_ULL(63) + #define RMID_VAL_UNAVAIL BIT_ULL(62) + /* * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for * data to be returned. The counter width is discovered from the hardware @@ -33,278 +26,6 @@ #define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) /** - * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that - * aren't marked nohz_full - * @mask: The mask to pick a CPU from. - * @exclude_cpu:The CPU to avoid picking. - * - * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping - * CPUs that don't use nohz_full, these are preferred. Pass - * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. - * - * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. - */ -static inline unsigned int -cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) -{ - unsigned int cpu, hk_cpu; - - if (exclude_cpu == RESCTRL_PICK_ANY_CPU) - cpu = cpumask_any(mask); - else - cpu = cpumask_any_but(mask, exclude_cpu); - - /* Only continue if tick_nohz_full_mask has been initialized. */ - if (!tick_nohz_full_enabled()) - return cpu; - - /* If the CPU picked isn't marked nohz_full nothing more needs doing. 
*/ - if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu)) - return cpu; - - /* Try to find a CPU that isn't nohz_full to use in preference */ - hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask); - if (hk_cpu == exclude_cpu) - hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask); - - if (hk_cpu < nr_cpu_ids) - cpu = hk_cpu; - - return cpu; -} - -struct rdt_fs_context { - struct kernfs_fs_context kfc; - bool enable_cdpl2; - bool enable_cdpl3; - bool enable_mba_mbps; - bool enable_debug; -}; - -static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) -{ - struct kernfs_fs_context *kfc = fc->fs_private; - - return container_of(kfc, struct rdt_fs_context, kfc); -} - -/** - * struct mon_evt - Entry in the event list of a resource - * @evtid: event id - * @name: name of the event - * @configurable: true if the event is configurable - * @list: entry in &rdt_resource->evt_list - */ -struct mon_evt { - enum resctrl_event_id evtid; - char *name; - bool configurable; - struct list_head list; -}; - -/** - * union mon_data_bits - Monitoring details for each event file. - * @priv: Used to store monitoring event data in @u - * as kernfs private data. - * @u.rid: Resource id associated with the event file. - * @u.evtid: Event id associated with the event file. - * @u.sum: Set when event must be summed across multiple - * domains. - * @u.domid: When @u.sum is zero this is the domain to which - * the event file belongs. When @sum is one this - * is the id of the L3 cache that all domains to be - * summed share. - * @u: Name of the bit fields struct. - */ -union mon_data_bits { - void *priv; - struct { - unsigned int rid : 10; - enum resctrl_event_id evtid : 7; - unsigned int sum : 1; - unsigned int domid : 14; - } u; -}; - -/** - * struct rmid_read - Data passed across smp_call*() to read event count. - * @rgrp: Resource group for which the counter is being read. If it is a parent - * resource group then its event count is summed with the count from all - * its child resource groups. - * @r: Resource describing the properties of the event being read. - * @d: Domain that the counter should be read from. If NULL then sum all - * domains in @r sharing L3 @ci.id - * @evtid: Which monitor event to read. - * @first: Initialize MBM counter when true. - * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. - * @err: Error encountered when reading counter. - * @val: Returned value of event counter. If @rgrp is a parent resource group, - * @val includes the sum of event counts from its child resource groups. - * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id, - * (summed across child resource groups if @rgrp is a parent resource group). - * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only). 
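union mon_data_bits, removed in this hunk, packs the resource id, event id, sum flag and domain id straight into the kernfs private pointer so no per-file allocation is needed. A self-contained illustration of that packing trick, with the field widths copied from the struct above:

#include <assert.h>
#include <stdio.h>

union mon_data_bits {
	void *priv;
	struct {
		unsigned int rid   : 10;
		unsigned int evtid : 7;
		unsigned int sum   : 1;
		unsigned int domid : 14;
	} u;
};

int main(void)
{
	union mon_data_bits md = { .priv = NULL };

	/* Pack the identifiers into the bit-fields ... */
	md.u.rid   = 1;		/* e.g. the L3 resource  */
	md.u.evtid = 2;		/* e.g. mbm_total_bytes  */
	md.u.sum   = 0;
	md.u.domid = 3;

	/* ... store md.priv as kernfs private data, and unpack it again on read. */
	union mon_data_bits back = { .priv = md.priv };

	assert(back.u.rid == 1 && back.u.domid == 3);
	printf("fits in one pointer: %zu <= %zu\n", sizeof(md.u), sizeof(void *));
	return 0;
}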
- */ -struct rmid_read { - struct rdtgroup *rgrp; - struct rdt_resource *r; - struct rdt_mon_domain *d; - enum resctrl_event_id evtid; - bool first; - struct cacheinfo *ci; - int err; - u64 val; - void *arch_mon_ctx; -}; - -extern struct list_head resctrl_schema_all; -extern bool resctrl_mounted; - -enum rdt_group_type { - RDTCTRL_GROUP = 0, - RDTMON_GROUP, - RDT_NUM_GROUP, -}; - -/** - * enum rdtgrp_mode - Mode of a RDT resource group - * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations - * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed - * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking - * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations - * allowed AND the allocations are Cache Pseudo-Locked - * @RDT_NUM_MODES: Total number of modes - * - * The mode of a resource group enables control over the allowed overlap - * between allocations associated with different resource groups (classes - * of service). User is able to modify the mode of a resource group by - * writing to the "mode" resctrl file associated with the resource group. - * - * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by - * writing the appropriate text to the "mode" file. A resource group enters - * "pseudo-locked" mode after the schemata is written while the resource - * group is in "pseudo-locksetup" mode. - */ -enum rdtgrp_mode { - RDT_MODE_SHAREABLE = 0, - RDT_MODE_EXCLUSIVE, - RDT_MODE_PSEUDO_LOCKSETUP, - RDT_MODE_PSEUDO_LOCKED, - - /* Must be last */ - RDT_NUM_MODES, -}; - -/** - * struct mongroup - store mon group's data in resctrl fs. - * @mon_data_kn: kernfs node for the mon_data directory - * @parent: parent rdtgrp - * @crdtgrp_list: child rdtgroup node list - * @rmid: rmid for this rdtgroup - */ -struct mongroup { - struct kernfs_node *mon_data_kn; - struct rdtgroup *parent; - struct list_head crdtgrp_list; - u32 rmid; -}; - -/** - * struct rdtgroup - store rdtgroup's data in resctrl file system. - * @kn: kernfs node - * @rdtgroup_list: linked list for all rdtgroups - * @closid: closid for this rdtgroup - * @cpu_mask: CPUs assigned to this rdtgroup - * @flags: status bits - * @waitcount: how many cpus expect to find this - * group when they acquire rdtgroup_mutex - * @type: indicates type of this rdtgroup - either - * monitor only or ctrl_mon group - * @mon: mongroup related data - * @mode: mode of resource group - * @mba_mbps_event: input monitoring event id when mba_sc is enabled - * @plr: pseudo-locked region - */ -struct rdtgroup { - struct kernfs_node *kn; - struct list_head rdtgroup_list; - u32 closid; - struct cpumask cpu_mask; - int flags; - atomic_t waitcount; - enum rdt_group_type type; - struct mongroup mon; - enum rdtgrp_mode mode; - enum resctrl_event_id mba_mbps_event; - struct pseudo_lock_region *plr; -}; - -/* rdtgroup.flags */ -#define RDT_DELETED 1 - -/* rftype.flags */ -#define RFTYPE_FLAGS_CPUS_LIST 1 - -/* - * Define the file type flags for base and info directories. 
- */ -#define RFTYPE_INFO BIT(0) -#define RFTYPE_BASE BIT(1) -#define RFTYPE_CTRL BIT(4) -#define RFTYPE_MON BIT(5) -#define RFTYPE_TOP BIT(6) -#define RFTYPE_RES_CACHE BIT(8) -#define RFTYPE_RES_MB BIT(9) -#define RFTYPE_DEBUG BIT(10) -#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) -#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) -#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) -#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) -#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON) - -/* List of all resource groups */ -extern struct list_head rdt_all_groups; - -extern int max_name_width; - -/** - * struct rftype - describe each file in the resctrl file system - * @name: File name - * @mode: Access mode - * @kf_ops: File operations - * @flags: File specific RFTYPE_FLAGS_* flags - * @fflags: File specific RFTYPE_* flags - * @seq_show: Show content of the file - * @write: Write to the file - */ -struct rftype { - char *name; - umode_t mode; - const struct kernfs_ops *kf_ops; - unsigned long flags; - unsigned long fflags; - - int (*seq_show)(struct kernfs_open_file *of, - struct seq_file *sf, void *v); - /* - * write() is the generic write callback which maps directly to - * kernfs write operation and overrides all other operations. - * Maximum write size is determined by ->max_write_len. - */ - ssize_t (*write)(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -}; - -/** - * struct mbm_state - status for each MBM counter in each domain - * @prev_bw_bytes: Previous bytes value read for bandwidth calculation - * @prev_bw: The most recent bandwidth in MBps - */ -struct mbm_state { - u64 prev_bw_bytes; - u32 prev_bw; -}; - -/** * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s * return value. * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) @@ -401,24 +122,7 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r return container_of(r, struct rdt_hw_resource, r_resctrl); } -extern struct mutex rdtgroup_mutex; - -static inline const char *rdt_kn_name(const struct kernfs_node *kn) -{ - return rcu_dereference_check(kn->name, lockdep_is_held(&rdtgroup_mutex)); -} - extern struct rdt_hw_resource rdt_resources_all[]; -extern struct rdtgroup rdtgroup_default; -extern struct dentry *debugfs_resctrl; -extern enum resctrl_event_id mba_mbps_default_event; - -static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) -{ - return rdt_resources_all[l].cdp_enabled; -} - -int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d); @@ -455,99 +159,14 @@ union cpuid_0x10_x_edx { unsigned int full; }; -void rdt_last_cmd_clear(void); -void rdt_last_cmd_puts(const char *s); -__printf(1, 2) -void rdt_last_cmd_printf(const char *fmt, ...); - void rdt_ctrl_update(void *arg); -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); -void rdtgroup_kn_unlock(struct kernfs_node *kn); -int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); -int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, - umode_t mask); -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v); -ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, - struct 
seq_file *s, void *v); -bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, - unsigned long cbm, int closid, bool exclusive); -unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d, - unsigned long cbm); -enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); -int rdtgroup_tasks_assigned(struct rdtgroup *r); -int closids_supported(void); -void closid_free(int closid); -int alloc_rmid(u32 closid); -void free_rmid(u32 closid, u32 rmid); -int rdt_get_mon_l3_config(struct rdt_resource *r); -void resctrl_mon_resource_exit(void); -bool __init rdt_cpu_has(int flag); -void mon_event_count(void *info); -int rdtgroup_mondata_show(struct seq_file *m, void *arg); -void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first); -int __init resctrl_mon_resource_init(void); -void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, - unsigned long delay_ms, - int exclude_cpu); -void mbm_handle_overflow(struct work_struct *work); -void __init intel_rdt_mbm_apply_quirk(void); -bool is_mba_sc(struct rdt_resource *r); -void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, - int exclude_cpu); -void cqm_handle_limbo(struct work_struct *work); -bool has_busy_rmid(struct rdt_mon_domain *d); -void __check_limbo(struct rdt_mon_domain *d, bool force_free); -void rdt_domain_reconfigure_cdp(struct rdt_resource *r); -void resctrl_file_fflags_init(const char *config, unsigned long fflags); -void rdt_staged_configs_clear(void); -bool closid_allocated(unsigned int closid); -int resctrl_find_cleanest_closid(void); - -#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK -int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); -int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm); -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d); -int rdt_pseudo_lock_init(void); -void rdt_pseudo_lock_release(void); -int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); -void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); -#else -static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) -{ - return -EOPNOTSUPP; -} -static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) -{ - return -EOPNOTSUPP; -} - -static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) -{ - return false; -} +int rdt_get_mon_l3_config(struct rdt_resource *r); -static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) -{ - return false; -} +bool rdt_cpu_has(int flag); -static inline int rdt_pseudo_lock_init(void) { return 0; } -static inline void rdt_pseudo_lock_release(void) { } -static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) -{ - return -EOPNOTSUPP; -} +void __init intel_rdt_mbm_apply_quirk(void); -static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { } -#endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */ +void rdt_domain_reconfigure_cdp(struct rdt_resource *r); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 591b0b44d260..c261558276cd 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -18,63 +18,12 @@ #define pr_fmt(fmt) "resctrl: " fmt #include <linux/cpu.h> -#include <linux/module.h> -#include <linux/sizes.h> -#include 
<linux/slab.h> +#include <linux/resctrl.h> #include <asm/cpu_device_id.h> #include <asm/msr.h> -#include <asm/resctrl.h> #include "internal.h" -#include "trace.h" - -/** - * struct rmid_entry - dirty tracking for all RMID. - * @closid: The CLOSID for this entry. - * @rmid: The RMID for this entry. - * @busy: The number of domains with cached data using this RMID. - * @list: Member of the rmid_free_lru list when busy == 0. - * - * Depending on the architecture the correct monitor is accessed using - * both @closid and @rmid, or @rmid only. - * - * Take the rdtgroup_mutex when accessing. - */ -struct rmid_entry { - u32 closid; - u32 rmid; - int busy; - struct list_head list; -}; - -/* - * @rmid_free_lru - A least recently used list of free RMIDs - * These RMIDs are guaranteed to have an occupancy less than the - * threshold occupancy - */ -static LIST_HEAD(rmid_free_lru); - -/* - * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. - * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. - * Indexed by CLOSID. Protected by rdtgroup_mutex. - */ -static u32 *closid_num_dirty_rmid; - -/* - * @rmid_limbo_count - count of currently unused but (potentially) - * dirty RMIDs. - * This counts RMIDs that no one is currently using but that - * may have a occupancy value > resctrl_rmid_realloc_threshold. User can - * change the threshold occupancy value. - */ -static unsigned int rmid_limbo_count; - -/* - * @rmid_entry - The entry in the limbo and free lists. - */ -static struct rmid_entry *rmid_ptrs; /* * Global boolean for rdt_monitor which is true if any @@ -87,23 +36,12 @@ bool rdt_mon_capable; */ unsigned int rdt_mon_features; -/* - * This is the threshold cache occupancy in bytes at which we will consider an - * RMID available for re-allocation. - */ -unsigned int resctrl_rmid_realloc_threshold; - -/* - * This is the maximum value for the reallocation threshold, in bytes. - */ -unsigned int resctrl_rmid_realloc_limit; - #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) static int snc_nodes_per_l3_cache = 1; /* - * The correction factor table is documented in Documentation/arch/x86/resctrl.rst. + * The correction factor table is documented in Documentation/filesystems/resctrl.rst. * If rmid > rmid threshold, MBM total and local values should be multiplied * by the correction factor. * @@ -152,6 +90,7 @@ static const struct mbm_correction_factor_table { }; static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX; + static u64 mbm_cf __read_mostly; static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) @@ -164,33 +103,6 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) } /* - * x86 and arm64 differ in their handling of monitoring. - * x86's RMID are independent numbers, there is only one source of traffic - * with an RMID value of '1'. - * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of - * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID - * value is no longer unique. - * To account for this, resctrl uses an index. On x86 this is just the RMID, - * on arm64 it encodes the CLOSID and RMID. This gives a unique number. - * - * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code - * must accept an attempt to read every index. 
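A compact model of the index scheme the comment above describes: on x86 the index is just the RMID, while an MPAM-style encoding folds the CLOSID in so that (CLOSID, PMG) pairs stay unique. The arithmetic below is illustrative only; the real helpers are resctrl_arch_rmid_idx_encode()/_decode():

#include <assert.h>

#define NUM_RMID 256	/* illustrative number of RMIDs/PMGs per CLOSID */

/* x86: RMIDs form a single global namespace, the CLOSID is ignored. */
static unsigned int x86_idx_encode(unsigned int closid, unsigned int rmid)
{
	(void)closid;
	return rmid;
}

/* MPAM-style: the PMG only qualifies a PARTID, so both go into the index. */
static unsigned int mpam_idx_encode(unsigned int closid, unsigned int rmid)
{
	return closid * NUM_RMID + rmid;
}

static void mpam_idx_decode(unsigned int idx, unsigned int *closid, unsigned int *rmid)
{
	*closid = idx / NUM_RMID;
	*rmid = idx % NUM_RMID;
}

int main(void)
{
	unsigned int c, r;

	assert(x86_idx_encode(5, 7) == x86_idx_encode(9, 7));	/* same counter  */
	assert(mpam_idx_encode(5, 7) != mpam_idx_encode(9, 7));	/* distinct ones */

	mpam_idx_decode(mpam_idx_encode(5, 7), &c, &r);
	assert(c == 5 && r == 7);
	return 0;
}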
- */ -static inline struct rmid_entry *__rmid_entry(u32 idx) -{ - struct rmid_entry *entry; - u32 closid, rmid; - - entry = &rmid_ptrs[idx]; - resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); - - WARN_ON_ONCE(entry->closid != closid); - WARN_ON_ONCE(entry->rmid != rmid); - - return entry; -} - -/* * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is * needed. The physical RMID is the same as the logical RMID. @@ -261,12 +173,11 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do return &hw_dom->arch_mbm_total[rmid]; case QOS_L3_MBM_LOCAL_EVENT_ID: return &hw_dom->arch_mbm_local[rmid]; + default: + /* Never expect to get here */ + WARN_ON_ONCE(1); + return NULL; } - - /* Never expect to get here */ - WARN_ON_ONCE(1); - - return NULL; } void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, @@ -347,769 +258,6 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, return 0; } -static void limbo_release_entry(struct rmid_entry *entry) -{ - lockdep_assert_held(&rdtgroup_mutex); - - rmid_limbo_count--; - list_add_tail(&entry->list, &rmid_free_lru); - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - closid_num_dirty_rmid[entry->closid]--; -} - -/* - * Check the RMIDs that are marked as busy for this domain. If the - * reported LLC occupancy is below the threshold clear the busy bit and - * decrement the count. If the busy count gets to zero on an RMID, we - * free the RMID - */ -void __check_limbo(struct rdt_mon_domain *d, bool force_free) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - struct rmid_entry *entry; - u32 idx, cur_idx = 1; - void *arch_mon_ctx; - bool rmid_dirty; - u64 val = 0; - - arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); - if (IS_ERR(arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(arch_mon_ctx)); - return; - } - - /* - * Skip RMID 0 and start from RMID 1 and check all the RMIDs that - * are marked as busy for occupancy < threshold. If the occupancy - * is less than the threshold decrement the busy counter of the - * RMID and move it to the free list when the counter reaches 0. - */ - for (;;) { - idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); - if (idx >= idx_limit) - break; - - entry = __rmid_entry(idx); - if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, &val, - arch_mon_ctx)) { - rmid_dirty = true; - } else { - rmid_dirty = (val >= resctrl_rmid_realloc_threshold); - - /* - * x86's CLOSID and RMID are independent numbers, so the entry's - * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the - * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't - * used to select the configuration. It is thus necessary to track both - * CLOSID and RMID because there may be dependencies between them - * on some architectures. 
- */ - trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val); - } - - if (force_free || !rmid_dirty) { - clear_bit(idx, d->rmid_busy_llc); - if (!--entry->busy) - limbo_release_entry(entry); - } - cur_idx = idx + 1; - } - - resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); -} - -bool has_busy_rmid(struct rdt_mon_domain *d) -{ - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - - return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; -} - -static struct rmid_entry *resctrl_find_free_rmid(u32 closid) -{ - struct rmid_entry *itr; - u32 itr_idx, cmp_idx; - - if (list_empty(&rmid_free_lru)) - return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); - - list_for_each_entry(itr, &rmid_free_lru, list) { - /* - * Get the index of this free RMID, and the index it would need - * to be if it were used with this CLOSID. - * If the CLOSID is irrelevant on this architecture, the two - * index values are always the same on every entry and thus the - * very first entry will be returned. - */ - itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); - cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); - - if (itr_idx == cmp_idx) - return itr; - } - - return ERR_PTR(-ENOSPC); -} - -/** - * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated - * RMID are clean, or the CLOSID that has - * the most clean RMID. - * - * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID - * may not be able to allocate clean RMID. To avoid this the allocator will - * choose the CLOSID with the most clean RMID. - * - * When the CLOSID and RMID are independent numbers, the first free CLOSID will - * be returned. - */ -int resctrl_find_cleanest_closid(void) -{ - u32 cleanest_closid = ~0; - int i = 0; - - lockdep_assert_held(&rdtgroup_mutex); - - if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - return -EIO; - - for (i = 0; i < closids_supported(); i++) { - int num_dirty; - - if (closid_allocated(i)) - continue; - - num_dirty = closid_num_dirty_rmid[i]; - if (num_dirty == 0) - return i; - - if (cleanest_closid == ~0) - cleanest_closid = i; - - if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) - cleanest_closid = i; - } - - if (cleanest_closid == ~0) - return -ENOSPC; - - return cleanest_closid; -} - -/* - * For MPAM the RMID value is not unique, and has to be considered with - * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which - * allows all domains to be managed by a single free list. - * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. - */ -int alloc_rmid(u32 closid) -{ - struct rmid_entry *entry; - - lockdep_assert_held(&rdtgroup_mutex); - - entry = resctrl_find_free_rmid(closid); - if (IS_ERR(entry)) - return PTR_ERR(entry); - - list_del(&entry->list); - return entry->rmid; -} - -static void add_rmid_to_limbo(struct rmid_entry *entry) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; - u32 idx; - - lockdep_assert_held(&rdtgroup_mutex); - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); - - entry->busy = 0; - list_for_each_entry(d, &r->mon_domains, hdr.list) { - /* - * For the first limbo RMID in the domain, - * setup up the limbo worker. 
- */ - if (!has_busy_rmid(d)) - cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, - RESCTRL_PICK_ANY_CPU); - set_bit(idx, d->rmid_busy_llc); - entry->busy++; - } - - rmid_limbo_count++; - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - closid_num_dirty_rmid[entry->closid]++; -} - -void free_rmid(u32 closid, u32 rmid) -{ - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); - struct rmid_entry *entry; - - lockdep_assert_held(&rdtgroup_mutex); - - /* - * Do not allow the default rmid to be free'd. Comparing by index - * allows architectures that ignore the closid parameter to avoid an - * unnecessary check. - */ - if (!resctrl_arch_mon_capable() || - idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, - RESCTRL_RESERVED_RMID)) - return; - - entry = __rmid_entry(idx); - - if (resctrl_arch_is_llc_occupancy_enabled()) - add_rmid_to_limbo(entry); - else - list_add_tail(&entry->list, &rmid_free_lru); -} - -static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, - u32 rmid, enum resctrl_event_id evtid) -{ - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); - - switch (evtid) { - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[idx]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[idx]; - default: - return NULL; - } -} - -static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) -{ - int cpu = smp_processor_id(); - struct rdt_mon_domain *d; - struct mbm_state *m; - int err, ret; - u64 tval = 0; - - if (rr->first) { - resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); - if (m) - memset(m, 0, sizeof(struct mbm_state)); - return 0; - } - - if (rr->d) { - /* Reading a single domain, must be on a CPU in that domain. */ - if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) - return -EINVAL; - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); - if (rr->err) - return rr->err; - - rr->val += tval; - - return 0; - } - - /* Summing domains that share a cache, must be on a CPU for that cache. */ - if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) - return -EINVAL; - - /* - * Legacy files must report the sum of an event across all - * domains that share the same L3 cache instance. - * Report success if a read from any domain succeeds, -EINVAL - * (translated to "Unavailable" for user space) if reading from - * all domains fail for any reason. - */ - ret = -EINVAL; - list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { - if (d->ci->id != rr->ci->id) - continue; - err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); - if (!err) { - rr->val += tval; - ret = 0; - } - } - - if (ret) - rr->err = ret; - - return ret; -} - -/* - * mbm_bw_count() - Update bw count from values previously read by - * __mon_event_count(). - * @closid: The closid used to identify the cached mbm_state. - * @rmid: The rmid used to identify the cached mbm_state. - * @rr: The struct rmid_read populated by __mon_event_count(). - * - * Supporting function to calculate the memory bandwidth - * and delta bandwidth in MBps. The chunks value previously read by - * __mon_event_count() is compared with the chunks value from the previous - * invocation. This must be called once per second to maintain values in MBps. 
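The helper documented above turns two byte counts sampled one second apart into a bandwidth figure in MBps. A stripped-down version of that arithmetic (SZ_1M is 1 MiB):

#include <stdint.h>
#include <stdio.h>

#define SZ_1M (1024 * 1024)

struct mbm_state {
	uint64_t prev_bw_bytes;	/* byte count from the previous 1s sample */
	uint32_t prev_bw;	/* most recent bandwidth in MBps          */
};

/* Called once per second with the current cumulative byte count. */
static void mbm_bw_update(struct mbm_state *m, uint64_t cur_bytes)
{
	uint64_t delta = cur_bytes - m->prev_bw_bytes;

	m->prev_bw_bytes = cur_bytes;
	m->prev_bw = (uint32_t)(delta / SZ_1M);
}

int main(void)
{
	struct mbm_state m = { 0, 0 };

	mbm_bw_update(&m, 0);				/* first sample, no history yet */
	mbm_bw_update(&m, 3ULL * 1024 * 1024 * 1024);	/* 3 GiB moved in one second    */
	printf("%u MBps\n", m.prev_bw);			/* 3072 */
	return 0;
}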
- */ -static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) -{ - u64 cur_bw, bytes, cur_bytes; - struct mbm_state *m; - - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); - if (WARN_ON_ONCE(!m)) - return; - - cur_bytes = rr->val; - bytes = cur_bytes - m->prev_bw_bytes; - m->prev_bw_bytes = cur_bytes; - - cur_bw = bytes / SZ_1M; - - m->prev_bw = cur_bw; -} - -/* - * This is scheduled by mon_event_read() to read the CQM/MBM counters - * on a domain. - */ -void mon_event_count(void *info) -{ - struct rdtgroup *rdtgrp, *entry; - struct rmid_read *rr = info; - struct list_head *head; - int ret; - - rdtgrp = rr->rgrp; - - ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); - - /* - * For Ctrl groups read data from child monitor groups and - * add them together. Count events which are read successfully. - * Discard the rmid_read's reporting errors. - */ - head = &rdtgrp->mon.crdtgrp_list; - - if (rdtgrp->type == RDTCTRL_GROUP) { - list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->closid, entry->mon.rmid, - rr) == 0) - ret = 0; - } - } - - /* - * __mon_event_count() calls for newly created monitor groups may - * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. - * Discard error if any of the monitor event reads succeeded. - */ - if (ret == 0) - rr->err = 0; -} - -static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, - struct rdt_resource *r) -{ - struct rdt_ctrl_domain *d; - - lockdep_assert_cpus_held(); - - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) - return d; - } - - return NULL; -} - -/* - * Feedback loop for MBA software controller (mba_sc) - * - * mba_sc is a feedback loop where we periodically read MBM counters and - * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so - * that: - * - * current bandwidth(cur_bw) < user specified bandwidth(user_bw) - * - * This uses the MBM counters to measure the bandwidth and MBA throttle - * MSRs to control the bandwidth for a particular rdtgrp. It builds on the - * fact that resctrl rdtgroups have both monitoring and control. - * - * The frequency of the checks is 1s and we just tag along the MBM overflow - * timer. Having 1s interval makes the calculation of bandwidth simpler. - * - * Although MBA's goal is to restrict the bandwidth to a maximum, there may - * be a need to increase the bandwidth to avoid unnecessarily restricting - * the L2 <-> L3 traffic. - * - * Since MBA controls the L2 external bandwidth where as MBM measures the - * L3 external bandwidth the following sequence could lead to such a - * situation. - * - * Consider an rdtgroup which had high L3 <-> memory traffic in initial - * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but - * after some time rdtgroup has mostly L2 <-> L3 traffic. - * - * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its - * throttle MSRs already have low percentage values. To avoid - * unnecessarily restricting such rdtgroups, we also increase the bandwidth. 
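The comment above describes the mba_sc feedback loop; its per-second decision step, implemented by update_mba_bw() below, can be modelled in a few lines: throttle harder whenever measured bandwidth exceeds the user's target, and only relax when the projected bandwidth would still stay under it. The constants here are placeholders, not hardware values:

#include <stdio.h>

#define MIN_BW   10u	/* illustrative smallest throttle value   */
#define MAX_BW  100u	/* unthrottled                            */
#define BW_GRAN  10u	/* illustrative hardware granularity step */

/* One 1-second iteration of the software controller for a single group. */
static unsigned int mba_sc_step(unsigned int cur_msr_val,
				unsigned int cur_bw, unsigned int user_bw)
{
	if (cur_msr_val > MIN_BW && user_bw < cur_bw)
		return cur_msr_val - BW_GRAN;		/* throttle harder */

	/* Relax only if the projected bandwidth still stays under the target. */
	if (cur_msr_val < MAX_BW &&
	    user_bw > (cur_bw * (cur_msr_val + MIN_BW) / cur_msr_val))
		return cur_msr_val + BW_GRAN;

	return cur_msr_val;				/* leave it alone  */
}

int main(void)
{
	/* Throttled to 30%, measuring 600 MBps against a 2000 MBps target. */
	printf("new throttle: %u%%\n", mba_sc_step(30, 600, 2000));	/* 40 */
	return 0;
}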
- */ -static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) -{ - u32 closid, rmid, cur_msr_val, new_msr_val; - struct mbm_state *pmbm_data, *cmbm_data; - struct rdt_ctrl_domain *dom_mba; - enum resctrl_event_id evt_id; - struct rdt_resource *r_mba; - struct list_head *head; - struct rdtgroup *entry; - u32 cur_bw, user_bw; - - r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - evt_id = rgrp->mba_mbps_event; - - closid = rgrp->closid; - rmid = rgrp->mon.rmid; - pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id); - if (WARN_ON_ONCE(!pmbm_data)) - return; - - dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba); - if (!dom_mba) { - pr_warn_once("Failure to get domain for MBA update\n"); - return; - } - - cur_bw = pmbm_data->prev_bw; - user_bw = dom_mba->mbps_val[closid]; - - /* MBA resource doesn't support CDP */ - cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); - - /* - * For Ctrl groups read data from child monitor groups. - */ - head = &rgrp->mon.crdtgrp_list; - list_for_each_entry(entry, head, mon.crdtgrp_list) { - cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id); - if (WARN_ON_ONCE(!cmbm_data)) - return; - cur_bw += cmbm_data->prev_bw; - } - - /* - * Scale up/down the bandwidth linearly for the ctrl group. The - * bandwidth step is the bandwidth granularity specified by the - * hardware. - * Always increase throttling if current bandwidth is above the - * target set by user. - * But avoid thrashing up and down on every poll by checking - * whether a decrease in throttling is likely to push the group - * back over target. E.g. if currently throttling to 30% of bandwidth - * on a system with 10% granularity steps, check whether moving to - * 40% would go past the limit by multiplying current bandwidth by - * "(30 + 10) / 30". - */ - if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { - new_msr_val = cur_msr_val - r_mba->membw.bw_gran; - } else if (cur_msr_val < MAX_MBA_BW && - (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { - new_msr_val = cur_msr_val + r_mba->membw.bw_gran; - } else { - return; - } - - resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); -} - -static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid, enum resctrl_event_id evtid) -{ - struct rmid_read rr = {0}; - - rr.r = r; - rr.d = d; - rr.evtid = evtid; - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); - if (IS_ERR(rr.arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(rr.arch_mon_ctx)); - return; - } - - __mon_event_count(closid, rmid, &rr); - - /* - * If the software controller is enabled, compute the - * bandwidth for this event id. - */ - if (is_mba_sc(NULL)) - mbm_bw_count(closid, rmid, &rr); - - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); -} - -static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid) -{ - /* - * This is protected from concurrent reads from user as both - * the user and overflow handler hold the global mutex. - */ - if (resctrl_arch_is_mbm_total_enabled()) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); - - if (resctrl_arch_is_mbm_local_enabled()) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); -} - -/* - * Handler to scan the limbo list and move the RMIDs - * to free list whose occupancy < threshold_occupancy. 
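A userspace-flavoured sketch of the limbo scan this comment refers to (__check_limbo(), removed earlier in the hunk): each busy RMID's LLC occupancy is re-read and, once it drops below the reallocation threshold, the RMID returns to the free pool. The per-domain busy bookkeeping is simplified away here:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_RMID 8
#define REALLOC_THRESHOLD 4096	/* illustrative threshold in bytes */

static bool busy[NUM_RMID];
static uint64_t llc_occupancy[NUM_RMID];	/* stand-in for resctrl_arch_rmid_read() */

static void check_limbo(bool force_free)
{
	for (unsigned int rmid = 1; rmid < NUM_RMID; rmid++) {	/* RMID 0 is reserved */
		if (!busy[rmid])
			continue;
		if (force_free || llc_occupancy[rmid] < REALLOC_THRESHOLD) {
			busy[rmid] = false;
			printf("RMID %u is free again\n", rmid);
		}
	}
}

int main(void)
{
	busy[1] = true; llc_occupancy[1] = 128;		/* nearly evicted          */
	busy[2] = true; llc_occupancy[2] = 1 << 20;	/* still holds cache lines */

	check_limbo(false);	/* frees RMID 1 only */
	return 0;
}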
- */ -void cqm_handle_limbo(struct work_struct *work) -{ - unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); - struct rdt_mon_domain *d; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); - - __check_limbo(d, false); - - if (has_busy_rmid(d)) { - d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, - RESCTRL_PICK_ANY_CPU); - schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, - delay); - } - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); -} - -/** - * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this - * domain. - * @dom: The domain the limbo handler should run for. - * @delay_ms: How far in the future the handler should run. - * @exclude_cpu: Which CPU the handler should not run on, - * RESCTRL_PICK_ANY_CPU to pick any CPU. - */ -void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, - int exclude_cpu) -{ - unsigned long delay = msecs_to_jiffies(delay_ms); - int cpu; - - cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); - dom->cqm_work_cpu = cpu; - - if (cpu < nr_cpu_ids) - schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); -} - -void mbm_handle_overflow(struct work_struct *work) -{ - unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); - struct rdtgroup *prgrp, *crgrp; - struct rdt_mon_domain *d; - struct list_head *head; - struct rdt_resource *r; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - /* - * If the filesystem has been unmounted this work no longer needs to - * run. - */ - if (!resctrl_mounted || !resctrl_arch_mon_capable()) - goto out_unlock; - - r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - d = container_of(work, struct rdt_mon_domain, mbm_over.work); - - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); - - head = &prgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); - - if (is_mba_sc(NULL)) - update_mba_bw(prgrp, d); - } - - /* - * Re-check for housekeeping CPUs. This allows the overflow handler to - * move off a nohz_full CPU quickly. - */ - d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, - RESCTRL_PICK_ANY_CPU); - schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); -} - -/** - * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this - * domain. - * @dom: The domain the overflow handler should run for. - * @delay_ms: How far in the future the handler should run. - * @exclude_cpu: Which CPU the handler should not run on, - * RESCTRL_PICK_ANY_CPU to pick any CPU. - */ -void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, - int exclude_cpu) -{ - unsigned long delay = msecs_to_jiffies(delay_ms); - int cpu; - - /* - * When a domain comes online there is no guarantee the filesystem is - * mounted. If not, there is no need to catch counter overflow. 
- */ - if (!resctrl_mounted || !resctrl_arch_mon_capable()) - return; - cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); - dom->mbm_work_cpu = cpu; - - if (cpu < nr_cpu_ids) - schedule_delayed_work_on(cpu, &dom->mbm_over, delay); -} - -static int dom_data_init(struct rdt_resource *r) -{ - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - u32 num_closid = resctrl_arch_get_num_closid(r); - struct rmid_entry *entry = NULL; - int err = 0, i; - u32 idx; - - mutex_lock(&rdtgroup_mutex); - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - u32 *tmp; - - /* - * If the architecture hasn't provided a sanitised value here, - * this may result in larger arrays than necessary. Resctrl will - * use a smaller system wide value based on the resources in - * use. - */ - tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); - if (!tmp) { - err = -ENOMEM; - goto out_unlock; - } - - closid_num_dirty_rmid = tmp; - } - - rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); - if (!rmid_ptrs) { - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } - err = -ENOMEM; - goto out_unlock; - } - - for (i = 0; i < idx_limit; i++) { - entry = &rmid_ptrs[i]; - INIT_LIST_HEAD(&entry->list); - - resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); - list_add_tail(&entry->list, &rmid_free_lru); - } - - /* - * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and - * are always allocated. These are used for the rdtgroup_default - * control group, which will be setup later in resctrl_init(). - */ - idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, - RESCTRL_RESERVED_RMID); - entry = __rmid_entry(idx); - list_del(&entry->list); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); - - return err; -} - -static void dom_data_exit(struct rdt_resource *r) -{ - mutex_lock(&rdtgroup_mutex); - - if (!r->mon_capable) - goto out_unlock; - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } - - kfree(rmid_ptrs); - rmid_ptrs = NULL; - -out_unlock: - mutex_unlock(&rdtgroup_mutex); -} - -static struct mon_evt llc_occupancy_event = { - .name = "llc_occupancy", - .evtid = QOS_L3_OCCUP_EVENT_ID, -}; - -static struct mon_evt mbm_total_event = { - .name = "mbm_total_bytes", - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, -}; - -static struct mon_evt mbm_local_event = { - .name = "mbm_local_bytes", - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, -}; - -/* - * Initialize the event list for the resource. - * - * Note that MBM events are also part of RDT_RESOURCE_L3 resource - * because as per the SDM the total and local memory bandwidth - * are enumerated as part of L3 monitoring. - */ -static void l3_mon_evt_init(struct rdt_resource *r) -{ - INIT_LIST_HEAD(&r->evt_list); - - if (resctrl_arch_is_llc_occupancy_enabled()) - list_add_tail(&llc_occupancy_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_total_enabled()) - list_add_tail(&mbm_total_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_local_enabled()) - list_add_tail(&mbm_local_event.list, &r->evt_list); -} - /* * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1 * which indicates that RMIDs are configured in legacy mode. @@ -1193,51 +341,6 @@ static __init int snc_get_config(void) return ret; } -/** - * resctrl_mon_resource_init() - Initialise global monitoring structures. - * - * Allocate and initialise global monitor resources that do not belong to a - * specific domain. i.e. 
the rmid_ptrs[] used for the limbo and free lists. - * Called once during boot after the struct rdt_resource's have been configured - * but before the filesystem is mounted. - * Resctrl's cpuhp callbacks may be called before this point to bring a domain - * online. - * - * Returns 0 for success, or -ENOMEM. - */ -int __init resctrl_mon_resource_init(void) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - int ret; - - if (!r->mon_capable) - return 0; - - ret = dom_data_init(r); - if (ret) - return ret; - - l3_mon_evt_init(r); - - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { - mbm_total_event.configurable = true; - resctrl_file_fflags_init("mbm_total_bytes_config", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); - } - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { - mbm_local_event.configurable = true; - resctrl_file_fflags_init("mbm_local_bytes_config", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); - } - - if (resctrl_arch_is_mbm_local_enabled()) - mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; - else if (resctrl_arch_is_mbm_total_enabled()) - mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; - - return 0; -} - int __init rdt_get_mon_l3_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; @@ -1285,13 +388,6 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) return 0; } -void resctrl_mon_resource_exit(void) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - - dom_data_exit(r); -} - void __init intel_rdt_mbm_apply_quirk(void) { int cf_index; diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 1190c48a16b2..de580eca3363 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -11,19 +11,13 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/cacheflush.h> #include <linux/cpu.h> -#include <linux/cpumask.h> -#include <linux/debugfs.h> -#include <linux/kthread.h> -#include <linux/mman.h> #include <linux/perf_event.h> #include <linux/pm_qos.h> -#include <linux/slab.h> -#include <linux/uaccess.h> +#include <linux/resctrl.h> -#include <asm/cacheflush.h> #include <asm/cpu_device_id.h> -#include <asm/resctrl.h> #include <asm/perf_event.h> #include <asm/msr.h> @@ -31,7 +25,8 @@ #include "internal.h" #define CREATE_TRACE_POINTS -#include "trace.h" + +#include "pseudo_lock_trace.h" /* * The bits needed to disable hardware prefetching varies based on the @@ -39,29 +34,6 @@ */ static u64 prefetch_disable_bits; -/* - * Major number assigned to and shared by all devices exposing - * pseudo-locked regions. - */ -static unsigned int pseudo_lock_major; -static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); - -static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) -{ - const struct rdtgroup *rdtgrp; - - rdtgrp = dev_get_drvdata(dev); - if (mode) - *mode = 0600; - guard(mutex)(&rdtgroup_mutex); - return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn)); -} - -static const struct class pseudo_lock_class = { - .name = "pseudo_lock", - .devnode = pseudo_lock_devnode, -}; - /** * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported * platforms @@ -123,298 +95,6 @@ u64 resctrl_arch_get_prefetch_disable_bits(void) } /** - * pseudo_lock_minor_get - Obtain available minor number - * @minor: Pointer to where new minor number will be stored - * - * A bitmask is used to track available minor numbers. 
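A standalone model of that bitmask allocator, using plain C in place of find_first_bit()/__clear_bit(); MINORBITS is shrunk for the example:

#include <stdio.h>

#define MINORBITS 8	/* the real value is 20; shrunk for the example */

static unsigned long minor_avail = (1UL << MINORBITS) - 1;	/* all minors free */

static int minor_get(unsigned int *minor)
{
	for (unsigned int bit = 0; bit < MINORBITS; bit++) {
		if (minor_avail & (1UL << bit)) {
			minor_avail &= ~(1UL << bit);	/* mark as in use */
			*minor = bit;
			return 0;
		}
	}
	return -1;	/* -ENOSPC in the kernel */
}

static void minor_release(unsigned int minor)
{
	minor_avail |= 1UL << minor;
}

int main(void)
{
	unsigned int m;

	if (!minor_get(&m))
		printf("allocated minor %u\n", m);	/* 0 */
	minor_release(m);
	return 0;
}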
Here the next free - * minor number is marked as unavailable and returned. - * - * Return: 0 on success, <0 on failure. - */ -static int pseudo_lock_minor_get(unsigned int *minor) -{ - unsigned long first_bit; - - first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS); - - if (first_bit == MINORBITS) - return -ENOSPC; - - __clear_bit(first_bit, &pseudo_lock_minor_avail); - *minor = first_bit; - - return 0; -} - -/** - * pseudo_lock_minor_release - Return minor number to available - * @minor: The minor number made available - */ -static void pseudo_lock_minor_release(unsigned int minor) -{ - __set_bit(minor, &pseudo_lock_minor_avail); -} - -/** - * region_find_by_minor - Locate a pseudo-lock region by inode minor number - * @minor: The minor number of the device representing pseudo-locked region - * - * When the character device is accessed we need to determine which - * pseudo-locked region it belongs to. This is done by matching the minor - * number of the device to the pseudo-locked region it belongs. - * - * Minor numbers are assigned at the time a pseudo-locked region is associated - * with a cache instance. - * - * Return: On success return pointer to resource group owning the pseudo-locked - * region, NULL on failure. - */ -static struct rdtgroup *region_find_by_minor(unsigned int minor) -{ - struct rdtgroup *rdtgrp, *rdtgrp_match = NULL; - - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (rdtgrp->plr && rdtgrp->plr->minor == minor) { - rdtgrp_match = rdtgrp; - break; - } - } - return rdtgrp_match; -} - -/** - * struct pseudo_lock_pm_req - A power management QoS request list entry - * @list: Entry within the @pm_reqs list for a pseudo-locked region - * @req: PM QoS request - */ -struct pseudo_lock_pm_req { - struct list_head list; - struct dev_pm_qos_request req; -}; - -static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) -{ - struct pseudo_lock_pm_req *pm_req, *next; - - list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) { - dev_pm_qos_remove_request(&pm_req->req); - list_del(&pm_req->list); - kfree(pm_req); - } -} - -/** - * pseudo_lock_cstates_constrain - Restrict cores from entering C6 - * @plr: Pseudo-locked region - * - * To prevent the cache from being affected by power management entering - * C6 has to be avoided. This is accomplished by requesting a latency - * requirement lower than lowest C6 exit latency of all supported - * platforms as found in the cpuidle state tables in the intel_idle driver. - * At this time it is possible to do so with a single latency requirement - * for all supported platforms. - * - * Since Goldmont is supported, which is affected by X86_BUG_MONITOR, - * the ACPI latencies need to be considered while keeping in mind that C2 - * may be set to map to deeper sleep states. In this case the latency - * requirement needs to prevent entering C2 also. 
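/*
 * Illustrative sketch, not from this patch: the dev_pm_qos pattern that
 * pseudo_lock_cstates_constrain() below applies to every CPU of the
 * pseudo-locked domain, shown in isolation for a single CPU. The helper
 * names are made up for the example; the 30us resume-latency cap mirrors
 * the request added below. Needs <linux/cpu.h> and <linux/pm_qos.h>.
 */
static int example_constrain_cpu(unsigned int cpu, struct dev_pm_qos_request *req)
{
	/*
	 * Cap the CPU's resume latency so cpuidle cannot select C6 (or C2,
	 * where C2 maps to a deeper sleep state).
	 */
	return dev_pm_qos_add_request(get_cpu_device(cpu), req,
				      DEV_PM_QOS_RESUME_LATENCY, 30);
}

static void example_relax_cpu(struct dev_pm_qos_request *req)
{
	/* Drop the constraint once the pseudo-locked region is torn down. */
	dev_pm_qos_remove_request(req);
}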
- * - * Return: 0 on success, <0 on failure - */ -static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) -{ - struct pseudo_lock_pm_req *pm_req; - int cpu; - int ret; - - for_each_cpu(cpu, &plr->d->hdr.cpu_mask) { - pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); - if (!pm_req) { - rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n"); - ret = -ENOMEM; - goto out_err; - } - ret = dev_pm_qos_add_request(get_cpu_device(cpu), - &pm_req->req, - DEV_PM_QOS_RESUME_LATENCY, - 30); - if (ret < 0) { - rdt_last_cmd_printf("Failed to add latency req CPU%d\n", - cpu); - kfree(pm_req); - ret = -1; - goto out_err; - } - list_add(&pm_req->list, &plr->pm_reqs); - } - - return 0; - -out_err: - pseudo_lock_cstates_relax(plr); - return ret; -} - -/** - * pseudo_lock_region_clear - Reset pseudo-lock region data - * @plr: pseudo-lock region - * - * All content of the pseudo-locked region is reset - any memory allocated - * freed. - * - * Return: void - */ -static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) -{ - plr->size = 0; - plr->line_size = 0; - kfree(plr->kmem); - plr->kmem = NULL; - plr->s = NULL; - if (plr->d) - plr->d->plr = NULL; - plr->d = NULL; - plr->cbm = 0; - plr->debugfs_dir = NULL; -} - -/** - * pseudo_lock_region_init - Initialize pseudo-lock region information - * @plr: pseudo-lock region - * - * Called after user provided a schemata to be pseudo-locked. From the - * schemata the &struct pseudo_lock_region is on entry already initialized - * with the resource, domain, and capacity bitmask. Here the information - * required for pseudo-locking is deduced from this data and &struct - * pseudo_lock_region initialized further. This information includes: - * - size in bytes of the region to be pseudo-locked - * - cache line size to know the stride with which data needs to be accessed - * to be pseudo-locked - * - a cpu associated with the cache instance on which the pseudo-locking - * flow can be executed - * - * Return: 0 on success, <0 on failure. Descriptive error will be written - * to last_cmd_status buffer. - */ -static int pseudo_lock_region_init(struct pseudo_lock_region *plr) -{ - enum resctrl_scope scope = plr->s->res->ctrl_scope; - struct cacheinfo *ci; - int ret; - - if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE)) - return -ENODEV; - - /* Pick the first cpu we find that is associated with the cache. */ - plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask); - - if (!cpu_online(plr->cpu)) { - rdt_last_cmd_printf("CPU %u associated with cache not online\n", - plr->cpu); - ret = -ENODEV; - goto out_region; - } - - ci = get_cpu_cacheinfo_level(plr->cpu, scope); - if (ci) { - plr->line_size = ci->coherency_line_size; - plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); - return 0; - } - - ret = -1; - rdt_last_cmd_puts("Unable to determine cache line size\n"); -out_region: - pseudo_lock_region_clear(plr); - return ret; -} - -/** - * pseudo_lock_init - Initialize a pseudo-lock region - * @rdtgrp: resource group to which new pseudo-locked region will belong - * - * A pseudo-locked region is associated with a resource group. When this - * association is created the pseudo-locked region is initialized. The - * details of the pseudo-locked region are not known at this time so only - * allocation is done and association established. 
- * - * Return: 0 on success, <0 on failure - */ -static int pseudo_lock_init(struct rdtgroup *rdtgrp) -{ - struct pseudo_lock_region *plr; - - plr = kzalloc(sizeof(*plr), GFP_KERNEL); - if (!plr) - return -ENOMEM; - - init_waitqueue_head(&plr->lock_thread_wq); - INIT_LIST_HEAD(&plr->pm_reqs); - rdtgrp->plr = plr; - return 0; -} - -/** - * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked - * @plr: pseudo-lock region - * - * Initialize the details required to set up the pseudo-locked region and - * allocate the contiguous memory that will be pseudo-locked to the cache. - * - * Return: 0 on success, <0 on failure. Descriptive error will be written - * to last_cmd_status buffer. - */ -static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) -{ - int ret; - - ret = pseudo_lock_region_init(plr); - if (ret < 0) - return ret; - - /* - * We do not yet support contiguous regions larger than - * KMALLOC_MAX_SIZE. - */ - if (plr->size > KMALLOC_MAX_SIZE) { - rdt_last_cmd_puts("Requested region exceeds maximum size\n"); - ret = -E2BIG; - goto out_region; - } - - plr->kmem = kzalloc(plr->size, GFP_KERNEL); - if (!plr->kmem) { - rdt_last_cmd_puts("Unable to allocate memory\n"); - ret = -ENOMEM; - goto out_region; - } - - ret = 0; - goto out; -out_region: - pseudo_lock_region_clear(plr); -out: - return ret; -} - -/** - * pseudo_lock_free - Free a pseudo-locked region - * @rdtgrp: resource group to which pseudo-locked region belonged - * - * The pseudo-locked region's resources have already been released, or not - * yet created at this point. Now it can be freed and disassociated from the - * resource group. - * - * Return: void - */ -static void pseudo_lock_free(struct rdtgroup *rdtgrp) -{ - pseudo_lock_region_clear(rdtgrp->plr); - kfree(rdtgrp->plr); - rdtgrp->plr = NULL; -} - -/** * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache * @_plr: the pseudo-lock region descriptor * @@ -544,340 +224,6 @@ int resctrl_arch_pseudo_lock_fn(void *_plr) } /** - * rdtgroup_monitor_in_progress - Test if monitoring in progress - * @rdtgrp: resource group being queried - * - * Return: 1 if monitor groups have been created for this resource - * group, 0 otherwise. - */ -static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) -{ - return !list_empty(&rdtgrp->mon.crdtgrp_list); -} - -/** - * rdtgroup_locksetup_user_restrict - Restrict user access to group - * @rdtgrp: resource group needing access restricted - * - * A resource group used for cache pseudo-locking cannot have cpus or tasks - * assigned to it. This is communicated to the user by restricting access - * to all the files that can be used to make such changes. - * - * Permissions restored with rdtgroup_locksetup_user_restore() - * - * Return: 0 on success, <0 on failure. If a failure occurs during the - * restriction of access an attempt will be made to restore permissions but - * the state of the mode of these files will be uncertain when a failure - * occurs. 
- */ -static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) -{ - int ret; - - ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); - if (ret) - return ret; - - ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); - if (ret) - goto err_tasks; - - ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); - if (ret) - goto err_cpus; - - if (resctrl_arch_mon_capable()) { - ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); - if (ret) - goto err_cpus_list; - } - - ret = 0; - goto out; - -err_cpus_list: - rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); -err_cpus: - rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); -err_tasks: - rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); -out: - return ret; -} - -/** - * rdtgroup_locksetup_user_restore - Restore user access to group - * @rdtgrp: resource group needing access restored - * - * Restore all file access previously removed using - * rdtgroup_locksetup_user_restrict() - * - * Return: 0 on success, <0 on failure. If a failure occurs during the - * restoration of access an attempt will be made to restrict permissions - * again but the state of the mode of these files will be uncertain when - * a failure occurs. - */ -static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) -{ - int ret; - - ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); - if (ret) - return ret; - - ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); - if (ret) - goto err_tasks; - - ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); - if (ret) - goto err_cpus; - - if (resctrl_arch_mon_capable()) { - ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); - if (ret) - goto err_cpus_list; - } - - ret = 0; - goto out; - -err_cpus_list: - rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); -err_cpus: - rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); -err_tasks: - rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); -out: - return ret; -} - -/** - * rdtgroup_locksetup_enter - Resource group enters locksetup mode - * @rdtgrp: resource group requested to enter locksetup mode - * - * A resource group enters locksetup mode to reflect that it would be used - * to represent a pseudo-locked region and is in the process of being set - * up to do so. A resource group used for a pseudo-locked region would - * lose the closid associated with it so we cannot allow it to have any - * tasks or cpus assigned nor permit tasks or cpus to be assigned in the - * future. Monitoring of a pseudo-locked region is not allowed either. - * - * The above and more restrictions on a pseudo-locked region are checked - * for and enforced before the resource group enters the locksetup mode. - * - * Returns: 0 if the resource group successfully entered locksetup mode, <0 - * on failure. On failure the last_cmd_status buffer is updated with text to - * communicate details of failure to the user. - */ -int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) -{ - int ret; - - /* - * The default resource group can neither be removed nor lose the - * default closid associated with it. - */ - if (rdtgrp == &rdtgroup_default) { - rdt_last_cmd_puts("Cannot pseudo-lock default group\n"); - return -EINVAL; - } - - /* - * Cache Pseudo-locking not supported when CDP is enabled. - * - * Some things to consider if you would like to enable this - * support (using L3 CDP as example): - * - When CDP is enabled two separate resources are exposed, - * L3DATA and L3CODE, but they are actually on the same cache. 
- * The implication for pseudo-locking is that if a - * pseudo-locked region is created on a domain of one - * resource (eg. L3CODE), then a pseudo-locked region cannot - * be created on that same domain of the other resource - * (eg. L3DATA). This is because the creation of a - * pseudo-locked region involves a call to wbinvd that will - * affect all cache allocations on particular domain. - * - Considering the previous, it may be possible to only - * expose one of the CDP resources to pseudo-locking and - * hide the other. For example, we could consider to only - * expose L3DATA and since the L3 cache is unified it is - * still possible to place instructions there are execute it. - * - If only one region is exposed to pseudo-locking we should - * still keep in mind that availability of a portion of cache - * for pseudo-locking should take into account both resources. - * Similarly, if a pseudo-locked region is created in one - * resource, the portion of cache used by it should be made - * unavailable to all future allocations from both resources. - */ - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) || - resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) { - rdt_last_cmd_puts("CDP enabled\n"); - return -EINVAL; - } - - /* - * Not knowing the bits to disable prefetching implies that this - * platform does not support Cache Pseudo-Locking. - */ - if (resctrl_arch_get_prefetch_disable_bits() == 0) { - rdt_last_cmd_puts("Pseudo-locking not supported\n"); - return -EINVAL; - } - - if (rdtgroup_monitor_in_progress(rdtgrp)) { - rdt_last_cmd_puts("Monitoring in progress\n"); - return -EINVAL; - } - - if (rdtgroup_tasks_assigned(rdtgrp)) { - rdt_last_cmd_puts("Tasks assigned to resource group\n"); - return -EINVAL; - } - - if (!cpumask_empty(&rdtgrp->cpu_mask)) { - rdt_last_cmd_puts("CPUs assigned to resource group\n"); - return -EINVAL; - } - - if (rdtgroup_locksetup_user_restrict(rdtgrp)) { - rdt_last_cmd_puts("Unable to modify resctrl permissions\n"); - return -EIO; - } - - ret = pseudo_lock_init(rdtgrp); - if (ret) { - rdt_last_cmd_puts("Unable to init pseudo-lock region\n"); - goto out_release; - } - - /* - * If this system is capable of monitoring a rmid would have been - * allocated when the control group was created. This is not needed - * anymore when this group would be used for pseudo-locking. This - * is safe to call on platforms not capable of monitoring. - */ - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - - ret = 0; - goto out; - -out_release: - rdtgroup_locksetup_user_restore(rdtgrp); -out: - return ret; -} - -/** - * rdtgroup_locksetup_exit - resource group exist locksetup mode - * @rdtgrp: resource group - * - * When a resource group exits locksetup mode the earlier restrictions are - * lifted. - * - * Return: 0 on success, <0 on failure - */ -int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) -{ - int ret; - - if (resctrl_arch_mon_capable()) { - ret = alloc_rmid(rdtgrp->closid); - if (ret < 0) { - rdt_last_cmd_puts("Out of RMIDs\n"); - return ret; - } - rdtgrp->mon.rmid = ret; - } - - ret = rdtgroup_locksetup_user_restore(rdtgrp); - if (ret) { - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - return ret; - } - - pseudo_lock_free(rdtgrp); - return 0; -} - -/** - * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked - * @d: RDT domain - * @cbm: CBM to test - * - * @d represents a cache instance and @cbm a capacity bitmask that is - * considered for it. Determine if @cbm overlaps with any existing - * pseudo-locked region on @d. 
- * - * @cbm is unsigned long, even if only 32 bits are used, to make the - * bitmap functions work correctly. - * - * Return: true if @cbm overlaps with pseudo-locked region on @d, false - * otherwise. - */ -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) -{ - unsigned int cbm_len; - unsigned long cbm_b; - - if (d->plr) { - cbm_len = d->plr->s->res->cache.cbm_len; - cbm_b = d->plr->cbm; - if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) - return true; - } - return false; -} - -/** - * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy - * @d: RDT domain under test - * - * The setup of a pseudo-locked region affects all cache instances within - * the hierarchy of the region. It is thus essential to know if any - * pseudo-locked regions exist within a cache hierarchy to prevent any - * attempts to create new pseudo-locked regions in the same hierarchy. - * - * Return: true if a pseudo-locked region exists in the hierarchy of @d or - * if it is not possible to test due to memory allocation issue, - * false otherwise. - */ -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) -{ - struct rdt_ctrl_domain *d_i; - cpumask_var_t cpu_with_psl; - struct rdt_resource *r; - bool ret = false; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) - return true; - - /* - * First determine which cpus have pseudo-locked regions - * associated with them. - */ - for_each_alloc_capable_rdt_resource(r) { - list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) { - if (d_i->plr) - cpumask_or(cpu_with_psl, cpu_with_psl, - &d_i->hdr.cpu_mask); - } - } - - /* - * Next test if new pseudo-locked region would intersect with - * existing region. - */ - if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl)) - ret = true; - - free_cpumask_var(cpu_with_psl); - return ret; -} - -/** * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read * pseudo-locked memory * @_plr: pseudo-lock region to measure @@ -1169,433 +515,3 @@ out: wake_up_interruptible(&plr->lock_thread_wq); return 0; } - -/** - * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region - * @rdtgrp: Resource group to which the pseudo-locked region belongs. - * @sel: Selector of which measurement to perform on a pseudo-locked region. - * - * The measurement of latency to access a pseudo-locked region should be - * done from a cpu that is associated with that pseudo-locked region. - * Determine which cpu is associated with this region and start a thread on - * that cpu to perform the measurement, wait for that thread to complete. 
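/*
 * Illustrative sketch, not from this patch: how userspace drives this
 * measurement through the "pseudo_lock_measure" debugfs file handled
 * further below (writing 1 selects the latency test, 2 the L2 residency
 * test, 3 the L3 residency test). The debugfs path and the group name
 * "p0" are assumptions for the example; results are reported via the
 * resctrl trace events.
 */
#include <fcntl.h>
#include <unistd.h>

int trigger_latency_measurement(void)
{
	int fd = open("/sys/kernel/debug/resctrl/p0/pseudo_lock_measure",
		      O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, "1", 1) != 1) {	/* sel == 1: cycle latency test */
		close(fd);
		return -1;
	}
	return close(fd);
}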
- * - * Return: 0 on success, <0 on failure - */ -static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) -{ - struct pseudo_lock_region *plr = rdtgrp->plr; - struct task_struct *thread; - unsigned int cpu; - int ret = -1; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - if (rdtgrp->flags & RDT_DELETED) { - ret = -ENODEV; - goto out; - } - - if (!plr->d) { - ret = -ENODEV; - goto out; - } - - plr->thread_done = 0; - cpu = cpumask_first(&plr->d->hdr.cpu_mask); - if (!cpu_online(cpu)) { - ret = -ENODEV; - goto out; - } - - plr->cpu = cpu; - - if (sel == 1) - thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn, - plr, cpu, "pseudo_lock_measure/%u"); - else if (sel == 2) - thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency, - plr, cpu, "pseudo_lock_measure/%u"); - else if (sel == 3) - thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency, - plr, cpu, "pseudo_lock_measure/%u"); - else - goto out; - - if (IS_ERR(thread)) { - ret = PTR_ERR(thread); - goto out; - } - - ret = wait_event_interruptible(plr->lock_thread_wq, - plr->thread_done == 1); - if (ret < 0) - goto out; - - ret = 0; - -out: - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - return ret; -} - -static ssize_t pseudo_lock_measure_trigger(struct file *file, - const char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct rdtgroup *rdtgrp = file->private_data; - size_t buf_size; - char buf[32]; - int ret; - int sel; - - buf_size = min(count, (sizeof(buf) - 1)); - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - buf[buf_size] = '\0'; - ret = kstrtoint(buf, 10, &sel); - if (ret == 0) { - if (sel != 1 && sel != 2 && sel != 3) - return -EINVAL; - ret = debugfs_file_get(file->f_path.dentry); - if (ret) - return ret; - ret = pseudo_lock_measure_cycles(rdtgrp, sel); - if (ret == 0) - ret = count; - debugfs_file_put(file->f_path.dentry); - } - - return ret; -} - -static const struct file_operations pseudo_measure_fops = { - .write = pseudo_lock_measure_trigger, - .open = simple_open, - .llseek = default_llseek, -}; - -/** - * rdtgroup_pseudo_lock_create - Create a pseudo-locked region - * @rdtgrp: resource group to which pseudo-lock region belongs - * - * Called when a resource group in the pseudo-locksetup mode receives a - * valid schemata that should be pseudo-locked. Since the resource group is - * in pseudo-locksetup mode the &struct pseudo_lock_region has already been - * allocated and initialized with the essential information. If a failure - * occurs the resource group remains in the pseudo-locksetup mode with the - * &struct pseudo_lock_region associated with it, but cleared from all - * information and ready for the user to re-attempt pseudo-locking by - * writing the schemata again. - * - * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0 - * on failure. Descriptive error will be written to last_cmd_status buffer. 
- */ -int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) -{ - struct pseudo_lock_region *plr = rdtgrp->plr; - struct task_struct *thread; - unsigned int new_minor; - struct device *dev; - char *kn_name __free(kfree) = NULL; - int ret; - - ret = pseudo_lock_region_alloc(plr); - if (ret < 0) - return ret; - - ret = pseudo_lock_cstates_constrain(plr); - if (ret < 0) { - ret = -EINVAL; - goto out_region; - } - kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL); - if (!kn_name) { - ret = -ENOMEM; - goto out_cstates; - } - - plr->thread_done = 0; - - thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr, - plr->cpu, "pseudo_lock/%u"); - if (IS_ERR(thread)) { - ret = PTR_ERR(thread); - rdt_last_cmd_printf("Locking thread returned error %d\n", ret); - goto out_cstates; - } - - ret = wait_event_interruptible(plr->lock_thread_wq, - plr->thread_done == 1); - if (ret < 0) { - /* - * If the thread does not get on the CPU for whatever - * reason and the process which sets up the region is - * interrupted then this will leave the thread in runnable - * state and once it gets on the CPU it will dereference - * the cleared, but not freed, plr struct resulting in an - * empty pseudo-locking loop. - */ - rdt_last_cmd_puts("Locking thread interrupted\n"); - goto out_cstates; - } - - ret = pseudo_lock_minor_get(&new_minor); - if (ret < 0) { - rdt_last_cmd_puts("Unable to obtain a new minor number\n"); - goto out_cstates; - } - - /* - * Unlock access but do not release the reference. The - * pseudo-locked region will still be here on return. - * - * The mutex has to be released temporarily to avoid a potential - * deadlock with the mm->mmap_lock which is obtained in the - * device_create() and debugfs_create_dir() callpath below as well as - * before the mmap() callback is called. - */ - mutex_unlock(&rdtgroup_mutex); - - if (!IS_ERR_OR_NULL(debugfs_resctrl)) { - plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl); - if (!IS_ERR_OR_NULL(plr->debugfs_dir)) - debugfs_create_file("pseudo_lock_measure", 0200, - plr->debugfs_dir, rdtgrp, - &pseudo_measure_fops); - } - - dev = device_create(&pseudo_lock_class, NULL, - MKDEV(pseudo_lock_major, new_minor), - rdtgrp, "%s", kn_name); - - mutex_lock(&rdtgroup_mutex); - - if (IS_ERR(dev)) { - ret = PTR_ERR(dev); - rdt_last_cmd_printf("Failed to create character device: %d\n", - ret); - goto out_debugfs; - } - - /* We released the mutex - check if group was removed while we did so */ - if (rdtgrp->flags & RDT_DELETED) { - ret = -ENODEV; - goto out_device; - } - - plr->minor = new_minor; - - rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; - closid_free(rdtgrp->closid); - rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); - rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); - - ret = 0; - goto out; - -out_device: - device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); -out_debugfs: - debugfs_remove_recursive(plr->debugfs_dir); - pseudo_lock_minor_release(new_minor); -out_cstates: - pseudo_lock_cstates_relax(plr); -out_region: - pseudo_lock_region_clear(plr); -out: - return ret; -} - -/** - * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region - * @rdtgrp: resource group to which the pseudo-locked region belongs - * - * The removal of a pseudo-locked region can be initiated when the resource - * group is removed from user space via a "rmdir" from userspace or the - * unmount of the resctrl filesystem. 
On removal the resource group does - * not go back to pseudo-locksetup mode before it is removed, instead it is - * removed directly. There is thus asymmetry with the creation where the - * &struct pseudo_lock_region is removed here while it was not created in - * rdtgroup_pseudo_lock_create(). - * - * Return: void - */ -void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) -{ - struct pseudo_lock_region *plr = rdtgrp->plr; - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - /* - * Default group cannot be a pseudo-locked region so we can - * free closid here. - */ - closid_free(rdtgrp->closid); - goto free; - } - - pseudo_lock_cstates_relax(plr); - debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); - device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); - pseudo_lock_minor_release(plr->minor); - -free: - pseudo_lock_free(rdtgrp); -} - -static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) -{ - struct rdtgroup *rdtgrp; - - mutex_lock(&rdtgroup_mutex); - - rdtgrp = region_find_by_minor(iminor(inode)); - if (!rdtgrp) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - - filp->private_data = rdtgrp; - atomic_inc(&rdtgrp->waitcount); - /* Perform a non-seekable open - llseek is not supported */ - filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); - - mutex_unlock(&rdtgroup_mutex); - - return 0; -} - -static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) -{ - struct rdtgroup *rdtgrp; - - mutex_lock(&rdtgroup_mutex); - rdtgrp = filp->private_data; - WARN_ON(!rdtgrp); - if (!rdtgrp) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - filp->private_data = NULL; - atomic_dec(&rdtgrp->waitcount); - mutex_unlock(&rdtgroup_mutex); - return 0; -} - -static int pseudo_lock_dev_mremap(struct vm_area_struct *area) -{ - /* Not supported */ - return -EINVAL; -} - -static const struct vm_operations_struct pseudo_mmap_ops = { - .mremap = pseudo_lock_dev_mremap, -}; - -static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) -{ - unsigned long vsize = vma->vm_end - vma->vm_start; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT; - struct pseudo_lock_region *plr; - struct rdtgroup *rdtgrp; - unsigned long physical; - unsigned long psize; - - mutex_lock(&rdtgroup_mutex); - - rdtgrp = filp->private_data; - WARN_ON(!rdtgrp); - if (!rdtgrp) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - - plr = rdtgrp->plr; - - if (!plr->d) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - - /* - * Task is required to run with affinity to the cpus associated - * with the pseudo-locked region. If this is not the case the task - * may be scheduled elsewhere and invalidate entries in the - * pseudo-locked region. - */ - if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) { - mutex_unlock(&rdtgroup_mutex); - return -EINVAL; - } - - physical = __pa(plr->kmem) >> PAGE_SHIFT; - psize = plr->size - off; - - if (off > plr->size) { - mutex_unlock(&rdtgroup_mutex); - return -ENOSPC; - } - - /* - * Ensure changes are carried directly to the memory being mapped, - * do not allow copy-on-write mapping. 
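/*
 * Illustrative sketch, not from this patch: the userspace side of the
 * checks made in this mmap handler. The thread first pins itself to the
 * CPUs of the cache domain backing the region, then maps the character
 * device with MAP_SHARED, since a private copy-on-write mapping is
 * rejected below. The device path "/dev/pseudo_lock/p0" and the region
 * size are assumptions for the example.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <sys/mman.h>
#include <unistd.h>

void *map_pseudo_locked(size_t size, cpu_set_t *domain_cpus)
{
	void *mem;
	int fd;

	/* Must run on a CPU of the cache instance backing the region. */
	if (sched_setaffinity(0, sizeof(*domain_cpus), domain_cpus))
		return NULL;

	fd = open("/dev/pseudo_lock/p0", O_RDWR);
	if (fd < 0)
		return NULL;

	mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	close(fd);
	return mem == MAP_FAILED ? NULL : mem;
}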
- */ - if (!(vma->vm_flags & VM_SHARED)) { - mutex_unlock(&rdtgroup_mutex); - return -EINVAL; - } - - if (vsize > psize) { - mutex_unlock(&rdtgroup_mutex); - return -ENOSPC; - } - - memset(plr->kmem + off, 0, vsize); - - if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, - vsize, vma->vm_page_prot)) { - mutex_unlock(&rdtgroup_mutex); - return -EAGAIN; - } - vma->vm_ops = &pseudo_mmap_ops; - mutex_unlock(&rdtgroup_mutex); - return 0; -} - -static const struct file_operations pseudo_lock_dev_fops = { - .owner = THIS_MODULE, - .read = NULL, - .write = NULL, - .open = pseudo_lock_dev_open, - .release = pseudo_lock_dev_release, - .mmap = pseudo_lock_dev_mmap, -}; - -int rdt_pseudo_lock_init(void) -{ - int ret; - - ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); - if (ret < 0) - return ret; - - pseudo_lock_major = ret; - - ret = class_register(&pseudo_lock_class); - if (ret) { - unregister_chrdev(pseudo_lock_major, "pseudo_lock"); - return ret; - } - - return 0; -} - -void rdt_pseudo_lock_release(void) -{ - class_unregister(&pseudo_lock_class); - unregister_chrdev(pseudo_lock_major, "pseudo_lock"); - pseudo_lock_major = 0; -} diff --git a/arch/x86/kernel/cpu/resctrl/trace.h b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h index 2a506316b303..7c8aef08010f 100644 --- a/arch/x86/kernel/cpu/resctrl/trace.h +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h @@ -2,8 +2,8 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM resctrl -#if !defined(_TRACE_RESCTRL_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_RESCTRL_H +#if !defined(_X86_RESCTRL_PSEUDO_LOCK_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _X86_RESCTRL_PSEUDO_LOCK_TRACE_H #include <linux/tracepoint.h> @@ -35,25 +35,11 @@ TRACE_EVENT(pseudo_lock_l3, TP_printk("hits=%llu miss=%llu", __entry->l3_hits, __entry->l3_miss)); -TRACE_EVENT(mon_llc_occupancy_limbo, - TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes), - TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes), - TP_STRUCT__entry(__field(u32, ctrl_hw_id) - __field(u32, mon_hw_id) - __field(int, domain_id) - __field(u64, llc_occupancy_bytes)), - TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id; - __entry->mon_hw_id = mon_hw_id; - __entry->domain_id = domain_id; - __entry->llc_occupancy_bytes = llc_occupancy_bytes;), - TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu", - __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id, - __entry->llc_occupancy_bytes) - ); - -#endif /* _TRACE_RESCTRL_H */ +#endif /* _X86_RESCTRL_PSEUDO_LOCK_TRACE_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . -#define TRACE_INCLUDE_FILE trace + +#define TRACE_INCLUDE_FILE pseudo_lock_trace + #include <trace/define_trace.h> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index c85ace29ea3a..885026468440 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -18,6 +18,7 @@ #include <linux/fs_parser.h> #include <linux/sysfs.h> #include <linux/kernfs.h> +#include <linux/resctrl.h> #include <linux/seq_buf.h> #include <linux/seq_file.h> #include <linux/sched/signal.h> @@ -29,341 +30,16 @@ #include <uapi/linux/magic.h> #include <asm/msr.h> -#include <asm/resctrl.h> #include "internal.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); -DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); -DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); - -/* Mutex to protect rdtgroup access. 
*/ -DEFINE_MUTEX(rdtgroup_mutex); - -static struct kernfs_root *rdt_root; -struct rdtgroup rdtgroup_default; -LIST_HEAD(rdt_all_groups); - -/* list of entries for the schemata file */ -LIST_HEAD(resctrl_schema_all); - -/* The filesystem can only be mounted once. */ -bool resctrl_mounted; - -/* Kernel fs node for "info" directory under root */ -static struct kernfs_node *kn_info; - -/* Kernel fs node for "mon_groups" directory under root */ -static struct kernfs_node *kn_mongrp; - -/* Kernel fs node for "mon_data" directory under root */ -static struct kernfs_node *kn_mondata; - -/* - * Used to store the max resource name width to display the schemata names in - * a tabular format. - */ -int max_name_width; - -static struct seq_buf last_cmd_status; -static char last_cmd_status_buf[512]; - -static int rdtgroup_setup_root(struct rdt_fs_context *ctx); -static void rdtgroup_destroy_root(void); - -struct dentry *debugfs_resctrl; - -/* - * Memory bandwidth monitoring event to use for the default CTRL_MON group - * and each new CTRL_MON group created by the user. Only relevant when - * the filesystem is mounted with the "mba_MBps" option so it does not - * matter that it remains uninitialized on systems that do not support - * the "mba_MBps" option. - */ -enum resctrl_event_id mba_mbps_default_event; - -static bool resctrl_debug; - -void rdt_last_cmd_clear(void) -{ - lockdep_assert_held(&rdtgroup_mutex); - seq_buf_clear(&last_cmd_status); -} - -void rdt_last_cmd_puts(const char *s) -{ - lockdep_assert_held(&rdtgroup_mutex); - seq_buf_puts(&last_cmd_status, s); -} - -void rdt_last_cmd_printf(const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - lockdep_assert_held(&rdtgroup_mutex); - seq_buf_vprintf(&last_cmd_status, fmt, ap); - va_end(ap); -} - -void rdt_staged_configs_clear(void) -{ - struct rdt_ctrl_domain *dom; - struct rdt_resource *r; - - lockdep_assert_held(&rdtgroup_mutex); - - for_each_alloc_capable_rdt_resource(r) { - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) - memset(dom->staged_config, 0, sizeof(dom->staged_config)); - } -} - -static bool resctrl_is_mbm_enabled(void) -{ - return (resctrl_arch_is_mbm_total_enabled() || - resctrl_arch_is_mbm_local_enabled()); -} - -static bool resctrl_is_mbm_event(int e) -{ - return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && - e <= QOS_L3_MBM_LOCAL_EVENT_ID); -} - -/* - * Trivial allocator for CLOSIDs. Since h/w only supports a small number, - * we can keep a bitmap of free CLOSIDs in a single integer. - * - * Using a global CLOSID across all resources has some advantages and - * some drawbacks: - * + We can simply set current's closid to assign a task to a resource - * group. - * + Context switch code can avoid extra memory references deciding which - * CLOSID to load into the PQR_ASSOC MSR - * - We give up some options in configuring resource groups across multi-socket - * systems. - * - Our choices on how to configure each resource become progressively more - * limited as the number of resources grows. 
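/*
 * Illustrative sketch, not from this patch: the "trivial allocator"
 * described above, reduced to a standalone, userspace-buildable demo.
 * One machine word is the whole free map: ffs() finds the lowest free
 * CLOSID, clearing that bit allocates it, setting it frees it. The
 * eight-CLOSID limit is an arbitrary assumption for the example.
 */
#include <stdio.h>
#include <strings.h>	/* ffs() */

static unsigned int demo_free_map = (1U << 8) - 1;	/* 8 CLOSIDs, all free */

static int demo_closid_alloc(void)
{
	int closid = ffs(demo_free_map);

	if (!closid)
		return -1;		/* no free CLOSID left */
	closid--;			/* ffs() is 1-based */
	demo_free_map &= ~(1U << closid);
	return closid;
}

static void demo_closid_free(int closid)
{
	demo_free_map |= 1U << closid;
}

int main(void)
{
	int a = demo_closid_alloc();	/* 0 */
	int b = demo_closid_alloc();	/* 1 */

	demo_closid_free(a);
	printf("%d %d %d\n", a, b, demo_closid_alloc());	/* prints "0 1 0" */
	return 0;
}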
- */ -static unsigned long closid_free_map; -static int closid_free_map_len; - -int closids_supported(void) -{ - return closid_free_map_len; -} - -static void closid_init(void) -{ - struct resctrl_schema *s; - u32 rdt_min_closid = 32; - - /* Compute rdt_min_closid across all resources */ - list_for_each_entry(s, &resctrl_schema_all, list) - rdt_min_closid = min(rdt_min_closid, s->num_closid); - - closid_free_map = BIT_MASK(rdt_min_closid) - 1; - - /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */ - __clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map); - closid_free_map_len = rdt_min_closid; -} - -static int closid_alloc(void) -{ - int cleanest_closid; - u32 closid; - - lockdep_assert_held(&rdtgroup_mutex); - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && - resctrl_arch_is_llc_occupancy_enabled()) { - cleanest_closid = resctrl_find_cleanest_closid(); - if (cleanest_closid < 0) - return cleanest_closid; - closid = cleanest_closid; - } else { - closid = ffs(closid_free_map); - if (closid == 0) - return -ENOSPC; - closid--; - } - __clear_bit(closid, &closid_free_map); - - return closid; -} - -void closid_free(int closid) -{ - lockdep_assert_held(&rdtgroup_mutex); - - __set_bit(closid, &closid_free_map); -} - -/** - * closid_allocated - test if provided closid is in use - * @closid: closid to be tested - * - * Return: true if @closid is currently associated with a resource group, - * false if @closid is free - */ -bool closid_allocated(unsigned int closid) -{ - lockdep_assert_held(&rdtgroup_mutex); - - return !test_bit(closid, &closid_free_map); -} - -/** - * rdtgroup_mode_by_closid - Return mode of resource group with closid - * @closid: closid if the resource group - * - * Each resource group is associated with a @closid. Here the mode - * of a resource group can be queried by searching for it using its closid. 
- * - * Return: mode as &enum rdtgrp_mode of resource group with closid @closid - */ -enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) -{ - struct rdtgroup *rdtgrp; - - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (rdtgrp->closid == closid) - return rdtgrp->mode; - } - - return RDT_NUM_MODES; -} - -static const char * const rdt_mode_str[] = { - [RDT_MODE_SHAREABLE] = "shareable", - [RDT_MODE_EXCLUSIVE] = "exclusive", - [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup", - [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked", -}; - -/** - * rdtgroup_mode_str - Return the string representation of mode - * @mode: the resource group mode as &enum rdtgroup_mode - * - * Return: string representation of valid mode, "unknown" otherwise - */ -static const char *rdtgroup_mode_str(enum rdtgrp_mode mode) -{ - if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES) - return "unknown"; - - return rdt_mode_str[mode]; -} -/* set uid and gid of rdtgroup dirs and files to that of the creator */ -static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) -{ - struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, - .ia_uid = current_fsuid(), - .ia_gid = current_fsgid(), }; - - if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && - gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) - return 0; - - return kernfs_setattr(kn, &iattr); -} - -static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) -{ - struct kernfs_node *kn; - int ret; - - kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, - 0, rft->kf_ops, rft, NULL, NULL); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) { - kernfs_remove(kn); - return ret; - } - - return 0; -} - -static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) -{ - struct kernfs_open_file *of = m->private; - struct rftype *rft = of->kn->priv; - - if (rft->seq_show) - return rft->seq_show(of, m, arg); - return 0; -} - -static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) -{ - struct rftype *rft = of->kn->priv; - - if (rft->write) - return rft->write(of, buf, nbytes, off); - - return -EINVAL; -} - -static const struct kernfs_ops rdtgroup_kf_single_ops = { - .atomic_write_len = PAGE_SIZE, - .write = rdtgroup_file_write, - .seq_show = rdtgroup_seqfile_show, -}; - -static const struct kernfs_ops kf_mondata_ops = { - .atomic_write_len = PAGE_SIZE, - .seq_show = rdtgroup_mondata_show, -}; - -static bool is_cpu_list(struct kernfs_open_file *of) -{ - struct rftype *rft = of->kn->priv; - - return rft->flags & RFTYPE_FLAGS_CPUS_LIST; -} - -static int rdtgroup_cpus_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - struct cpumask *mask; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - - if (rdtgrp) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - if (!rdtgrp->plr->d) { - rdt_last_cmd_clear(); - rdt_last_cmd_puts("Cache domain offline\n"); - ret = -ENODEV; - } else { - mask = &rdtgrp->plr->d->hdr.cpu_mask; - seq_printf(s, is_cpu_list(of) ? - "%*pbl\n" : "%*pb\n", - cpumask_pr_args(mask)); - } - } else { - seq_printf(s, is_cpu_list(of) ? 
"%*pbl\n" : "%*pb\n", - cpumask_pr_args(&rdtgrp->cpu_mask)); - } - } else { - ret = -ENOENT; - } - rdtgroup_kn_unlock(of->kn); +DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); - return ret; -} +DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); /* - * This is safe against resctrl_sched_in() called from __switch_to() + * This is safe against resctrl_arch_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call * from update_closid_rmid() is protected against __switch_to() because * preemption is disabled. @@ -382,1223 +58,7 @@ void resctrl_arch_sync_cpu_closid_rmid(void *info) * executing task might have its own closid selected. Just reuse * the context switch code. */ - resctrl_sched_in(current); -} - -/* - * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, - * - * Per task closids/rmids must have been set up before calling this function. - * @r may be NULL. - */ -static void -update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) -{ - struct resctrl_cpu_defaults defaults, *p = NULL; - - if (r) { - defaults.closid = r->closid; - defaults.rmid = r->mon.rmid; - p = &defaults; - } - - on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1); -} - -static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, - cpumask_var_t tmpmask) -{ - struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; - struct list_head *head; - - /* Check whether cpus belong to parent ctrl group */ - cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); - if (!cpumask_empty(tmpmask)) { - rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); - return -EINVAL; - } - - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (!cpumask_empty(tmpmask)) { - /* Give any dropped cpus to parent rdtgroup */ - cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); - update_closid_rmid(tmpmask, prgrp); - } - - /* - * If we added cpus, remove them from previous group that owned them - * and update per-cpu rmid - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (!cpumask_empty(tmpmask)) { - head = &prgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { - if (crgrp == rdtgrp) - continue; - cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, - tmpmask); - } - update_closid_rmid(tmpmask, rdtgrp); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); - - return 0; -} - -static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) -{ - struct rdtgroup *crgrp; - - cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); - /* update the child mon group masks as well*/ - list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) - cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); -} - -static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, - cpumask_var_t tmpmask, cpumask_var_t tmpmask1) -{ - struct rdtgroup *r, *crgrp; - struct list_head *head; - - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (!cpumask_empty(tmpmask)) { - /* Can't drop from default group */ - if (rdtgrp == &rdtgroup_default) { - rdt_last_cmd_puts("Can't drop CPUs from default group\n"); - return -EINVAL; - } - - /* Give any dropped cpus to rdtgroup_default */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, tmpmask); - update_closid_rmid(tmpmask, &rdtgroup_default); - } - - /* - * If 
we added cpus, remove them from previous group and - * the prev group's child groups that owned them - * and update per-cpu closid/rmid. - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (!cpumask_empty(tmpmask)) { - list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { - if (r == rdtgrp) - continue; - cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); - if (!cpumask_empty(tmpmask1)) - cpumask_rdtgrp_clear(r, tmpmask1); - } - update_closid_rmid(tmpmask, rdtgrp); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); - - /* - * Clear child mon group masks since there is a new parent mask - * now and update the rmid for the cpus the child lost. - */ - head = &rdtgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { - cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); - update_closid_rmid(tmpmask, rdtgrp); - cpumask_clear(&crgrp->cpu_mask); - } - - return 0; -} - -static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - cpumask_var_t tmpmask, newmask, tmpmask1; - struct rdtgroup *rdtgrp; - int ret; - - if (!buf) - return -EINVAL; - - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; - if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { - free_cpumask_var(tmpmask); - return -ENOMEM; - } - if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { - free_cpumask_var(tmpmask); - free_cpumask_var(newmask); - return -ENOMEM; - } - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - ret = -ENOENT; - goto unlock; - } - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = -EINVAL; - rdt_last_cmd_puts("Pseudo-locking in progress\n"); - goto unlock; - } - - if (is_cpu_list(of)) - ret = cpulist_parse(buf, newmask); - else - ret = cpumask_parse(buf, newmask); - - if (ret) { - rdt_last_cmd_puts("Bad CPU list/mask\n"); - goto unlock; - } - - /* check that user didn't specify any offline cpus */ - cpumask_andnot(tmpmask, newmask, cpu_online_mask); - if (!cpumask_empty(tmpmask)) { - ret = -EINVAL; - rdt_last_cmd_puts("Can only assign online CPUs\n"); - goto unlock; - } - - if (rdtgrp->type == RDTCTRL_GROUP) - ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); - else if (rdtgrp->type == RDTMON_GROUP) - ret = cpus_mon_write(rdtgrp, newmask, tmpmask); - else - ret = -EINVAL; - -unlock: - rdtgroup_kn_unlock(of->kn); - free_cpumask_var(tmpmask); - free_cpumask_var(newmask); - free_cpumask_var(tmpmask1); - - return ret ?: nbytes; -} - -/** - * rdtgroup_remove - the helper to remove resource group safely - * @rdtgrp: resource group to remove - * - * On resource group creation via a mkdir, an extra kernfs_node reference is - * taken to ensure that the rdtgroup structure remains accessible for the - * rdtgroup_kn_unlock() calls where it is removed. - * - * Drop the extra reference here, then free the rdtgroup structure. - * - * Return: void - */ -static void rdtgroup_remove(struct rdtgroup *rdtgrp) -{ - kernfs_put(rdtgrp->kn); - kfree(rdtgrp); -} - -static void _update_task_closid_rmid(void *task) -{ - /* - * If the task is still current on this CPU, update PQR_ASSOC MSR. - * Otherwise, the MSR is updated when the task is scheduled in. 
- */ - if (task == current) - resctrl_sched_in(task); -} - -static void update_task_closid_rmid(struct task_struct *t) -{ - if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) - smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); - else - _update_task_closid_rmid(t); -} - -static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp) -{ - u32 closid, rmid = rdtgrp->mon.rmid; - - if (rdtgrp->type == RDTCTRL_GROUP) - closid = rdtgrp->closid; - else if (rdtgrp->type == RDTMON_GROUP) - closid = rdtgrp->mon.parent->closid; - else - return false; - - return resctrl_arch_match_closid(tsk, closid) && - resctrl_arch_match_rmid(tsk, closid, rmid); -} - -static int __rdtgroup_move_task(struct task_struct *tsk, - struct rdtgroup *rdtgrp) -{ - /* If the task is already in rdtgrp, no need to move the task. */ - if (task_in_rdtgroup(tsk, rdtgrp)) - return 0; - - /* - * Set the task's closid/rmid before the PQR_ASSOC MSR can be - * updated by them. - * - * For ctrl_mon groups, move both closid and rmid. - * For monitor groups, can move the tasks only from - * their parent CTRL group. - */ - if (rdtgrp->type == RDTMON_GROUP && - !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) { - rdt_last_cmd_puts("Can't move task to different control group\n"); - return -EINVAL; - } - - if (rdtgrp->type == RDTMON_GROUP) - resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid, - rdtgrp->mon.rmid); - else - resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid, - rdtgrp->mon.rmid); - - /* - * Ensure the task's closid and rmid are written before determining if - * the task is current that will decide if it will be interrupted. - * This pairs with the full barrier between the rq->curr update and - * resctrl_sched_in() during context switch. - */ - smp_mb(); - - /* - * By now, the task's closid and rmid are set. If the task is current - * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource - * group go into effect. If the task is not current, the MSR will be - * updated when the task is scheduled in. - */ - update_task_closid_rmid(tsk); - - return 0; -} - -static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) && - resctrl_arch_match_closid(t, r->closid)); -} - -static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) && - resctrl_arch_match_rmid(t, r->mon.parent->closid, - r->mon.rmid)); -} - -/** - * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group - * @r: Resource group - * - * Return: 1 if tasks have been assigned to @r, 0 otherwise - */ -int rdtgroup_tasks_assigned(struct rdtgroup *r) -{ - struct task_struct *p, *t; - int ret = 0; - - lockdep_assert_held(&rdtgroup_mutex); - - rcu_read_lock(); - for_each_process_thread(p, t) { - if (is_closid_match(t, r) || is_rmid_match(t, r)) { - ret = 1; - break; - } - } - rcu_read_unlock(); - - return ret; -} - -static int rdtgroup_task_write_permission(struct task_struct *task, - struct kernfs_open_file *of) -{ - const struct cred *tcred = get_task_cred(task); - const struct cred *cred = current_cred(); - int ret = 0; - - /* - * Even if we're attaching all tasks in the thread group, we only - * need to check permissions on one of them. 
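/*
 * Illustrative sketch, not from this patch: the userspace action these
 * task-move paths service. A task is moved by writing its PID to the
 * group's "tasks" file; PIDs may be comma-separated, "0" means the
 * calling task itself, and the writer must be root or have an euid
 * matching the target task's uid or suid, as checked below. The
 * /sys/fs/resctrl mount point and the group name "group0" are
 * assumptions for the example.
 */
#include <stdio.h>
#include <sys/types.h>

int move_task_to_group(pid_t pid)
{
	FILE *f = fopen("/sys/fs/resctrl/group0/tasks", "w");

	if (!f)
		return -1;
	if (fprintf(f, "%d\n", (int)pid) < 0) {
		fclose(f);
		return -1;
	}
	/* Errors from the kernel write surface when the stream is flushed. */
	return fclose(f);
}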
- */ - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) { - rdt_last_cmd_printf("No permission to move task %d\n", task->pid); - ret = -EPERM; - } - - put_cred(tcred); - return ret; -} - -static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, - struct kernfs_open_file *of) -{ - struct task_struct *tsk; - int ret; - - rcu_read_lock(); - if (pid) { - tsk = find_task_by_vpid(pid); - if (!tsk) { - rcu_read_unlock(); - rdt_last_cmd_printf("No task %d\n", pid); - return -ESRCH; - } - } else { - tsk = current; - } - - get_task_struct(tsk); - rcu_read_unlock(); - - ret = rdtgroup_task_write_permission(tsk, of); - if (!ret) - ret = __rdtgroup_move_task(tsk, rdtgrp); - - put_task_struct(tsk); - return ret; -} - -static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdtgroup *rdtgrp; - char *pid_str; - int ret = 0; - pid_t pid; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - rdt_last_cmd_clear(); - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = -EINVAL; - rdt_last_cmd_puts("Pseudo-locking in progress\n"); - goto unlock; - } - - while (buf && buf[0] != '\0' && buf[0] != '\n') { - pid_str = strim(strsep(&buf, ",")); - - if (kstrtoint(pid_str, 0, &pid)) { - rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); - ret = -EINVAL; - break; - } - - if (pid < 0) { - rdt_last_cmd_printf("Invalid pid %d\n", pid); - ret = -EINVAL; - break; - } - - ret = rdtgroup_move_task(pid, rdtgrp, of); - if (ret) { - rdt_last_cmd_printf("Error while processing task %d\n", pid); - break; - } - } - -unlock: - rdtgroup_kn_unlock(of->kn); - - return ret ?: nbytes; -} - -static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) -{ - struct task_struct *p, *t; - pid_t pid; - - rcu_read_lock(); - for_each_process_thread(p, t) { - if (is_closid_match(t, r) || is_rmid_match(t, r)) { - pid = task_pid_vnr(t); - if (pid) - seq_printf(s, "%d\n", pid); - } - } - rcu_read_unlock(); -} - -static int rdtgroup_tasks_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - show_rdt_tasks(rdtgrp, s); - else - ret = -ENOENT; - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -static int rdtgroup_closid_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - seq_printf(s, "%u\n", rdtgrp->closid); - else - ret = -ENOENT; - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -static int rdtgroup_rmid_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - seq_printf(s, "%u\n", rdtgrp->mon.rmid); - else - ret = -ENOENT; - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -#ifdef CONFIG_PROC_CPU_RESCTRL - -/* - * A task can only be part of one resctrl control group and of one monitor - * group which is associated to that control group. - * - * 1) res: - * mon: - * - * resctrl is not available. - * - * 2) res:/ - * mon: - * - * Task is part of the root resctrl control group, and it is not associated - * to any monitor group. - * - * 3) res:/ - * mon:mon0 - * - * Task is part of the root resctrl control group and monitor group mon0. 
- * - * 4) res:group0 - * mon: - * - * Task is part of resctrl control group group0, and it is not associated - * to any monitor group. - * - * 5) res:group0 - * mon:mon1 - * - * Task is part of resctrl control group group0 and monitor group mon1. - */ -int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk) -{ - struct rdtgroup *rdtg; - int ret = 0; - - mutex_lock(&rdtgroup_mutex); - - /* Return empty if resctrl has not been mounted. */ - if (!resctrl_mounted) { - seq_puts(s, "res:\nmon:\n"); - goto unlock; - } - - list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) { - struct rdtgroup *crg; - - /* - * Task information is only relevant for shareable - * and exclusive groups. - */ - if (rdtg->mode != RDT_MODE_SHAREABLE && - rdtg->mode != RDT_MODE_EXCLUSIVE) - continue; - - if (!resctrl_arch_match_closid(tsk, rdtg->closid)) - continue; - - seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", - rdt_kn_name(rdtg->kn)); - seq_puts(s, "mon:"); - list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, - mon.crdtgrp_list) { - if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, - crg->mon.rmid)) - continue; - seq_printf(s, "%s", rdt_kn_name(crg->kn)); - break; - } - seq_putc(s, '\n'); - goto unlock; - } - /* - * The above search should succeed. Otherwise return - * with an error. - */ - ret = -ENOENT; -unlock: - mutex_unlock(&rdtgroup_mutex); - - return ret; -} -#endif - -static int rdt_last_cmd_status_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - int len; - - mutex_lock(&rdtgroup_mutex); - len = seq_buf_used(&last_cmd_status); - if (len) - seq_printf(seq, "%.*s", len, last_cmd_status_buf); - else - seq_puts(seq, "ok\n"); - mutex_unlock(&rdtgroup_mutex); - return 0; -} - -static void *rdt_kn_parent_priv(struct kernfs_node *kn) -{ - /* - * The parent pointer is only valid within RCU section since it can be - * replaced. - */ - guard(rcu)(); - return rcu_dereference(kn->__parent)->priv; -} - -static int rdt_num_closids_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - - seq_printf(seq, "%u\n", s->num_closid); - return 0; -} - -static int rdt_default_ctrl_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r)); - return 0; -} - -static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->cache.min_cbm_bits); - return 0; -} - -static int rdt_shareable_bits_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%x\n", r->cache.shareable_bits); - return 0; -} - -/* - * rdt_bit_usage_show - Display current usage of resources - * - * A domain is a shared resource that can now be allocated differently. Here - * we display the current regions of the domain as an annotated bitmask. 
- * For each domain of this resource its allocation bitmask - * is annotated as below to indicate the current usage of the corresponding bit: - * 0 - currently unused - * X - currently available for sharing and used by software and hardware - * H - currently used by hardware only but available for software use - * S - currently used and shareable by software only - * E - currently used exclusively by one resource group - * P - currently pseudo-locked by one resource group - */ -static int rdt_bit_usage_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - /* - * Use unsigned long even though only 32 bits are used to ensure - * test_bit() is used safely. - */ - unsigned long sw_shareable = 0, hw_shareable = 0; - unsigned long exclusive = 0, pseudo_locked = 0; - struct rdt_resource *r = s->res; - struct rdt_ctrl_domain *dom; - int i, hwb, swb, excl, psl; - enum rdtgrp_mode mode; - bool sep = false; - u32 ctrl_val; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - hw_shareable = r->cache.shareable_bits; - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { - if (sep) - seq_putc(seq, ';'); - sw_shareable = 0; - exclusive = 0; - seq_printf(seq, "%d=", dom->hdr.id); - for (i = 0; i < closids_supported(); i++) { - if (!closid_allocated(i)) - continue; - ctrl_val = resctrl_arch_get_config(r, dom, i, - s->conf_type); - mode = rdtgroup_mode_by_closid(i); - switch (mode) { - case RDT_MODE_SHAREABLE: - sw_shareable |= ctrl_val; - break; - case RDT_MODE_EXCLUSIVE: - exclusive |= ctrl_val; - break; - case RDT_MODE_PSEUDO_LOCKSETUP: - /* - * RDT_MODE_PSEUDO_LOCKSETUP is possible - * here but not included since the CBM - * associated with this CLOSID in this mode - * is not initialized and no task or cpu can be - * assigned this CLOSID. - */ - break; - case RDT_MODE_PSEUDO_LOCKED: - case RDT_NUM_MODES: - WARN(1, - "invalid mode for closid %d\n", i); - break; - } - } - for (i = r->cache.cbm_len - 1; i >= 0; i--) { - pseudo_locked = dom->plr ? 
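/*
 * Illustration with hypothetical values: on a resource with a 16-bit CBM
 * and two domains, where one group owns the top four bits of domain 0
 * exclusively and software shares everything else, the annotated bitmask
 * described by the legend above reads
 *
 *   0=EEEESSSSSSSSSSSS;1=SSSSSSSSSSSSSSSS
 *
 * with one character per CBM bit, most significant bit first, and domains
 * separated by ';'.
 */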
dom->plr->cbm : 0; - hwb = test_bit(i, &hw_shareable); - swb = test_bit(i, &sw_shareable); - excl = test_bit(i, &exclusive); - psl = test_bit(i, &pseudo_locked); - if (hwb && swb) - seq_putc(seq, 'X'); - else if (hwb && !swb) - seq_putc(seq, 'H'); - else if (!hwb && swb) - seq_putc(seq, 'S'); - else if (excl) - seq_putc(seq, 'E'); - else if (psl) - seq_putc(seq, 'P'); - else /* Unused bits remain */ - seq_putc(seq, '0'); - } - sep = true; - } - seq_putc(seq, '\n'); - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - return 0; -} - -static int rdt_min_bw_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->membw.min_bw); - return 0; -} - -static int rdt_num_rmids_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - - seq_printf(seq, "%d\n", r->num_rmid); - - return 0; -} - -static int rdt_mon_features_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - struct mon_evt *mevt; - - list_for_each_entry(mevt, &r->evt_list, list) { - seq_printf(seq, "%s\n", mevt->name); - if (mevt->configurable) - seq_printf(seq, "%s_config\n", mevt->name); - } - - return 0; -} - -static int rdt_bw_gran_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->membw.bw_gran); - return 0; -} - -static int rdt_delay_linear_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->membw.delay_linear); - return 0; -} - -static int max_threshold_occ_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold); - - return 0; -} - -static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - switch (r->membw.throttle_mode) { - case THREAD_THROTTLE_PER_THREAD: - seq_puts(seq, "per-thread\n"); - return 0; - case THREAD_THROTTLE_MAX: - seq_puts(seq, "max\n"); - return 0; - case THREAD_THROTTLE_UNDEFINED: - seq_puts(seq, "undefined\n"); - return 0; - } - - WARN_ON_ONCE(1); - - return 0; -} - -static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - unsigned int bytes; - int ret; - - ret = kstrtouint(buf, 0, &bytes); - if (ret) - return ret; - - if (bytes > resctrl_rmid_realloc_limit) - return -EINVAL; - - resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes); - - return nbytes; -} - -/* - * rdtgroup_mode_show - Display mode of this resource group - */ -static int rdtgroup_mode_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode)); - - rdtgroup_kn_unlock(of->kn); - return 0; -} - -static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) -{ - switch (my_type) { - case CDP_CODE: - return CDP_DATA; - case CDP_DATA: - return CDP_CODE; - default: - case CDP_NONE: - return 
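/*
 * Illustration: on hardware where the bandwidth events are configurable,
 * rdt_mon_features_show() above lists each event followed by a matching
 * "_config" entry, for example
 *
 *   llc_occupancy
 *   mbm_total_bytes
 *   mbm_total_bytes_config
 *   mbm_local_bytes
 *   mbm_local_bytes_config
 */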
CDP_NONE; - } -} - -static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks); - - return 0; -} - -/** - * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other - * @r: Resource to which domain instance @d belongs. - * @d: The domain instance for which @closid is being tested. - * @cbm: Capacity bitmask being tested. - * @closid: Intended closid for @cbm. - * @type: CDP type of @r. - * @exclusive: Only check if overlaps with exclusive resource groups - * - * Checks if provided @cbm intended to be used for @closid on domain - * @d overlaps with any other closids or other hardware usage associated - * with this domain. If @exclusive is true then only overlaps with - * resource groups in exclusive mode will be considered. If @exclusive - * is false then overlaps with any resource group or hardware entities - * will be considered. - * - * @cbm is unsigned long, even if only 32 bits are used, to make the - * bitmap functions work correctly. - * - * Return: false if CBM does not overlap, true if it does. - */ -static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d, - unsigned long cbm, int closid, - enum resctrl_conf_type type, bool exclusive) -{ - enum rdtgrp_mode mode; - unsigned long ctrl_b; - int i; - - /* Check for any overlap with regions used by hardware directly */ - if (!exclusive) { - ctrl_b = r->cache.shareable_bits; - if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) - return true; - } - - /* Check for overlap with other resource groups */ - for (i = 0; i < closids_supported(); i++) { - ctrl_b = resctrl_arch_get_config(r, d, i, type); - mode = rdtgroup_mode_by_closid(i); - if (closid_allocated(i) && i != closid && - mode != RDT_MODE_PSEUDO_LOCKSETUP) { - if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) { - if (exclusive) { - if (mode == RDT_MODE_EXCLUSIVE) - return true; - continue; - } - return true; - } - } - } - - return false; -} - -/** - * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware - * @s: Schema for the resource to which domain instance @d belongs. - * @d: The domain instance for which @closid is being tested. - * @cbm: Capacity bitmask being tested. - * @closid: Intended closid for @cbm. - * @exclusive: Only check if overlaps with exclusive resource groups - * - * Resources that can be allocated using a CBM can use the CBM to control - * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test - * for overlap. Overlap test is not limited to the specific resource for - * which the CBM is intended though - when dealing with CDP resources that - * share the underlying hardware the overlap check should be performed on - * the CDP resource sharing the hardware also. - * - * Refer to description of __rdtgroup_cbm_overlaps() for the details of the - * overlap test. 
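/*
 * Worked example: CBMs 0x3c0 (bits 6-9) and 0x0f0 (bits 4-7) share bits 6
 * and 7, so bitmap_intersects() reports an overlap; 0x3c0 and 0x00f
 * (bits 0-3) are disjoint and do not.
 */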
- * - * Return: true if CBM overlap detected, false if there is no overlap - */ -bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, - unsigned long cbm, int closid, bool exclusive) -{ - enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); - struct rdt_resource *r = s->res; - - if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type, - exclusive)) - return true; - - if (!resctrl_arch_get_cdp_enabled(r->rid)) - return false; - return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive); -} - -/** - * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive - * @rdtgrp: Resource group identified through its closid. - * - * An exclusive resource group implies that there should be no sharing of - * its allocated resources. At the time this group is considered to be - * exclusive this test can determine if its current schemata supports this - * setting by testing for overlap with all other resource groups. - * - * Return: true if resource group can be exclusive, false if there is overlap - * with allocations of other resource groups and thus this resource group - * cannot be exclusive. - */ -static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) -{ - int closid = rdtgrp->closid; - struct rdt_ctrl_domain *d; - struct resctrl_schema *s; - struct rdt_resource *r; - bool has_cache = false; - u32 ctrl; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) - continue; - has_cache = true; - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - ctrl = resctrl_arch_get_config(r, d, closid, - s->conf_type); - if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { - rdt_last_cmd_puts("Schemata overlaps\n"); - return false; - } - } - } - - if (!has_cache) { - rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n"); - return false; - } - - return true; -} - -/* - * rdtgroup_mode_write - Modify the resource group's mode - */ -static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdtgroup *rdtgrp; - enum rdtgrp_mode mode; - int ret = 0; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - buf[nbytes - 1] = '\0'; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - rdt_last_cmd_clear(); - - mode = rdtgrp->mode; - - if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || - (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) || - (!strcmp(buf, "pseudo-locksetup") && - mode == RDT_MODE_PSEUDO_LOCKSETUP) || - (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED)) - goto out; - - if (mode == RDT_MODE_PSEUDO_LOCKED) { - rdt_last_cmd_puts("Cannot change pseudo-locked group\n"); - ret = -EINVAL; - goto out; - } - - if (!strcmp(buf, "shareable")) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = rdtgroup_locksetup_exit(rdtgrp); - if (ret) - goto out; - } - rdtgrp->mode = RDT_MODE_SHAREABLE; - } else if (!strcmp(buf, "exclusive")) { - if (!rdtgroup_mode_test_exclusive(rdtgrp)) { - ret = -EINVAL; - goto out; - } - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = rdtgroup_locksetup_exit(rdtgrp); - if (ret) - goto out; - } - rdtgrp->mode = RDT_MODE_EXCLUSIVE; - } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) && - !strcmp(buf, 
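/*
 * Illustration with a hypothetical group name: the accepted keywords are
 * "shareable", "exclusive" and, when pseudo-locking is built in,
 * "pseudo-locksetup", e.g.
 *
 *   echo exclusive > /sys/fs/resctrl/grp0/mode
 *
 * The write is rejected with -EINVAL if rdtgroup_mode_test_exclusive()
 * above finds the group's current schemata overlapping another group.
 */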
"pseudo-locksetup")) { - ret = rdtgroup_locksetup_enter(rdtgrp); - if (ret) - goto out; - rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP; - } else { - rdt_last_cmd_puts("Unknown or unsupported mode\n"); - ret = -EINVAL; - } - -out: - rdtgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - -/** - * rdtgroup_cbm_to_size - Translate CBM to size in bytes - * @r: RDT resource to which @d belongs. - * @d: RDT domain instance. - * @cbm: bitmask for which the size should be computed. - * - * The bitmask provided associated with the RDT domain instance @d will be - * translated into how many bytes it represents. The size in bytes is - * computed by first dividing the total cache size by the CBM length to - * determine how many bytes each bit in the bitmask represents. The result - * is multiplied with the number of bits set in the bitmask. - * - * @cbm is unsigned long, even if only 32 bits are used to make the - * bitmap functions work correctly. - */ -unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, - struct rdt_ctrl_domain *d, unsigned long cbm) -{ - unsigned int size = 0; - struct cacheinfo *ci; - int num_b; - - if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE)) - return size; - - num_b = bitmap_weight(&cbm, r->cache.cbm_len); - ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope); - if (ci) - size = ci->size / r->cache.cbm_len * num_b; - - return size; -} - -/* - * rdtgroup_size_show - Display size in bytes of allocated regions - * - * The "size" file mirrors the layout of the "schemata" file, printing the - * size in bytes of each region instead of the capacity bitmask. - */ -static int rdtgroup_size_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct resctrl_schema *schema; - enum resctrl_conf_type type; - struct rdt_ctrl_domain *d; - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - unsigned int size; - int ret = 0; - u32 closid; - bool sep; - u32 ctrl; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - if (!rdtgrp->plr->d) { - rdt_last_cmd_clear(); - rdt_last_cmd_puts("Cache domain offline\n"); - ret = -ENODEV; - } else { - seq_printf(s, "%*s:", max_name_width, - rdtgrp->plr->s->name); - size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, - rdtgrp->plr->d, - rdtgrp->plr->cbm); - seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size); - } - goto out; - } - - closid = rdtgrp->closid; - - list_for_each_entry(schema, &resctrl_schema_all, list) { - r = schema->res; - type = schema->conf_type; - sep = false; - seq_printf(s, "%*s:", max_name_width, schema->name); - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - if (sep) - seq_putc(s, ';'); - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - size = 0; - } else { - if (is_mba_sc(r)) - ctrl = d->mbps_val[closid]; - else - ctrl = resctrl_arch_get_config(r, d, - closid, - type); - if (r->rid == RDT_RESOURCE_MBA || - r->rid == RDT_RESOURCE_SMBA) - size = ctrl; - else - size = rdtgroup_cbm_to_size(r, d, ctrl); - } - seq_printf(s, "%d=%u", d->hdr.id, size); - sep = true; - } - seq_putc(s, '\n'); - } - -out: - rdtgroup_kn_unlock(of->kn); - - return ret; + resctrl_arch_sched_in(current); } #define INVALID_CONFIG_INDEX UINT_MAX @@ -1642,62 +102,6 @@ void resctrl_arch_mon_event_config_read(void *_config_info) config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS; } -static void mondata_config_read(struct resctrl_mon_config_info *mon_info) -{ - 
smp_call_function_any(&mon_info->d->hdr.cpu_mask, - resctrl_arch_mon_event_config_read, mon_info, 1); -} - -static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) -{ - struct resctrl_mon_config_info mon_info; - struct rdt_mon_domain *dom; - bool sep = false; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - list_for_each_entry(dom, &r->mon_domains, hdr.list) { - if (sep) - seq_puts(s, ";"); - - memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info)); - mon_info.r = r; - mon_info.d = dom; - mon_info.evtid = evtid; - mondata_config_read(&mon_info); - - seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config); - sep = true; - } - seq_puts(s, "\n"); - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - return 0; -} - -static int mbm_total_bytes_config_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - - mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID); - - return 0; -} - -static int mbm_local_bytes_config_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - - mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID); - - return 0; -} - void resctrl_arch_mon_event_config_write(void *_config_info) { struct resctrl_mon_config_info *config_info = _config_info; @@ -1711,618 +115,6 @@ void resctrl_arch_mon_event_config_write(void *_config_info) wrmsrq(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config); } -static void mbm_config_write_domain(struct rdt_resource *r, - struct rdt_mon_domain *d, u32 evtid, u32 val) -{ - struct resctrl_mon_config_info mon_info = {0}; - - /* - * Read the current config value first. If both are the same then - * no need to write it again. - */ - mon_info.r = r; - mon_info.d = d; - mon_info.evtid = evtid; - mondata_config_read(&mon_info); - if (mon_info.mon_config == val) - return; - - mon_info.mon_config = val; - - /* - * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the - * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE - * are scoped at the domain level. Writing any of these MSRs - * on one CPU is observed by all the CPUs in the domain. - */ - smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write, - &mon_info, 1); - - /* - * When an Event Configuration is changed, the bandwidth counters - * for all RMIDs and Events will be cleared by the hardware. The - * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for - * every RMID on the next read to any event for every RMID. - * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62) - * cleared while it is tracked by the hardware. Clear the - * mbm_local and mbm_total counts for all the RMIDs. 
- */ - resctrl_arch_reset_rmid_all(r, d); -} - -static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) -{ - char *dom_str = NULL, *id_str; - unsigned long dom_id, val; - struct rdt_mon_domain *d; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - -next: - if (!tok || tok[0] == '\0') - return 0; - - /* Start processing the strings for each domain */ - dom_str = strim(strsep(&tok, ";")); - id_str = strsep(&dom_str, "="); - - if (!id_str || kstrtoul(id_str, 10, &dom_id)) { - rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n"); - return -EINVAL; - } - - if (!dom_str || kstrtoul(dom_str, 16, &val)) { - rdt_last_cmd_puts("Non-numeric event configuration value\n"); - return -EINVAL; - } - - /* Value from user cannot be more than the supported set of events */ - if ((val & r->mbm_cfg_mask) != val) { - rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", - r->mbm_cfg_mask); - return -EINVAL; - } - - list_for_each_entry(d, &r->mon_domains, hdr.list) { - if (d->hdr.id == dom_id) { - mbm_config_write_domain(r, d, evtid, val); - goto next; - } - } - - return -EINVAL; -} - -static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - int ret; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - rdt_last_cmd_clear(); - - buf[nbytes - 1] = '\0'; - - ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - return ret ?: nbytes; -} - -static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - int ret; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - rdt_last_cmd_clear(); - - buf[nbytes - 1] = '\0'; - - ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - return ret ?: nbytes; -} - -/* rdtgroup information files for one cache resource. 
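/*
 * Illustration: event configuration writes are "<domain id>=<value>" pairs
 * separated by ';', with the value parsed as hex and limited by
 * r->mbm_cfg_mask, e.g.
 *
 *   echo "0=0x7f;1=0x7f" > /sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config
 *
 * Reading the file back shows one "<domain id>=0x<value>" entry per domain
 * in the same ';'-separated form.
 */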
*/ -static struct rftype res_common_files[] = { - { - .name = "last_cmd_status", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_last_cmd_status_show, - .fflags = RFTYPE_TOP_INFO, - }, - { - .name = "num_closids", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_closids_show, - .fflags = RFTYPE_CTRL_INFO, - }, - { - .name = "mon_features", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_mon_features_show, - .fflags = RFTYPE_MON_INFO, - }, - { - .name = "num_rmids", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_rmids_show, - .fflags = RFTYPE_MON_INFO, - }, - { - .name = "cbm_mask", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_default_ctrl_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "min_cbm_bits", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_min_cbm_bits_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "shareable_bits", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_shareable_bits_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "bit_usage", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_bit_usage_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "min_bandwidth", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_min_bw_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, - }, - { - .name = "bandwidth_gran", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_bw_gran_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, - }, - { - .name = "delay_linear", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_delay_linear_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, - }, - /* - * Platform specific which (if any) capabilities are provided by - * thread_throttle_mode. Defer "fflags" initialization to platform - * discovery. 
- */ - { - .name = "thread_throttle_mode", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_thread_throttle_mode_show, - }, - { - .name = "max_threshold_occupancy", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = max_threshold_occ_write, - .seq_show = max_threshold_occ_show, - .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "mbm_total_bytes_config", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = mbm_total_bytes_config_show, - .write = mbm_total_bytes_config_write, - }, - { - .name = "mbm_local_bytes_config", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = mbm_local_bytes_config_show, - .write = mbm_local_bytes_config_write, - }, - { - .name = "cpus", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .fflags = RFTYPE_BASE, - }, - { - .name = "cpus_list", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .flags = RFTYPE_FLAGS_CPUS_LIST, - .fflags = RFTYPE_BASE, - }, - { - .name = "tasks", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_tasks_write, - .seq_show = rdtgroup_tasks_show, - .fflags = RFTYPE_BASE, - }, - { - .name = "mon_hw_id", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdtgroup_rmid_show, - .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG, - }, - { - .name = "schemata", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_schemata_write, - .seq_show = rdtgroup_schemata_show, - .fflags = RFTYPE_CTRL_BASE, - }, - { - .name = "mba_MBps_event", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_mba_mbps_event_write, - .seq_show = rdtgroup_mba_mbps_event_show, - }, - { - .name = "mode", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_mode_write, - .seq_show = rdtgroup_mode_show, - .fflags = RFTYPE_CTRL_BASE, - }, - { - .name = "size", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdtgroup_size_show, - .fflags = RFTYPE_CTRL_BASE, - }, - { - .name = "sparse_masks", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_has_sparse_bitmasks_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "ctrl_hw_id", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdtgroup_closid_show, - .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, - }, - -}; - -static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) -{ - struct rftype *rfts, *rft; - int ret, len; - - rfts = res_common_files; - len = ARRAY_SIZE(res_common_files); - - lockdep_assert_held(&rdtgroup_mutex); - - if (resctrl_debug) - fflags |= RFTYPE_DEBUG; - - for (rft = rfts; rft < rfts + len; rft++) { - if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { - ret = rdtgroup_add_file(kn, rft); - if (ret) - goto error; - } - } - - return 0; -error: - pr_warn("Failed to add %s, err=%d\n", rft->name, ret); - while (--rft >= rfts) { - if ((fflags & rft->fflags) == rft->fflags) - kernfs_remove_by_name(kn, rft->name); - } - return ret; -} - -static struct rftype *rdtgroup_get_rftype_by_name(const char *name) -{ - struct rftype *rfts, *rft; - int len; - - rfts = res_common_files; - len = ARRAY_SIZE(res_common_files); - - for (rft = rfts; rft < rfts + len; rft++) { - if (!strcmp(rft->name, name)) - return rft; - } - - return NULL; -} - -static void thread_throttle_mode_init(void) -{ - enum membw_throttle_mode throttle_mode 
= THREAD_THROTTLE_UNDEFINED; - struct rdt_resource *r_mba, *r_smba; - - r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - if (r_mba->alloc_capable && - r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) - throttle_mode = r_mba->membw.throttle_mode; - - r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA); - if (r_smba->alloc_capable && - r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) - throttle_mode = r_smba->membw.throttle_mode; - - if (throttle_mode == THREAD_THROTTLE_UNDEFINED) - return; - - resctrl_file_fflags_init("thread_throttle_mode", - RFTYPE_CTRL_INFO | RFTYPE_RES_MB); -} - -void resctrl_file_fflags_init(const char *config, unsigned long fflags) -{ - struct rftype *rft; - - rft = rdtgroup_get_rftype_by_name(config); - if (rft) - rft->fflags = fflags; -} - -/** - * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file - * @r: The resource group with which the file is associated. - * @name: Name of the file - * - * The permissions of named resctrl file, directory, or link are modified - * to not allow read, write, or execute by any user. - * - * WARNING: This function is intended to communicate to the user that the - * resctrl file has been locked down - that it is not relevant to the - * particular state the system finds itself in. It should not be relied - * on to protect from user access because after the file's permissions - * are restricted the user can still change the permissions using chmod - * from the command line. - * - * Return: 0 on success, <0 on failure. - */ -int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) -{ - struct iattr iattr = {.ia_valid = ATTR_MODE,}; - struct kernfs_node *kn; - int ret = 0; - - kn = kernfs_find_and_get_ns(r->kn, name, NULL); - if (!kn) - return -ENOENT; - - switch (kernfs_type(kn)) { - case KERNFS_DIR: - iattr.ia_mode = S_IFDIR; - break; - case KERNFS_FILE: - iattr.ia_mode = S_IFREG; - break; - case KERNFS_LINK: - iattr.ia_mode = S_IFLNK; - break; - } - - ret = kernfs_setattr(kn, &iattr); - kernfs_put(kn); - return ret; -} - -/** - * rdtgroup_kn_mode_restore - Restore user access to named resctrl file - * @r: The resource group with which the file is associated. - * @name: Name of the file - * @mask: Mask of permissions that should be restored - * - * Restore the permissions of the named file. If @name is a directory the - * permissions of its parent will be used. - * - * Return: 0 on success, <0 on failure. 
- */ -int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, - umode_t mask) -{ - struct iattr iattr = {.ia_valid = ATTR_MODE,}; - struct kernfs_node *kn, *parent; - struct rftype *rfts, *rft; - int ret, len; - - rfts = res_common_files; - len = ARRAY_SIZE(res_common_files); - - for (rft = rfts; rft < rfts + len; rft++) { - if (!strcmp(rft->name, name)) - iattr.ia_mode = rft->mode & mask; - } - - kn = kernfs_find_and_get_ns(r->kn, name, NULL); - if (!kn) - return -ENOENT; - - switch (kernfs_type(kn)) { - case KERNFS_DIR: - parent = kernfs_get_parent(kn); - if (parent) { - iattr.ia_mode |= parent->mode; - kernfs_put(parent); - } - iattr.ia_mode |= S_IFDIR; - break; - case KERNFS_FILE: - iattr.ia_mode |= S_IFREG; - break; - case KERNFS_LINK: - iattr.ia_mode |= S_IFLNK; - break; - } - - ret = kernfs_setattr(kn, &iattr); - kernfs_put(kn); - return ret; -} - -static int rdtgroup_mkdir_info_resdir(void *priv, char *name, - unsigned long fflags) -{ - struct kernfs_node *kn_subdir; - int ret; - - kn_subdir = kernfs_create_dir(kn_info, name, - kn_info->mode, priv); - if (IS_ERR(kn_subdir)) - return PTR_ERR(kn_subdir); - - ret = rdtgroup_kn_set_ugid(kn_subdir); - if (ret) - return ret; - - ret = rdtgroup_add_files(kn_subdir, fflags); - if (!ret) - kernfs_activate(kn_subdir); - - return ret; -} - -static unsigned long fflags_from_resource(struct rdt_resource *r) -{ - switch (r->rid) { - case RDT_RESOURCE_L3: - case RDT_RESOURCE_L2: - return RFTYPE_RES_CACHE; - case RDT_RESOURCE_MBA: - case RDT_RESOURCE_SMBA: - return RFTYPE_RES_MB; - } - - return WARN_ON_ONCE(1); -} - -static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) -{ - struct resctrl_schema *s; - struct rdt_resource *r; - unsigned long fflags; - char name[32]; - int ret; - - /* create the directory */ - kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); - if (IS_ERR(kn_info)) - return PTR_ERR(kn_info); - - ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO); - if (ret) - goto out_destroy; - - /* loop over enabled controls, these are all alloc_capable */ - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO; - ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); - if (ret) - goto out_destroy; - } - - for_each_mon_capable_rdt_resource(r) { - fflags = fflags_from_resource(r) | RFTYPE_MON_INFO; - sprintf(name, "%s_MON", r->name); - ret = rdtgroup_mkdir_info_resdir(r, name, fflags); - if (ret) - goto out_destroy; - } - - ret = rdtgroup_kn_set_ugid(kn_info); - if (ret) - goto out_destroy; - - kernfs_activate(kn_info); - - return 0; - -out_destroy: - kernfs_remove(kn_info); - return ret; -} - -static int -mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, - char *name, struct kernfs_node **dest_kn) -{ - struct kernfs_node *kn; - int ret; - - /* create the directory */ - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - if (dest_kn) - *dest_kn = kn; - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; - - kernfs_activate(kn); - - return 0; - -out_destroy: - kernfs_remove(kn); - return ret; -} - static void l3_qos_cfg_update(void *arg) { bool *enable = arg; @@ -2337,11 +129,6 @@ static void l2_qos_cfg_update(void *arg) wrmsrq(MSR_IA32_L2_QOS_CFG, *enable ? 
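/*
 * Illustration: on a system with L3 cache allocation, memory bandwidth
 * allocation and L3 monitoring, rdtgroup_create_info_dir() above ends up
 * with a hierarchy along the lines of
 *
 *   info/last_cmd_status
 *   info/L3/        (cbm_mask, min_cbm_bits, num_closids, ...)
 *   info/MB/        (bandwidth_gran, delay_linear, min_bandwidth, ...)
 *   info/L3_MON/    (mon_features, num_rmids, max_threshold_occupancy, ...)
 */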
L2_QOS_CDP_ENABLE : 0ULL); } -static inline bool is_mba_linear(void) -{ - return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear; -} - static int set_cache_qos_cfg(int level, bool enable) { void (*update)(void *arg); @@ -2397,76 +184,6 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r) l3_qos_cfg_update(&hw_res->cdp_enabled); } -static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d) -{ - u32 num_closid = resctrl_arch_get_num_closid(r); - int cpu = cpumask_any(&d->hdr.cpu_mask); - int i; - - d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val), - GFP_KERNEL, cpu_to_node(cpu)); - if (!d->mbps_val) - return -ENOMEM; - - for (i = 0; i < num_closid; i++) - d->mbps_val[i] = MBA_MAX_MBPS; - - return 0; -} - -static void mba_sc_domain_destroy(struct rdt_resource *r, - struct rdt_ctrl_domain *d) -{ - kfree(d->mbps_val); - d->mbps_val = NULL; -} - -/* - * MBA software controller is supported only if - * MBM is supported and MBA is in linear scale, - * and the MBM monitor scope is the same as MBA - * control scope. - */ -static bool supports_mba_mbps(void) -{ - struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - - return (resctrl_is_mbm_enabled() && - r->alloc_capable && is_mba_linear() && - r->ctrl_scope == rmbm->mon_scope); -} - -/* - * Enable or disable the MBA software controller - * which helps user specify bandwidth in MBps. - */ -static int set_mba_sc(bool mba_sc) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - u32 num_closid = resctrl_arch_get_num_closid(r); - struct rdt_ctrl_domain *d; - unsigned long fflags; - int i; - - if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) - return -EINVAL; - - r->membw.mba_sc = mba_sc; - - rdtgroup_default.mba_mbps_event = mba_mbps_default_event; - - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - for (i = 0; i < num_closid; i++) - d->mbps_val[i] = MBA_MAX_MBPS; - } - - fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0; - resctrl_file_fflags_init("mba_MBps_event", fflags); - - return 0; -} - static int cdp_enable(int level) { struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl; @@ -2507,419 +224,9 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable) return 0; } -/* - * We don't allow rdtgroup directories to be created anywhere - * except the root directory. Thus when looking for the rdtgroup - * structure for a kernfs node we are either looking at a directory, - * in which case the rdtgroup structure is pointed at by the "priv" - * field, otherwise we have a file, and need only look to the parent - * to find the rdtgroup. - */ -static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) -{ - if (kernfs_type(kn) == KERNFS_DIR) { - /* - * All the resource directories use "kn->priv" - * to point to the "struct rdtgroup" for the - * resource. "info" and its subdirectories don't - * have rdtgroup structures, so return NULL here. 
- */ - if (kn == kn_info || - rcu_access_pointer(kn->__parent) == kn_info) - return NULL; - else - return kn->priv; - } else { - return rdt_kn_parent_priv(kn); - } -} - -static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn) -{ - atomic_inc(&rdtgrp->waitcount); - kernfs_break_active_protection(kn); -} - -static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn) -{ - if (atomic_dec_and_test(&rdtgrp->waitcount) && - (rdtgrp->flags & RDT_DELETED)) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) - rdtgroup_pseudo_lock_remove(rdtgrp); - kernfs_unbreak_active_protection(kn); - rdtgroup_remove(rdtgrp); - } else { - kernfs_unbreak_active_protection(kn); - } -} - -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) -{ - struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); - - if (!rdtgrp) - return NULL; - - rdtgroup_kn_get(rdtgrp, kn); - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - /* Was this group deleted while we waited? */ - if (rdtgrp->flags & RDT_DELETED) - return NULL; - - return rdtgrp; -} - -void rdtgroup_kn_unlock(struct kernfs_node *kn) -{ - struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); - - if (!rdtgrp) - return; - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - rdtgroup_kn_put(rdtgrp, kn); -} - -static int mkdir_mondata_all(struct kernfs_node *parent_kn, - struct rdtgroup *prgrp, - struct kernfs_node **mon_data_kn); - -static void rdt_disable_ctx(void) -{ - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); - set_mba_sc(false); - - resctrl_debug = false; -} - -static int rdt_enable_ctx(struct rdt_fs_context *ctx) -{ - int ret = 0; - - if (ctx->enable_cdpl2) { - ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); - if (ret) - goto out_done; - } - - if (ctx->enable_cdpl3) { - ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); - if (ret) - goto out_cdpl2; - } - - if (ctx->enable_mba_mbps) { - ret = set_mba_sc(true); - if (ret) - goto out_cdpl3; - } - - if (ctx->enable_debug) - resctrl_debug = true; - - return 0; - -out_cdpl3: - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); -out_cdpl2: - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); -out_done: - return ret; -} - -static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type) -{ - struct resctrl_schema *s; - const char *suffix = ""; - int ret, cl; - - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - - s->res = r; - s->num_closid = resctrl_arch_get_num_closid(r); - if (resctrl_arch_get_cdp_enabled(r->rid)) - s->num_closid /= 2; - - s->conf_type = type; - switch (type) { - case CDP_CODE: - suffix = "CODE"; - break; - case CDP_DATA: - suffix = "DATA"; - break; - case CDP_NONE: - suffix = ""; - break; - } - - ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix); - if (ret >= sizeof(s->name)) { - kfree(s); - return -EINVAL; - } - - cl = strlen(s->name); - - /* - * If CDP is supported by this resource, but not enabled, - * include the suffix. This ensures the tabular format of the - * schemata file does not change between mounts of the filesystem. 
- */ - if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) - cl += 4; - - if (cl > max_name_width) - max_name_width = cl; - - switch (r->schema_fmt) { - case RESCTRL_SCHEMA_BITMAP: - s->fmt_str = "%d=%x"; - break; - case RESCTRL_SCHEMA_RANGE: - s->fmt_str = "%d=%u"; - break; - } - - if (WARN_ON_ONCE(!s->fmt_str)) { - kfree(s); - return -EINVAL; - } - - INIT_LIST_HEAD(&s->list); - list_add(&s->list, &resctrl_schema_all); - - return 0; -} - -static int schemata_list_create(void) -{ - struct rdt_resource *r; - int ret = 0; - - for_each_alloc_capable_rdt_resource(r) { - if (resctrl_arch_get_cdp_enabled(r->rid)) { - ret = schemata_list_add(r, CDP_CODE); - if (ret) - break; - - ret = schemata_list_add(r, CDP_DATA); - } else { - ret = schemata_list_add(r, CDP_NONE); - } - - if (ret) - break; - } - - return ret; -} - -static void schemata_list_destroy(void) -{ - struct resctrl_schema *s, *tmp; - - list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) { - list_del(&s->list); - kfree(s); - } -} - -static int rdt_get_tree(struct fs_context *fc) -{ - struct rdt_fs_context *ctx = rdt_fc2context(fc); - unsigned long flags = RFTYPE_CTRL_BASE; - struct rdt_mon_domain *dom; - struct rdt_resource *r; - int ret; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - /* - * resctrl file system can only be mounted once. - */ - if (resctrl_mounted) { - ret = -EBUSY; - goto out; - } - - ret = rdtgroup_setup_root(ctx); - if (ret) - goto out; - - ret = rdt_enable_ctx(ctx); - if (ret) - goto out_root; - - ret = schemata_list_create(); - if (ret) { - schemata_list_destroy(); - goto out_ctx; - } - - closid_init(); - - if (resctrl_arch_mon_capable()) - flags |= RFTYPE_MON; - - ret = rdtgroup_add_files(rdtgroup_default.kn, flags); - if (ret) - goto out_schemata_free; - - kernfs_activate(rdtgroup_default.kn); - - ret = rdtgroup_create_info_dir(rdtgroup_default.kn); - if (ret < 0) - goto out_schemata_free; - - if (resctrl_arch_mon_capable()) { - ret = mongroup_create_dir(rdtgroup_default.kn, - &rdtgroup_default, "mon_groups", - &kn_mongrp); - if (ret < 0) - goto out_info; - - ret = mkdir_mondata_all(rdtgroup_default.kn, - &rdtgroup_default, &kn_mondata); - if (ret < 0) - goto out_mongrp; - rdtgroup_default.mon.mon_data_kn = kn_mondata; - } - - ret = rdt_pseudo_lock_init(); - if (ret) - goto out_mondata; - - ret = kernfs_get_tree(fc); - if (ret < 0) - goto out_psl; - - if (resctrl_arch_alloc_capable()) - resctrl_arch_enable_alloc(); - if (resctrl_arch_mon_capable()) - resctrl_arch_enable_mon(); - - if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) - resctrl_mounted = true; - - if (resctrl_is_mbm_enabled()) { - r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - list_for_each_entry(dom, &r->mon_domains, hdr.list) - mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, - RESCTRL_PICK_ANY_CPU); - } - - goto out; - -out_psl: - rdt_pseudo_lock_release(); -out_mondata: - if (resctrl_arch_mon_capable()) - kernfs_remove(kn_mondata); -out_mongrp: - if (resctrl_arch_mon_capable()) - kernfs_remove(kn_mongrp); -out_info: - kernfs_remove(kn_info); -out_schemata_free: - schemata_list_destroy(); -out_ctx: - rdt_disable_ctx(); -out_root: - rdtgroup_destroy_root(); -out: - rdt_last_cmd_clear(); - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - return ret; -} - -enum rdt_param { - Opt_cdp, - Opt_cdpl2, - Opt_mba_mbps, - Opt_debug, - nr__rdt_params -}; - -static const struct fs_parameter_spec rdt_fs_parameters[] = { - fsparam_flag("cdp", Opt_cdp), - fsparam_flag("cdpl2", Opt_cdpl2), - 
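/*
 * Illustration: the fmt_str chosen above means bitmap resources print
 * "<id>=<hex>" and range resources "<id>=<decimal>", so with CDP enabled
 * on L3 a schemata file carries lines such as
 *
 *   L3CODE:0=ffff;1=ffff
 *   L3DATA:0=ffff;1=ffff
 *   MB:0=100;1=100
 */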
fsparam_flag("mba_MBps", Opt_mba_mbps), - fsparam_flag("debug", Opt_debug), - {} -}; - -static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) -{ - struct rdt_fs_context *ctx = rdt_fc2context(fc); - struct fs_parse_result result; - const char *msg; - int opt; - - opt = fs_parse(fc, rdt_fs_parameters, param, &result); - if (opt < 0) - return opt; - - switch (opt) { - case Opt_cdp: - ctx->enable_cdpl3 = true; - return 0; - case Opt_cdpl2: - ctx->enable_cdpl2 = true; - return 0; - case Opt_mba_mbps: - msg = "mba_MBps requires MBM and linear scale MBA at L3 scope"; - if (!supports_mba_mbps()) - return invalfc(fc, msg); - ctx->enable_mba_mbps = true; - return 0; - case Opt_debug: - ctx->enable_debug = true; - return 0; - } - - return -EINVAL; -} - -static void rdt_fs_context_free(struct fs_context *fc) +bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) { - struct rdt_fs_context *ctx = rdt_fc2context(fc); - - kernfs_free_fs_context(fc); - kfree(ctx); -} - -static const struct fs_context_operations rdt_fs_context_ops = { - .free = rdt_fs_context_free, - .parse_param = rdt_parse_param, - .get_tree = rdt_get_tree, -}; - -static int rdt_init_fs_context(struct fs_context *fc) -{ - struct rdt_fs_context *ctx; - - ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - - ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; - fc->fs_private = &ctx->kfc; - fc->ops = &rdt_fs_context_ops; - put_user_ns(fc->user_ns); - fc->user_ns = get_user_ns(&init_user_ns); - fc->global = true; - return 0; + return rdt_resources_all[l].cdp_enabled; } void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) @@ -2953,1460 +260,3 @@ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) return; } - -/* - * Move tasks from one to the other group. If @from is NULL, then all tasks - * in the systems are moved unconditionally (used for teardown). - * - * If @mask is not NULL the cpus on which moved tasks are running are set - * in that mask so the update smp function call is restricted to affected - * cpus. - */ -static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, - struct cpumask *mask) -{ - struct task_struct *p, *t; - - read_lock(&tasklist_lock); - for_each_process_thread(p, t) { - if (!from || is_closid_match(t, from) || - is_rmid_match(t, from)) { - resctrl_arch_set_closid_rmid(t, to->closid, - to->mon.rmid); - - /* - * Order the closid/rmid stores above before the loads - * in task_curr(). This pairs with the full barrier - * between the rq->curr update and resctrl_sched_in() - * during context switch. - */ - smp_mb(); - - /* - * If the task is on a CPU, set the CPU in the mask. - * The detection is inaccurate as tasks might move or - * schedule before the smp function call takes place. - * In such a case the function call is pointless, but - * there is no other side effect. - */ - if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) - cpumask_set_cpu(task_cpu(t), mask); - } - } - read_unlock(&tasklist_lock); -} - -static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) -{ - struct rdtgroup *sentry, *stmp; - struct list_head *head; - - head = &rdtgrp->mon.crdtgrp_list; - list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { - free_rmid(sentry->closid, sentry->mon.rmid); - list_del(&sentry->mon.crdtgrp_list); - - if (atomic_read(&sentry->waitcount) != 0) - sentry->flags = RDT_DELETED; - else - rdtgroup_remove(sentry); - } -} - -/* - * Forcibly remove all of subdirectories under root. 
- */ -static void rmdir_all_sub(void) -{ - struct rdtgroup *rdtgrp, *tmp; - - /* Move all tasks to the default resource group */ - rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); - - list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { - /* Free any child rmids */ - free_all_child_rdtgrp(rdtgrp); - - /* Remove each rdtgroup other than root */ - if (rdtgrp == &rdtgroup_default) - continue; - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) - rdtgroup_pseudo_lock_remove(rdtgrp); - - /* - * Give any CPUs back to the default group. We cannot copy - * cpu_online_mask because a CPU might have executed the - * offline callback already, but is still marked online. - */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - - kernfs_remove(rdtgrp->kn); - list_del(&rdtgrp->rdtgroup_list); - - if (atomic_read(&rdtgrp->waitcount) != 0) - rdtgrp->flags = RDT_DELETED; - else - rdtgroup_remove(rdtgrp); - } - /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ - update_closid_rmid(cpu_online_mask, &rdtgroup_default); - - kernfs_remove(kn_info); - kernfs_remove(kn_mongrp); - kernfs_remove(kn_mondata); -} - -static void rdt_kill_sb(struct super_block *sb) -{ - struct rdt_resource *r; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - rdt_disable_ctx(); - - /* Put everything back to default values. */ - for_each_alloc_capable_rdt_resource(r) - resctrl_arch_reset_all_ctrls(r); - - rmdir_all_sub(); - rdt_pseudo_lock_release(); - rdtgroup_default.mode = RDT_MODE_SHAREABLE; - schemata_list_destroy(); - rdtgroup_destroy_root(); - if (resctrl_arch_alloc_capable()) - resctrl_arch_disable_alloc(); - if (resctrl_arch_mon_capable()) - resctrl_arch_disable_mon(); - resctrl_mounted = false; - kernfs_kill_sb(sb); - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); -} - -static struct file_system_type rdt_fs_type = { - .name = "resctrl", - .init_fs_context = rdt_init_fs_context, - .parameters = rdt_fs_parameters, - .kill_sb = rdt_kill_sb, -}; - -static int mon_addfile(struct kernfs_node *parent_kn, const char *name, - void *priv) -{ - struct kernfs_node *kn; - int ret = 0; - - kn = __kernfs_create_file(parent_kn, name, 0444, - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, - &kf_mondata_ops, priv, NULL, NULL); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) { - kernfs_remove(kn); - return ret; - } - - return ret; -} - -static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname) -{ - struct kernfs_node *kn; - - kn = kernfs_find_and_get(pkn, name); - if (!kn) - return; - kernfs_put(kn); - - if (kn->dir.subdirs <= 1) - kernfs_remove(kn); - else - kernfs_remove_by_name(kn, subname); -} - -/* - * Remove all subdirectories of mon_data of ctrl_mon groups - * and monitor groups for the given domain. - * Remove files and directories containing "sum" of domain data - * when last domain being summed is removed. - */ -static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_mon_domain *d) -{ - struct rdtgroup *prgrp, *crgrp; - char subname[32]; - bool snc_mode; - char name[32]; - - snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? 
d->ci->id : d->hdr.id); - if (snc_mode) - sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); - - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); - - list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) - mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname); - } -} - -static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, - struct rdt_resource *r, struct rdtgroup *prgrp, - bool do_sum) -{ - struct rmid_read rr = {0}; - union mon_data_bits priv; - struct mon_evt *mevt; - int ret; - - if (WARN_ON(list_empty(&r->evt_list))) - return -EPERM; - - priv.u.rid = r->rid; - priv.u.domid = do_sum ? d->ci->id : d->hdr.id; - priv.u.sum = do_sum; - list_for_each_entry(mevt, &r->evt_list, list) { - priv.u.evtid = mevt->evtid; - ret = mon_addfile(kn, mevt->name, priv.priv); - if (ret) - return ret; - - if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) - mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); - } - - return 0; -} - -static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, - struct rdt_mon_domain *d, - struct rdt_resource *r, struct rdtgroup *prgrp) -{ - struct kernfs_node *kn, *ckn; - char name[32]; - bool snc_mode; - int ret = 0; - - lockdep_assert_held(&rdtgroup_mutex); - - snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); - kn = kernfs_find_and_get(parent_kn, name); - if (kn) { - /* - * rdtgroup_mutex will prevent this directory from being - * removed. No need to keep this hold. - */ - kernfs_put(kn); - } else { - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; - ret = mon_add_all_files(kn, d, r, prgrp, snc_mode); - if (ret) - goto out_destroy; - } - - if (snc_mode) { - sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id); - ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); - if (IS_ERR(ckn)) { - ret = -EINVAL; - goto out_destroy; - } - - ret = rdtgroup_kn_set_ugid(ckn); - if (ret) - goto out_destroy; - - ret = mon_add_all_files(ckn, d, r, prgrp, false); - if (ret) - goto out_destroy; - } - - kernfs_activate(kn); - return 0; - -out_destroy: - kernfs_remove(kn); - return ret; -} - -/* - * Add all subdirectories of mon_data for "ctrl_mon" groups - * and "monitor" groups with given domain id. - */ -static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_mon_domain *d) -{ - struct kernfs_node *parent_kn; - struct rdtgroup *prgrp, *crgrp; - struct list_head *head; - - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - parent_kn = prgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, prgrp); - - head = &prgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { - parent_kn = crgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, crgrp); - } - } -} - -static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, - struct rdt_resource *r, - struct rdtgroup *prgrp) -{ - struct rdt_mon_domain *dom; - int ret; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - list_for_each_entry(dom, &r->mon_domains, hdr.list) { - ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); - if (ret) - return ret; - } - - return 0; -} - -/* - * This creates a directory mon_data which contains the monitored data. 
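/*
 * Illustration: with Sub-NUMA Clustering the per-cache directory is named
 * after the L3 cache id and the per-node data sits in "mon_sub_*"
 * directories underneath it, e.g.
 *
 *   mon_data/mon_L3_00/mon_sub_L3_00
 *   mon_data/mon_L3_00/mon_sub_L3_01
 *
 * while without SNC only the flat mon_L3_<id> directories exist.
 */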
- * - * mon_data has one directory for each domain which are named - * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data - * with L3 domain looks as below: - * ./mon_data: - * mon_L3_00 - * mon_L3_01 - * mon_L3_02 - * ... - * - * Each domain directory has one file per event: - * ./mon_L3_00/: - * llc_occupancy - * - */ -static int mkdir_mondata_all(struct kernfs_node *parent_kn, - struct rdtgroup *prgrp, - struct kernfs_node **dest_kn) -{ - struct rdt_resource *r; - struct kernfs_node *kn; - int ret; - - /* - * Create the mon_data directory first. - */ - ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn); - if (ret) - return ret; - - if (dest_kn) - *dest_kn = kn; - - /* - * Create the subdirectories for each domain. Note that all events - * in a domain like L3 are grouped into a resource whose domain is L3 - */ - for_each_mon_capable_rdt_resource(r) { - ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); - if (ret) - goto out_destroy; - } - - return 0; - -out_destroy: - kernfs_remove(kn); - return ret; -} - -/** - * cbm_ensure_valid - Enforce validity on provided CBM - * @_val: Candidate CBM - * @r: RDT resource to which the CBM belongs - * - * The provided CBM represents all cache portions available for use. This - * may be represented by a bitmap that does not consist of contiguous ones - * and thus be an invalid CBM. - * Here the provided CBM is forced to be a valid CBM by only considering - * the first set of contiguous bits as valid and clearing all bits. - * The intention here is to provide a valid default CBM with which a new - * resource group is initialized. The user can follow this with a - * modification to the CBM if the default does not satisfy the - * requirements. - */ -static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) -{ - unsigned int cbm_len = r->cache.cbm_len; - unsigned long first_bit, zero_bit; - unsigned long val = _val; - - if (!val) - return 0; - - first_bit = find_first_bit(&val, cbm_len); - zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - - /* Clear any remaining bits to ensure contiguous region */ - bitmap_clear(&val, zero_bit, cbm_len - zero_bit); - return (u32)val; -} - -/* - * Initialize cache resources per RDT domain - * - * Set the RDT domain up to start off with all usable allocations. That is, - * all shareable and unused bits. All-zero CBM is invalid. - */ -static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s, - u32 closid) -{ - enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); - enum resctrl_conf_type t = s->conf_type; - struct resctrl_staged_config *cfg; - struct rdt_resource *r = s->res; - u32 used_b = 0, unused_b = 0; - unsigned long tmp_cbm; - enum rdtgrp_mode mode; - u32 peer_ctl, ctrl_val; - int i; - - cfg = &d->staged_config[t]; - cfg->have_new_ctrl = false; - cfg->new_ctrl = r->cache.shareable_bits; - used_b = r->cache.shareable_bits; - for (i = 0; i < closids_supported(); i++) { - if (closid_allocated(i) && i != closid) { - mode = rdtgroup_mode_by_closid(i); - if (mode == RDT_MODE_PSEUDO_LOCKSETUP) - /* - * ctrl values for locksetup aren't relevant - * until the schemata is written, and the mode - * becomes RDT_MODE_PSEUDO_LOCKED. - */ - continue; - /* - * If CDP is active include peer domain's - * usage to ensure there is no overlap - * with an exclusive group. 
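/*
 * Worked example for cbm_ensure_valid() above, assuming a 16-bit CBM: a
 * candidate of 0xf0f0 has its first set bit at 4 and the next zero bit at
 * 8, so bits 8-15 are cleared and the result is the contiguous 0x00f0.
 */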
- */ - if (resctrl_arch_get_cdp_enabled(r->rid)) - peer_ctl = resctrl_arch_get_config(r, d, i, - peer_type); - else - peer_ctl = 0; - ctrl_val = resctrl_arch_get_config(r, d, i, - s->conf_type); - used_b |= ctrl_val | peer_ctl; - if (mode == RDT_MODE_SHAREABLE) - cfg->new_ctrl |= ctrl_val | peer_ctl; - } - } - if (d->plr && d->plr->cbm > 0) - used_b |= d->plr->cbm; - unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); - unused_b &= BIT_MASK(r->cache.cbm_len) - 1; - cfg->new_ctrl |= unused_b; - /* - * Force the initial CBM to be valid, user can - * modify the CBM based on system availability. - */ - cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r); - /* - * Assign the u32 CBM to an unsigned long to ensure that - * bitmap_weight() does not access out-of-bound memory. - */ - tmp_cbm = cfg->new_ctrl; - if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { - rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id); - return -ENOSPC; - } - cfg->have_new_ctrl = true; - - return 0; -} - -/* - * Initialize cache resources with default values. - * - * A new RDT group is being created on an allocation capable (CAT) - * supporting system. Set this group up to start off with all usable - * allocations. - * - * If there are no more shareable bits available on any domain then - * the entire allocation will fail. - */ -static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) -{ - struct rdt_ctrl_domain *d; - int ret; - - list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) { - ret = __init_one_rdt_domain(d, s, closid); - if (ret < 0) - return ret; - } - - return 0; -} - -/* Initialize MBA resource with default values. */ -static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) -{ - struct resctrl_staged_config *cfg; - struct rdt_ctrl_domain *d; - - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - if (is_mba_sc(r)) { - d->mbps_val[closid] = MBA_MAX_MBPS; - continue; - } - - cfg = &d->staged_config[CDP_NONE]; - cfg->new_ctrl = resctrl_get_default_ctrl(r); - cfg->have_new_ctrl = true; - } -} - -/* Initialize the RDT group's allocations. 
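/*
 * Worked example with hypothetical values for __init_one_rdt_domain()
 * above, CDP disabled: cbm_len is 16, shareable_bits is 0x0003 and one
 * other closid in exclusive mode owns 0xff00. Then used_b = 0xff03,
 * unused_b = 0x00fc, and new_ctrl = 0x0003 | 0x00fc = 0x00ff (the
 * exclusive bits are not added since that mode is not shareable). 0x00ff
 * is already contiguous, so cbm_ensure_valid() leaves it unchanged.
 */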
*/ -static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) -{ - struct resctrl_schema *s; - struct rdt_resource *r; - int ret = 0; - - rdt_staged_configs_clear(); - - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - if (r->rid == RDT_RESOURCE_MBA || - r->rid == RDT_RESOURCE_SMBA) { - rdtgroup_init_mba(r, rdtgrp->closid); - if (is_mba_sc(r)) - continue; - } else { - ret = rdtgroup_init_cat(s, rdtgrp->closid); - if (ret < 0) - goto out; - } - - ret = resctrl_arch_update_domains(r, rdtgrp->closid); - if (ret < 0) { - rdt_last_cmd_puts("Failed to initialize allocations\n"); - goto out; - } - - } - - rdtgrp->mode = RDT_MODE_SHAREABLE; - -out: - rdt_staged_configs_clear(); - return ret; -} - -static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) -{ - int ret; - - if (!resctrl_arch_mon_capable()) - return 0; - - ret = alloc_rmid(rdtgrp->closid); - if (ret < 0) { - rdt_last_cmd_puts("Out of RMIDs\n"); - return ret; - } - rdtgrp->mon.rmid = ret; - - ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); - if (ret) { - rdt_last_cmd_puts("kernfs subdir error\n"); - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - return ret; - } - - return 0; -} - -static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) -{ - if (resctrl_arch_mon_capable()) - free_rmid(rgrp->closid, rgrp->mon.rmid); -} - -/* - * We allow creating mon groups only with in a directory called "mon_groups" - * which is present in every ctrl_mon group. Check if this is a valid - * "mon_groups" directory. - * - * 1. The directory should be named "mon_groups". - * 2. The mon group itself should "not" be named "mon_groups". - * This makes sure "mon_groups" directory always has a ctrl_mon group - * as parent. - */ -static bool is_mon_groups(struct kernfs_node *kn, const char *name) -{ - return (!strcmp(rdt_kn_name(kn), "mon_groups") && - strcmp(name, "mon_groups")); -} - -static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, - const char *name, umode_t mode, - enum rdt_group_type rtype, struct rdtgroup **r) -{ - struct rdtgroup *prdtgrp, *rdtgrp; - unsigned long files = 0; - struct kernfs_node *kn; - int ret; - - prdtgrp = rdtgroup_kn_lock_live(parent_kn); - if (!prdtgrp) { - ret = -ENODEV; - goto out_unlock; - } - - /* - * Check that the parent directory for a monitor group is a "mon_groups" - * directory. - */ - if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) { - ret = -EPERM; - goto out_unlock; - } - - if (rtype == RDTMON_GROUP && - (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { - ret = -EINVAL; - rdt_last_cmd_puts("Pseudo-locking in progress\n"); - goto out_unlock; - } - - /* allocate the rdtgroup. */ - rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); - if (!rdtgrp) { - ret = -ENOSPC; - rdt_last_cmd_puts("Kernel out of memory\n"); - goto out_unlock; - } - *r = rdtgrp; - rdtgrp->mon.parent = prdtgrp; - rdtgrp->type = rtype; - INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); - - /* kernfs creates the directory for rdtgrp */ - kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - rdt_last_cmd_puts("kernfs create error\n"); - goto out_free_rgrp; - } - rdtgrp->kn = kn; - - /* - * kernfs_remove() will drop the reference count on "kn" which - * will free it. But we still need it to stick around for the - * rdtgroup_kn_unlock(kn) call. Take one extra reference here, - * which will be dropped by kernfs_put() in rdtgroup_remove(). 
- */ - kernfs_get(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) { - rdt_last_cmd_puts("kernfs perm error\n"); - goto out_destroy; - } - - if (rtype == RDTCTRL_GROUP) { - files = RFTYPE_BASE | RFTYPE_CTRL; - if (resctrl_arch_mon_capable()) - files |= RFTYPE_MON; - } else { - files = RFTYPE_BASE | RFTYPE_MON; - } - - ret = rdtgroup_add_files(kn, files); - if (ret) { - rdt_last_cmd_puts("kernfs fill error\n"); - goto out_destroy; - } - - /* - * The caller unlocks the parent_kn upon success. - */ - return 0; - -out_destroy: - kernfs_put(rdtgrp->kn); - kernfs_remove(rdtgrp->kn); -out_free_rgrp: - kfree(rdtgrp); -out_unlock: - rdtgroup_kn_unlock(parent_kn); - return ret; -} - -static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) -{ - kernfs_remove(rgrp->kn); - rdtgroup_remove(rgrp); -} - -/* - * Create a monitor group under "mon_groups" directory of a control - * and monitor group(ctrl_mon). This is a resource group - * to monitor a subset of tasks and cpus in its parent ctrl_mon group. - */ -static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, - const char *name, umode_t mode) -{ - struct rdtgroup *rdtgrp, *prgrp; - int ret; - - ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp); - if (ret) - return ret; - - prgrp = rdtgrp->mon.parent; - rdtgrp->closid = prgrp->closid; - - ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); - if (ret) { - mkdir_rdt_prepare_clean(rdtgrp); - goto out_unlock; - } - - kernfs_activate(rdtgrp->kn); - - /* - * Add the rdtgrp to the list of rdtgrps the parent - * ctrl_mon group has to track. - */ - list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); - -out_unlock: - rdtgroup_kn_unlock(parent_kn); - return ret; -} - -/* - * These are rdtgroups created under the root directory. Can be used - * to allocate and monitor resources. - */ -static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, - const char *name, umode_t mode) -{ - struct rdtgroup *rdtgrp; - struct kernfs_node *kn; - u32 closid; - int ret; - - ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp); - if (ret) - return ret; - - kn = rdtgrp->kn; - ret = closid_alloc(); - if (ret < 0) { - rdt_last_cmd_puts("Out of CLOSIDs\n"); - goto out_common_fail; - } - closid = ret; - ret = 0; - - rdtgrp->closid = closid; - - ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); - if (ret) - goto out_closid_free; - - kernfs_activate(rdtgrp->kn); - - ret = rdtgroup_init_alloc(rdtgrp); - if (ret < 0) - goto out_rmid_free; - - list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); - - if (resctrl_arch_mon_capable()) { - /* - * Create an empty mon_groups directory to hold the subset - * of tasks and cpus to monitor. - */ - ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL); - if (ret) { - rdt_last_cmd_puts("kernfs subdir error\n"); - goto out_del_list; - } - if (is_mba_sc(NULL)) - rdtgrp->mba_mbps_event = mba_mbps_default_event; - } - - goto out_unlock; - -out_del_list: - list_del(&rdtgrp->rdtgroup_list); -out_rmid_free: - mkdir_rdt_prepare_rmid_free(rdtgrp); -out_closid_free: - closid_free(closid); -out_common_fail: - mkdir_rdt_prepare_clean(rdtgrp); -out_unlock: - rdtgroup_kn_unlock(parent_kn); - return ret; -} - -static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) -{ - /* Do not accept '\n' to avoid unparsable situation. 
*/ - if (strchr(name, '\n')) - return -EINVAL; - - /* - * If the parent directory is the root directory and RDT - * allocation is supported, add a control and monitoring - * subdirectory - */ - if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) - return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); - - /* Else, attempt to add a monitoring subdirectory. */ - if (resctrl_arch_mon_capable()) - return rdtgroup_mkdir_mon(parent_kn, name, mode); - - return -EPERM; -} - -static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) -{ - struct rdtgroup *prdtgrp = rdtgrp->mon.parent; - u32 closid, rmid; - int cpu; - - /* Give any tasks back to the parent group */ - rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); - - /* - * Update per cpu closid/rmid of the moved CPUs first. - * Note: the closid will not change, but the arch code still needs it. - */ - closid = prdtgrp->closid; - rmid = prdtgrp->mon.rmid; - for_each_cpu(cpu, &rdtgrp->cpu_mask) - resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); - - /* - * Update the MSR on moved CPUs and CPUs which have moved - * task running on them. - */ - cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - update_closid_rmid(tmpmask, NULL); - - rdtgrp->flags = RDT_DELETED; - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - - /* - * Remove the rdtgrp from the parent ctrl_mon group's list - */ - WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); - list_del(&rdtgrp->mon.crdtgrp_list); - - kernfs_remove(rdtgrp->kn); - - return 0; -} - -static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) -{ - rdtgrp->flags = RDT_DELETED; - list_del(&rdtgrp->rdtgroup_list); - - kernfs_remove(rdtgrp->kn); - return 0; -} - -static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) -{ - u32 closid, rmid; - int cpu; - - /* Give any tasks back to the default group */ - rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); - - /* Give any CPUs back to the default group */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - - /* Update per cpu closid and rmid of the moved CPUs first */ - closid = rdtgroup_default.closid; - rmid = rdtgroup_default.mon.rmid; - for_each_cpu(cpu, &rdtgrp->cpu_mask) - resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); - - /* - * Update the MSR on moved CPUs and CPUs which have moved - * task running on them. - */ - cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - update_closid_rmid(tmpmask, NULL); - - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - closid_free(rdtgrp->closid); - - rdtgroup_ctrl_remove(rdtgrp); - - /* - * Free all the child monitor group rmids. - */ - free_all_child_rdtgrp(rdtgrp); - - return 0; -} - -static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn) -{ - /* - * Valid within the RCU section it was obtained or while rdtgroup_mutex - * is held. - */ - return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex)); -} - -static int rdtgroup_rmdir(struct kernfs_node *kn) -{ - struct kernfs_node *parent_kn; - struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; - int ret = 0; - - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; - - rdtgrp = rdtgroup_kn_lock_live(kn); - if (!rdtgrp) { - ret = -EPERM; - goto out; - } - parent_kn = rdt_kn_parent(kn); - - /* - * If the rdtgroup is a ctrl_mon group and parent directory - * is the root directory, remove the ctrl_mon group. - * - * If the rdtgroup is a mon group and parent directory - * is a valid "mon_groups" directory, remove the mon group. 
- */ - if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn && - rdtgrp != &rdtgroup_default) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - ret = rdtgroup_ctrl_remove(rdtgrp); - } else { - ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); - } - } else if (rdtgrp->type == RDTMON_GROUP && - is_mon_groups(parent_kn, rdt_kn_name(kn))) { - ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); - } else { - ret = -EPERM; - } - -out: - rdtgroup_kn_unlock(kn); - free_cpumask_var(tmpmask); - return ret; -} - -/** - * mongrp_reparent() - replace parent CTRL_MON group of a MON group - * @rdtgrp: the MON group whose parent should be replaced - * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp - * @cpus: cpumask provided by the caller for use during this call - * - * Replaces the parent CTRL_MON group for a MON group, resulting in all member - * tasks' CLOSID immediately changing to that of the new parent group. - * Monitoring data for the group is unaffected by this operation. - */ -static void mongrp_reparent(struct rdtgroup *rdtgrp, - struct rdtgroup *new_prdtgrp, - cpumask_var_t cpus) -{ - struct rdtgroup *prdtgrp = rdtgrp->mon.parent; - - WARN_ON(rdtgrp->type != RDTMON_GROUP); - WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP); - - /* Nothing to do when simply renaming a MON group. */ - if (prdtgrp == new_prdtgrp) - return; - - WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); - list_move_tail(&rdtgrp->mon.crdtgrp_list, - &new_prdtgrp->mon.crdtgrp_list); - - rdtgrp->mon.parent = new_prdtgrp; - rdtgrp->closid = new_prdtgrp->closid; - - /* Propagate updated closid to all tasks in this group. */ - rdt_move_group_tasks(rdtgrp, rdtgrp, cpus); - - update_closid_rmid(cpus, NULL); -} - -static int rdtgroup_rename(struct kernfs_node *kn, - struct kernfs_node *new_parent, const char *new_name) -{ - struct kernfs_node *kn_parent; - struct rdtgroup *new_prdtgrp; - struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; - int ret; - - rdtgrp = kernfs_to_rdtgroup(kn); - new_prdtgrp = kernfs_to_rdtgroup(new_parent); - if (!rdtgrp || !new_prdtgrp) - return -ENOENT; - - /* Release both kernfs active_refs before obtaining rdtgroup mutex. */ - rdtgroup_kn_get(rdtgrp, kn); - rdtgroup_kn_get(new_prdtgrp, new_parent); - - mutex_lock(&rdtgroup_mutex); - - rdt_last_cmd_clear(); - - /* - * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if - * either kernfs_node is a file. - */ - if (kernfs_type(kn) != KERNFS_DIR || - kernfs_type(new_parent) != KERNFS_DIR) { - rdt_last_cmd_puts("Source and destination must be directories"); - ret = -EPERM; - goto out; - } - - if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) { - ret = -ENOENT; - goto out; - } - - kn_parent = rdt_kn_parent(kn); - if (rdtgrp->type != RDTMON_GROUP || !kn_parent || - !is_mon_groups(kn_parent, rdt_kn_name(kn))) { - rdt_last_cmd_puts("Source must be a MON group\n"); - ret = -EPERM; - goto out; - } - - if (!is_mon_groups(new_parent, new_name)) { - rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n"); - ret = -EPERM; - goto out; - } - - /* - * If the MON group is monitoring CPUs, the CPUs must be assigned to the - * current parent CTRL_MON group and therefore cannot be assigned to - * the new parent, making the move illegal. 
- */ - if (!cpumask_empty(&rdtgrp->cpu_mask) && - rdtgrp->mon.parent != new_prdtgrp) { - rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n"); - ret = -EPERM; - goto out; - } - - /* - * Allocate the cpumask for use in mongrp_reparent() to avoid the - * possibility of failing to allocate it after kernfs_rename() has - * succeeded. - */ - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) { - ret = -ENOMEM; - goto out; - } - - /* - * Perform all input validation and allocations needed to ensure - * mongrp_reparent() will succeed before calling kernfs_rename(), - * otherwise it would be necessary to revert this call if - * mongrp_reparent() failed. - */ - ret = kernfs_rename(kn, new_parent, new_name); - if (!ret) - mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask); - - free_cpumask_var(tmpmask); - -out: - mutex_unlock(&rdtgroup_mutex); - rdtgroup_kn_put(rdtgrp, kn); - rdtgroup_kn_put(new_prdtgrp, new_parent); - return ret; -} - -static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) -{ - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) - seq_puts(seq, ",cdp"); - - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) - seq_puts(seq, ",cdpl2"); - - if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA))) - seq_puts(seq, ",mba_MBps"); - - if (resctrl_debug) - seq_puts(seq, ",debug"); - - return 0; -} - -static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { - .mkdir = rdtgroup_mkdir, - .rmdir = rdtgroup_rmdir, - .rename = rdtgroup_rename, - .show_options = rdtgroup_show_options, -}; - -static int rdtgroup_setup_root(struct rdt_fs_context *ctx) -{ - rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, - KERNFS_ROOT_CREATE_DEACTIVATED | - KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, - &rdtgroup_default); - if (IS_ERR(rdt_root)) - return PTR_ERR(rdt_root); - - ctx->kfc.root = rdt_root; - rdtgroup_default.kn = kernfs_root_to_node(rdt_root); - - return 0; -} - -static void rdtgroup_destroy_root(void) -{ - kernfs_destroy_root(rdt_root); - rdtgroup_default.kn = NULL; -} - -static void __init rdtgroup_setup_default(void) -{ - mutex_lock(&rdtgroup_mutex); - - rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID; - rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID; - rdtgroup_default.type = RDTCTRL_GROUP; - INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); - - list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - - mutex_unlock(&rdtgroup_mutex); -} - -static void domain_destroy_mon_state(struct rdt_mon_domain *d) -{ - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); -} - -void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) -{ - mutex_lock(&rdtgroup_mutex); - - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) - mba_sc_domain_destroy(r, d); - - mutex_unlock(&rdtgroup_mutex); -} - -void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) -{ - mutex_lock(&rdtgroup_mutex); - - /* - * If resctrl is mounted, remove all the - * per domain monitor data directories. - */ - if (resctrl_mounted && resctrl_arch_mon_capable()) - rmdir_mondata_subdir_allrdtgrp(r, d); - - if (resctrl_is_mbm_enabled()) - cancel_delayed_work(&d->mbm_over); - if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { - /* - * When a package is going down, forcefully - * decrement rmid->ebusy. There is no way to know - * that the L3 was flushed and hence may lead to - * incorrect counts in rare scenarios, but leaving - * the RMID as busy creates RMID leaks if the - * package never comes back. 
- */ - __check_limbo(d, true); - cancel_delayed_work(&d->cqm_limbo); - } - - domain_destroy_mon_state(d); - - mutex_unlock(&rdtgroup_mutex); -} - -/** - * domain_setup_mon_state() - Initialise domain monitoring structures. - * @r: The resource for the newly online domain. - * @d: The newly online domain. - * - * Allocate monitor resources that belong to this domain. - * Called when the first CPU of a domain comes online, regardless of whether - * the filesystem is mounted. - * During boot this may be called before global allocations have been made by - * resctrl_mon_resource_init(). - * - * Returns 0 for success, or -ENOMEM. - */ -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) -{ - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - size_t tsize; - - if (resctrl_arch_is_llc_occupancy_enabled()) { - d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); - if (!d->rmid_busy_llc) - return -ENOMEM; - } - if (resctrl_arch_is_mbm_total_enabled()) { - tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_total) { - bitmap_free(d->rmid_busy_llc); - return -ENOMEM; - } - } - if (resctrl_arch_is_mbm_local_enabled()) { - tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_local) { - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - return -ENOMEM; - } - } - - return 0; -} - -int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) -{ - int err = 0; - - mutex_lock(&rdtgroup_mutex); - - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { - /* RDT_RESOURCE_MBA is never mon_capable */ - err = mba_sc_domain_allocate(r, d); - } - - mutex_unlock(&rdtgroup_mutex); - - return err; -} - -int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) -{ - int err; - - mutex_lock(&rdtgroup_mutex); - - err = domain_setup_mon_state(r, d); - if (err) - goto out_unlock; - - if (resctrl_is_mbm_enabled()) { - INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); - mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, - RESCTRL_PICK_ANY_CPU); - } - - if (resctrl_arch_is_llc_occupancy_enabled()) - INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); - - /* - * If the filesystem is not mounted then only the default resource group - * exists. Creation of its directories is deferred until mount time - * by rdt_get_tree() calling mkdir_mondata_all(). - * If resctrl is mounted, add per domain monitor data directories. - */ - if (resctrl_mounted && resctrl_arch_mon_capable()) - mkdir_mondata_subdir_allrdtgrp(r, d); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); - - return err; -} - -void resctrl_online_cpu(unsigned int cpu) -{ - mutex_lock(&rdtgroup_mutex); - /* The CPU is set in default rdtgroup after online. 
*/ - cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); - mutex_unlock(&rdtgroup_mutex); -} - -static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) -{ - struct rdtgroup *cr; - - list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { - if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) - break; - } -} - -static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, - struct rdt_resource *r) -{ - struct rdt_mon_domain *d; - - lockdep_assert_cpus_held(); - - list_for_each_entry(d, &r->mon_domains, hdr.list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) - return d; - } - - return NULL; -} - -void resctrl_offline_cpu(unsigned int cpu) -{ - struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; - struct rdtgroup *rdtgrp; - - mutex_lock(&rdtgroup_mutex); - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { - clear_childcpus(rdtgrp, cpu); - break; - } - } - - if (!l3->mon_capable) - goto out_unlock; - - d = get_mon_domain_from_cpu(cpu, l3); - if (d) { - if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) { - cancel_delayed_work(&d->mbm_over); - mbm_setup_overflow_handler(d, 0, cpu); - } - if (resctrl_arch_is_llc_occupancy_enabled() && - cpu == d->cqm_work_cpu && has_busy_rmid(d)) { - cancel_delayed_work(&d->cqm_limbo); - cqm_setup_limbo_handler(d, 0, cpu); - } - } - -out_unlock: - mutex_unlock(&rdtgroup_mutex); -} - -/* - * resctrl_init - resctrl filesystem initialization - * - * Setup resctrl file system including set up root, create mount point, - * register resctrl filesystem, and initialize files under root directory. - * - * Return: 0 on success or -errno - */ -int __init resctrl_init(void) -{ - int ret = 0; - - seq_buf_init(&last_cmd_status, last_cmd_status_buf, - sizeof(last_cmd_status_buf)); - - rdtgroup_setup_default(); - - thread_throttle_mode_init(); - - ret = resctrl_mon_resource_init(); - if (ret) - return ret; - - ret = sysfs_create_mount_point(fs_kobj, "resctrl"); - if (ret) { - resctrl_mon_resource_exit(); - return ret; - } - - ret = register_filesystem(&rdt_fs_type); - if (ret) - goto cleanup_mountpoint; - - /* - * Adding the resctrl debugfs directory here may not be ideal since - * it would let the resctrl debugfs directory appear on the debugfs - * filesystem before the resctrl filesystem is mounted. - * It may also be ok since that would enable debugging of RDT before - * resctrl is mounted. - * The reason why the debugfs directory is created here and not in - * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and - * during the debugfs directory creation also &sb->s_type->i_mutex_key - * (the lockdep class of inode->i_rwsem). Other filesystem - * interactions (eg. SyS_getdents) have the lock ordering: - * &sb->s_type->i_mutex_key --> &mm->mmap_lock - * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex - * is taken, thus creating dependency: - * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause - * issues considering the other two lock dependencies. - * By creating the debugfs directory here we avoid a dependency - * that may cause deadlock (even though file operations cannot - * occur until the filesystem is mounted, but I do not know how to - * tell lockdep that). 
- */ - debugfs_resctrl = debugfs_create_dir("resctrl", NULL); - - return 0; - -cleanup_mountpoint: - sysfs_remove_mount_point(fs_kobj, "resctrl"); - resctrl_mon_resource_exit(); - - return ret; -} - -void __exit resctrl_exit(void) -{ - debugfs_remove_recursive(debugfs_resctrl); - unregister_filesystem(&rdt_fs_type); - sysfs_remove_mount_point(fs_kobj, "resctrl"); - - resctrl_mon_resource_exit(); -} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9bd4fa694da5..a10e180cbf23 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -208,7 +208,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) raw_cpu_write(current_task, next_p); /* Load the Intel cache allocation PQR MSR. */ - resctrl_sched_in(next_p); + resctrl_arch_sched_in(next_p); return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index f39ff02e498d..8d6cf25127aa 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -705,7 +705,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) } /* Load the Intel cache allocation PQR MSR. */ - resctrl_sched_in(next_p); + resctrl_arch_sched_in(next_p); return prev_p; } diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 36336299596b..e7e71490bd25 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -970,13 +970,13 @@ static void amd_rp_pme_suspend(struct pci_dev *dev) struct pci_dev *rp; /* - * PM_SUSPEND_ON means we're doing runtime suspend, which means + * If system suspend is not in progress, we're doing runtime suspend, so * amd-pmc will not be involved so PMEs during D3 work as advertised. * * The PMEs *do* work if amd-pmc doesn't put the SoC in the hardware * sleep state, but we assume amd-pmc is always present. */ - if (pm_suspend_target_state == PM_SUSPEND_ON) + if (!pm_suspend_in_progress()) return; rp = pcie_find_root_port(dev); |
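
For context on the final hunk above: pm_suspend_in_progress(), newly used in arch/x86/pci/fixup.c, is assumed here to be a thin helper in <linux/suspend.h> that is equivalent to the open-coded comparison it replaces. The snippet below is a sketch under that assumption, not the verbatim header definition:

	/* Assumed shape of the helper; equivalent to the replaced comparison. */
	static inline bool pm_suspend_in_progress(void)
	{
		return pm_suspend_target_state != PM_SUSPEND_ON;
	}

Read this way, amd_rp_pme_suspend() still returns early for runtime suspend (no system-wide transition in progress) and only applies its workaround during system suspend, which matches the reworded comment in the hunk.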
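
The cbm_ensure_valid() helper removed earlier in this diff enforces that a capacity bitmask (CBM) is a single contiguous run of set bits by keeping only the first run it finds. The standalone user-space sketch below illustrates that policy in plain C with hypothetical names and no kernel bitmap helpers; it demonstrates the documented behaviour and is not kernel code:

	#include <stdint.h>
	#include <stdio.h>

	/* Keep only the first contiguous run of set bits within cbm_len bits. */
	static uint32_t first_contiguous_run(uint32_t val, unsigned int cbm_len)
	{
		unsigned int first_bit, zero_bit;

		if (!val)
			return 0;

		/* Index of the lowest set bit. */
		for (first_bit = 0; first_bit < cbm_len; first_bit++)
			if (val & (1u << first_bit))
				break;

		/* Index of the first clear bit above it. */
		for (zero_bit = first_bit; zero_bit < cbm_len; zero_bit++)
			if (!(val & (1u << zero_bit)))
				break;

		/* Clear everything from that point up, leaving one contiguous run. */
		if (zero_bit < 32)
			val &= (1u << zero_bit) - 1;

		return val;
	}

	int main(void)
	{
		/* 0xb6 is 0b10110110: only the lowest run (bits 1-2) survives. */
		printf("0x%x\n", first_contiguous_run(0xb6, 11));
		return 0;
	}

Running it prints 0x6, mirroring how a newly created resource group is given a valid default CBM that the user may later tighten or widen through the schemata file.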