From 9fad9d560af5c654bb38e0b07ee54a4e9acdc5cd Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Wed, 8 May 2024 17:22:51 +0000 Subject: scsi: sr: Fix unintentional arithmetic wraparound Running syzkaller with the newly reintroduced signed integer overflow sanitizer produces this report: [ 65.194362] ------------[ cut here ]------------ [ 65.197752] UBSAN: signed-integer-overflow in ../drivers/scsi/sr_ioctl.c:436:9 [ 65.203607] -2147483648 * 177 cannot be represented in type 'int' [ 65.207911] CPU: 2 PID: 10416 Comm: syz-executor.1 Not tainted 6.8.0-rc2-00035-gb3ef86b5a957 #1 [ 65.213585] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 65.219923] Call Trace: [ 65.221556] [ 65.223029] dump_stack_lvl+0x93/0xd0 [ 65.225573] handle_overflow+0x171/0x1b0 [ 65.228219] sr_select_speed+0xeb/0xf0 [ 65.230786] ? __pm_runtime_resume+0xe6/0x130 [ 65.233606] sr_block_ioctl+0x15d/0x1d0 ... Historically, the signed integer overflow sanitizer did not work in the kernel due to its interaction with `-fwrapv` but this has since been changed [1] in the newest version of Clang. It was re-enabled in the kernel with Commit 557f8c582a9b ("ubsan: Reintroduce signed overflow sanitizer"). Firstly, let's change the type of "speed" to unsigned long as sr_select_speed()'s only caller passes in an unsigned long anyways. $ git grep '\.select_speed' | drivers/scsi/sr.c: .select_speed = sr_select_speed, ... | static int cdrom_ioctl_select_speed(struct cdrom_device_info *cdi, | unsigned long arg) | { | ... | return cdi->ops->select_speed(cdi, arg); | } Next, let's add an extra check to make sure we don't exceed 0xffff/177 (350) since 0xffff is the max speed. This has two benefits: 1) we deal with integer overflow before it happens and 2) we properly respect the max speed of 0xffff. There are some "magic" numbers here but I did not want to change more than what was necessary. Link: https://github.com/llvm/llvm-project/pull/82432 [1] Closes: https://github.com/KSPP/linux/issues/357 Cc: linux-hardening@vger.kernel.org Signed-off-by: Justin Stitt Link: https://lore.kernel.org/r/20240508-b4-b4-sio-sr_select_speed-v2-1-00b68f724290@google.com Reviewed-by: Kees Cook Signed-off-by: Martin K. Petersen --- include/linux/cdrom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 98c6fd0b39b6..fdfb61ccf55a 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -77,7 +77,7 @@ struct cdrom_device_ops { unsigned int clearing, int slot); int (*tray_move) (struct cdrom_device_info *, int); int (*lock_door) (struct cdrom_device_info *, int); - int (*select_speed) (struct cdrom_device_info *, int); + int (*select_speed) (struct cdrom_device_info *, unsigned long); int (*get_last_session) (struct cdrom_device_info *, struct cdrom_multisession *); int (*get_mcn) (struct cdrom_device_info *, -- cgit v1.2.3 From e61bcf42d290e73025bab38e0e55a5586c2d8ad5 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 15 Apr 2024 22:50:27 +0200 Subject: i2c: Remove I2C_CLASS_SPD Remove this class after all users have been gone. Signed-off-by: Heiner Kallweit Signed-off-by: Andi Shyti --- include/linux/i2c.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 5e6cd43a6dbd..9709537370ee 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -852,7 +852,6 @@ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap) /* i2c adapter classes (bitmask) */ #define I2C_CLASS_HWMON (1<<0) /* lm_sensors, ... */ -#define I2C_CLASS_SPD (1<<7) /* Memory modules */ /* Warn users that the adapter doesn't support classes anymore */ #define I2C_CLASS_DEPRECATED (1<<8) -- cgit v1.2.3 From db003a28e03f95f2bcb63f037a2078b8870b1ecd Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 28 May 2024 14:33:20 +0200 Subject: netfs: fix kernel doc for nets_wait_for_outstanding_io() The @inode parameter wasn't documented leading to new doc build warnings. Fixes: f89ea63f1c65 ("netfs, 9p: Fix race between umount and async request completion") Link: https://lore.kernel.org/r/20240528133050.7e09d78e@canb.auug.org.au Signed-off-by: Christian Brauner --- include/linux/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 3ca3906bb8da..5d0288938cc2 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -521,7 +521,7 @@ static inline struct fscache_cookie *netfs_i_cookie(struct netfs_inode *ctx) /** * netfs_wait_for_outstanding_io - Wait for outstanding I/O to complete - * @ctx: The netfs inode to wait on + * @inode: The netfs inode to wait on * * Wait for outstanding I/O requests of any type to complete. This is intended * to be called from inode eviction routines. This makes sure that any -- cgit v1.2.3 From c7a5096781732e0f9784551309484f3e103f6750 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 28 May 2024 13:25:02 +0300 Subject: PNP: Make dev_is_pnp() to be a function and export it for modules Since we have a dev_is_pnp() macro that utilises the address of the pnp_bus_type variable, the users, which can be compiled as modules, will fail to build. Convert the macro to be a function and export it to the modules to prevent build breakage. Reported-by: Woody Suwalski Closes: https://lore.kernel.org/r/cc8a93b2-2504-9754-e26c-5d5c3bd1265c@gmail.com Fixes: 2a49b45cd0e7 ("PNP: Add dev_is_pnp() macro") Signed-off-by: Andy Shevchenko Reviewed-by: Christoph Hellwig Signed-off-by: Rafael J. Wysocki --- drivers/pnp/driver.c | 6 ++++++ include/linux/pnp.h | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c index 0a5d0d8befa8..3483e52e3a81 100644 --- a/drivers/pnp/driver.c +++ b/drivers/pnp/driver.c @@ -266,6 +266,12 @@ const struct bus_type pnp_bus_type = { .dev_groups = pnp_dev_groups, }; +bool dev_is_pnp(const struct device *dev) +{ + return dev->bus == &pnp_bus_type; +} +EXPORT_SYMBOL_GPL(dev_is_pnp); + int pnp_register_driver(struct pnp_driver *drv) { drv->driver.name = drv->name; diff --git a/include/linux/pnp.h b/include/linux/pnp.h index 82561242cda4..a8def1cea32c 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h @@ -469,7 +469,7 @@ int compare_pnp_id(struct pnp_id *pos, const char *id); int pnp_register_driver(struct pnp_driver *drv); void pnp_unregister_driver(struct pnp_driver *drv); -#define dev_is_pnp(d) ((d)->bus == &pnp_bus_type) +bool dev_is_pnp(const struct device *dev); #else @@ -502,7 +502,7 @@ static inline int compare_pnp_id(struct pnp_id *pos, const char *id) { return -E static inline int pnp_register_driver(struct pnp_driver *drv) { return -ENODEV; } static inline void pnp_unregister_driver(struct pnp_driver *drv) { } -#define dev_is_pnp(d) false +static inline bool dev_is_pnp(const struct device *dev) { return false; } #endif /* CONFIG_PNP */ -- cgit v1.2.3 From edcde848c01eb071a91d479a6b3101d9cf48e905 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 28 May 2024 13:25:03 +0300 Subject: PNP: Hide pnp_bus_type from the non-PNP code The pnp_bus_type is defined only when CONFIG_PNP=y, while being not guarded by ifdeffery in the header. Moreover, it's not used outside of the PNP code. Move it to the internal header to make sure no-one will try to (ab)use it. Signed-off-by: Andy Shevchenko Reviewed-by: Christoph Hellwig Signed-off-by: Rafael J. Wysocki --- drivers/pnp/base.h | 1 + include/linux/pnp.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/pnp/base.h b/drivers/pnp/base.h index e74a0f6a3157..4e80273dfb1e 100644 --- a/drivers/pnp/base.h +++ b/drivers/pnp/base.h @@ -6,6 +6,7 @@ extern struct mutex pnp_lock; extern const struct attribute_group *pnp_dev_groups[]; +extern const struct bus_type pnp_bus_type; int pnp_register_protocol(struct pnp_protocol *protocol); void pnp_unregister_protocol(struct pnp_protocol *protocol); diff --git a/include/linux/pnp.h b/include/linux/pnp.h index a8def1cea32c..7f2ff95d2deb 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h @@ -435,8 +435,6 @@ struct pnp_protocol { #define protocol_for_each_dev(protocol, dev) \ list_for_each_entry(dev, &(protocol)->devices, protocol_list) -extern const struct bus_type pnp_bus_type; - #if defined(CONFIG_PNP) /* device management */ -- cgit v1.2.3 From 779b8a14afde110dd3502566be907289eba72447 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 May 2024 14:09:23 +0200 Subject: cpufreq: amd-pstate: remove global header file When extra warnings are enabled, gcc points out a global variable definition in a header: In file included from drivers/cpufreq/amd-pstate-ut.c:29: include/linux/amd-pstate.h:123:27: error: 'amd_pstate_mode_string' defined but not used [-Werror=unused-const-variable=] 123 | static const char * const amd_pstate_mode_string[] = { | ^~~~~~~~~~~~~~~~~~~~~~ This header is only included from two files in the same directory, and one of them uses only a single definition from it, so clean it up by moving most of the contents into the driver that uses them, and making shared bits a local header file. Fixes: 36c5014e5460 ("cpufreq: amd-pstate: optimize driver working mode selection in amd_pstate_param()") Signed-off-by: Arnd Bergmann Acked-by: Mario Limonciello Signed-off-by: Rafael J. Wysocki --- MAINTAINERS | 1 - drivers/cpufreq/amd-pstate-ut.c | 3 +- drivers/cpufreq/amd-pstate.c | 34 +++++++++- drivers/cpufreq/amd-pstate.h | 104 ++++++++++++++++++++++++++++++ include/linux/amd-pstate.h | 137 ---------------------------------------- 5 files changed, 139 insertions(+), 140 deletions(-) create mode 100644 drivers/cpufreq/amd-pstate.h delete mode 100644 include/linux/amd-pstate.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index d6c90161c7bf..fc31870379f8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1107,7 +1107,6 @@ L: linux-pm@vger.kernel.org S: Supported F: Documentation/admin-guide/pm/amd-pstate.rst F: drivers/cpufreq/amd-pstate* -F: include/linux/amd-pstate.h F: tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py AMD PTDMA DRIVER diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index f04ae67dda37..fc275d41d51e 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -26,10 +26,11 @@ #include #include #include -#include #include +#include "amd-pstate.h" + /* * Abbreviations: * amd_pstate_ut: used as a shortform for AMD P-State unit test. diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 1b7e82a0ad2e..91993647e09e 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include @@ -46,6 +45,8 @@ #include #include #include + +#include "amd-pstate.h" #include "amd-pstate-trace.h" #define AMD_PSTATE_TRANSITION_LATENCY 20000 @@ -53,6 +54,37 @@ #define CPPC_HIGHEST_PERF_PERFORMANCE 196 #define CPPC_HIGHEST_PERF_DEFAULT 166 +#define AMD_CPPC_EPP_PERFORMANCE 0x00 +#define AMD_CPPC_EPP_BALANCE_PERFORMANCE 0x80 +#define AMD_CPPC_EPP_BALANCE_POWERSAVE 0xBF +#define AMD_CPPC_EPP_POWERSAVE 0xFF + +/* + * enum amd_pstate_mode - driver working mode of amd pstate + */ +enum amd_pstate_mode { + AMD_PSTATE_UNDEFINED = 0, + AMD_PSTATE_DISABLE, + AMD_PSTATE_PASSIVE, + AMD_PSTATE_ACTIVE, + AMD_PSTATE_GUIDED, + AMD_PSTATE_MAX, +}; + +static const char * const amd_pstate_mode_string[] = { + [AMD_PSTATE_UNDEFINED] = "undefined", + [AMD_PSTATE_DISABLE] = "disable", + [AMD_PSTATE_PASSIVE] = "passive", + [AMD_PSTATE_ACTIVE] = "active", + [AMD_PSTATE_GUIDED] = "guided", + NULL, +}; + +struct quirk_entry { + u32 nominal_freq; + u32 lowest_freq; +}; + /* * TODO: We need more time to fine tune processors with shared memory solution * with community together. diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h new file mode 100644 index 000000000000..e6a28e7f4dbf --- /dev/null +++ b/drivers/cpufreq/amd-pstate.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 Advanced Micro Devices, Inc. + * + * Author: Meng Li + */ + +#ifndef _LINUX_AMD_PSTATE_H +#define _LINUX_AMD_PSTATE_H + +#include + +/********************************************************************* + * AMD P-state INTERFACE * + *********************************************************************/ +/** + * struct amd_aperf_mperf + * @aperf: actual performance frequency clock count + * @mperf: maximum performance frequency clock count + * @tsc: time stamp counter + */ +struct amd_aperf_mperf { + u64 aperf; + u64 mperf; + u64 tsc; +}; + +/** + * struct amd_cpudata - private CPU data for AMD P-State + * @cpu: CPU number + * @req: constraint request to apply + * @cppc_req_cached: cached performance request hints + * @highest_perf: the maximum performance an individual processor may reach, + * assuming ideal conditions + * For platforms that do not support the preferred core feature, the + * highest_pef may be configured with 166 or 255, to avoid max frequency + * calculated wrongly. we take the fixed value as the highest_perf. + * @nominal_perf: the maximum sustained performance level of the processor, + * assuming ideal operating conditions + * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power + * savings are achieved + * @lowest_perf: the absolute lowest performance level of the processor + * @prefcore_ranking: the preferred core ranking, the higher value indicates a higher + * priority. + * @min_limit_perf: Cached value of the performance corresponding to policy->min + * @max_limit_perf: Cached value of the performance corresponding to policy->max + * @min_limit_freq: Cached value of policy->min (in khz) + * @max_limit_freq: Cached value of policy->max (in khz) + * @max_freq: the frequency (in khz) that mapped to highest_perf + * @min_freq: the frequency (in khz) that mapped to lowest_perf + * @nominal_freq: the frequency (in khz) that mapped to nominal_perf + * @lowest_nonlinear_freq: the frequency (in khz) that mapped to lowest_nonlinear_perf + * @cur: Difference of Aperf/Mperf/tsc count between last and current sample + * @prev: Last Aperf/Mperf/tsc count value read from register + * @freq: current cpu frequency value (in khz) + * @boost_supported: check whether the Processor or SBIOS supports boost mode + * @hw_prefcore: check whether HW supports preferred core featue. + * Only when hw_prefcore and early prefcore param are true, + * AMD P-State driver supports preferred core featue. + * @epp_policy: Last saved policy used to set energy-performance preference + * @epp_cached: Cached CPPC energy-performance preference value + * @policy: Cpufreq policy value + * @cppc_cap1_cached Cached MSR_AMD_CPPC_CAP1 register value + * + * The amd_cpudata is key private data for each CPU thread in AMD P-State, and + * represents all the attributes and goals that AMD P-State requests at runtime. + */ +struct amd_cpudata { + int cpu; + + struct freq_qos_request req[2]; + u64 cppc_req_cached; + + u32 highest_perf; + u32 nominal_perf; + u32 lowest_nonlinear_perf; + u32 lowest_perf; + u32 prefcore_ranking; + u32 min_limit_perf; + u32 max_limit_perf; + u32 min_limit_freq; + u32 max_limit_freq; + + u32 max_freq; + u32 min_freq; + u32 nominal_freq; + u32 lowest_nonlinear_freq; + + struct amd_aperf_mperf cur; + struct amd_aperf_mperf prev; + + u64 freq; + bool boost_supported; + bool hw_prefcore; + + /* EPP feature related attributes*/ + s16 epp_policy; + s16 epp_cached; + u32 policy; + u64 cppc_cap1_cached; + bool suspended; +}; + +#endif /* _LINUX_AMD_PSTATE_H */ diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h deleted file mode 100644 index d58fc022ec46..000000000000 --- a/include/linux/amd-pstate.h +++ /dev/null @@ -1,137 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/include/linux/amd-pstate.h - * - * Copyright (C) 2022 Advanced Micro Devices, Inc. - * - * Author: Meng Li - */ - -#ifndef _LINUX_AMD_PSTATE_H -#define _LINUX_AMD_PSTATE_H - -#include - -#define AMD_CPPC_EPP_PERFORMANCE 0x00 -#define AMD_CPPC_EPP_BALANCE_PERFORMANCE 0x80 -#define AMD_CPPC_EPP_BALANCE_POWERSAVE 0xBF -#define AMD_CPPC_EPP_POWERSAVE 0xFF - -/********************************************************************* - * AMD P-state INTERFACE * - *********************************************************************/ -/** - * struct amd_aperf_mperf - * @aperf: actual performance frequency clock count - * @mperf: maximum performance frequency clock count - * @tsc: time stamp counter - */ -struct amd_aperf_mperf { - u64 aperf; - u64 mperf; - u64 tsc; -}; - -/** - * struct amd_cpudata - private CPU data for AMD P-State - * @cpu: CPU number - * @req: constraint request to apply - * @cppc_req_cached: cached performance request hints - * @highest_perf: the maximum performance an individual processor may reach, - * assuming ideal conditions - * For platforms that do not support the preferred core feature, the - * highest_pef may be configured with 166 or 255, to avoid max frequency - * calculated wrongly. we take the fixed value as the highest_perf. - * @nominal_perf: the maximum sustained performance level of the processor, - * assuming ideal operating conditions - * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power - * savings are achieved - * @lowest_perf: the absolute lowest performance level of the processor - * @prefcore_ranking: the preferred core ranking, the higher value indicates a higher - * priority. - * @min_limit_perf: Cached value of the performance corresponding to policy->min - * @max_limit_perf: Cached value of the performance corresponding to policy->max - * @min_limit_freq: Cached value of policy->min (in khz) - * @max_limit_freq: Cached value of policy->max (in khz) - * @max_freq: the frequency (in khz) that mapped to highest_perf - * @min_freq: the frequency (in khz) that mapped to lowest_perf - * @nominal_freq: the frequency (in khz) that mapped to nominal_perf - * @lowest_nonlinear_freq: the frequency (in khz) that mapped to lowest_nonlinear_perf - * @cur: Difference of Aperf/Mperf/tsc count between last and current sample - * @prev: Last Aperf/Mperf/tsc count value read from register - * @freq: current cpu frequency value (in khz) - * @boost_supported: check whether the Processor or SBIOS supports boost mode - * @hw_prefcore: check whether HW supports preferred core featue. - * Only when hw_prefcore and early prefcore param are true, - * AMD P-State driver supports preferred core featue. - * @epp_policy: Last saved policy used to set energy-performance preference - * @epp_cached: Cached CPPC energy-performance preference value - * @policy: Cpufreq policy value - * @cppc_cap1_cached Cached MSR_AMD_CPPC_CAP1 register value - * - * The amd_cpudata is key private data for each CPU thread in AMD P-State, and - * represents all the attributes and goals that AMD P-State requests at runtime. - */ -struct amd_cpudata { - int cpu; - - struct freq_qos_request req[2]; - u64 cppc_req_cached; - - u32 highest_perf; - u32 nominal_perf; - u32 lowest_nonlinear_perf; - u32 lowest_perf; - u32 prefcore_ranking; - u32 min_limit_perf; - u32 max_limit_perf; - u32 min_limit_freq; - u32 max_limit_freq; - - u32 max_freq; - u32 min_freq; - u32 nominal_freq; - u32 lowest_nonlinear_freq; - - struct amd_aperf_mperf cur; - struct amd_aperf_mperf prev; - - u64 freq; - bool boost_supported; - bool hw_prefcore; - - /* EPP feature related attributes*/ - s16 epp_policy; - s16 epp_cached; - u32 policy; - u64 cppc_cap1_cached; - bool suspended; -}; - -/* - * enum amd_pstate_mode - driver working mode of amd pstate - */ -enum amd_pstate_mode { - AMD_PSTATE_UNDEFINED = 0, - AMD_PSTATE_DISABLE, - AMD_PSTATE_PASSIVE, - AMD_PSTATE_ACTIVE, - AMD_PSTATE_GUIDED, - AMD_PSTATE_MAX, -}; - -static const char * const amd_pstate_mode_string[] = { - [AMD_PSTATE_UNDEFINED] = "undefined", - [AMD_PSTATE_DISABLE] = "disable", - [AMD_PSTATE_PASSIVE] = "passive", - [AMD_PSTATE_ACTIVE] = "active", - [AMD_PSTATE_GUIDED] = "guided", - NULL, -}; - -struct quirk_entry { - u32 nominal_freq; - u32 lowest_freq; -}; - -#endif /* _LINUX_AMD_PSTATE_H */ -- cgit v1.2.3 From b7c5e64fecfa88764791679cca4786ac65de739e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 29 May 2024 22:52:30 -0600 Subject: vfio: Create vfio_fs_type with inode per device By linking all the device fds we provide to userspace to an address space through a new pseudo fs, we can use tools like unmap_mapping_range() to zap all vmas associated with a device. Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240530045236.1005864-2-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/device_cdev.c | 7 +++++++ drivers/vfio/group.c | 7 +++++++ drivers/vfio/vfio_main.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/vfio.h | 1 + 4 files changed, 59 insertions(+) (limited to 'include/linux') diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index e75da0a70d1f..bb1817bd4ff3 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -39,6 +39,13 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep) filep->private_data = df; + /* + * Use the pseudo fs inode on the device to link all mmaps + * to the same address space, allowing us to unmap all vmas + * associated to this device using unmap_mapping_range(). + */ + filep->f_mapping = device->inode->i_mapping; + return 0; err_put_registration: diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c index 610a429c6191..ded364588d29 100644 --- a/drivers/vfio/group.c +++ b/drivers/vfio/group.c @@ -286,6 +286,13 @@ static struct file *vfio_device_open_file(struct vfio_device *device) */ filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE); + /* + * Use the pseudo fs inode on the device to link all mmaps + * to the same address space, allowing us to unmap all vmas + * associated to this device using unmap_mapping_range(). + */ + filep->f_mapping = device->inode->i_mapping; + if (device->group->type == VFIO_NO_IOMMU) dev_warn(device->dev, "vfio-noiommu device opened by user " "(%s:%d)\n", current->comm, task_pid_nr(current)); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index e97d796a54fb..a5a62d9d963f 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -22,8 +22,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -43,9 +45,13 @@ #define DRIVER_AUTHOR "Alex Williamson " #define DRIVER_DESC "VFIO - User Level meta-driver" +#define VFIO_MAGIC 0x5646494f /* "VFIO" */ + static struct vfio { struct class *device_class; struct ida device_ida; + struct vfsmount *vfs_mount; + int fs_count; } vfio; #ifdef CONFIG_VFIO_NOIOMMU @@ -186,6 +192,8 @@ static void vfio_device_release(struct device *dev) if (device->ops->release) device->ops->release(device); + iput(device->inode); + simple_release_fs(&vfio.vfs_mount, &vfio.fs_count); kvfree(device); } @@ -228,6 +236,34 @@ out_free: } EXPORT_SYMBOL_GPL(_vfio_alloc_device); +static int vfio_fs_init_fs_context(struct fs_context *fc) +{ + return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM; +} + +static struct file_system_type vfio_fs_type = { + .name = "vfio", + .owner = THIS_MODULE, + .init_fs_context = vfio_fs_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static struct inode *vfio_fs_inode_new(void) +{ + struct inode *inode; + int ret; + + ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count); + if (ret) + return ERR_PTR(ret); + + inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb); + if (IS_ERR(inode)) + simple_release_fs(&vfio.vfs_mount, &vfio.fs_count); + + return inode; +} + /* * Initialize a vfio_device so it can be registered to vfio core. */ @@ -246,6 +282,11 @@ static int vfio_init_device(struct vfio_device *device, struct device *dev, init_completion(&device->comp); device->dev = dev; device->ops = ops; + device->inode = vfio_fs_inode_new(); + if (IS_ERR(device->inode)) { + ret = PTR_ERR(device->inode); + goto out_inode; + } if (ops->init) { ret = ops->init(device); @@ -260,6 +301,9 @@ static int vfio_init_device(struct vfio_device *device, struct device *dev, return 0; out_uninit: + iput(device->inode); + simple_release_fs(&vfio.vfs_mount, &vfio.fs_count); +out_inode: vfio_release_device_set(device); ida_free(&vfio.device_ida, device->index); return ret; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 8b1a29820409..000a6cab2d31 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -64,6 +64,7 @@ struct vfio_device { struct completion comp; struct iommufd_access *iommufd_access; void (*put_kvm)(struct kvm *kvm); + struct inode *inode; #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_device *iommufd_device; u8 iommufd_attached:1; -- cgit v1.2.3 From aac6db75a9fc2c7a6f73e152df8f15101dda38e6 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 29 May 2024 22:52:31 -0600 Subject: vfio/pci: Use unmap_mapping_range() With the vfio device fd tied to the address space of the pseudo fs inode, we can use the mm to track all vmas that might be mmap'ing device BARs, which removes our vma_list and all the complicated lock ordering necessary to manually zap each related vma. Note that we can no longer store the pfn in vm_pgoff if we want to use unmap_mapping_range() to zap a selective portion of the device fd corresponding to BAR mappings. This also converts our mmap fault handler to use vmf_insert_pfn() because we no longer have a vma_list to avoid the concurrency problem with io_remap_pfn_range(). The goal is to eventually use the vm_ops huge_fault handler to avoid the additional faulting overhead, but vmf_insert_pfn_{pmd,pud}() need to learn about pfnmaps first. Also, Jason notes that a race exists between unmap_mapping_range() and the fops mmap callback if we were to call io_remap_pfn_range() to populate the vma on mmap. Specifically, mmap_region() does call_mmap() before it does vma_link_file() which gives a window where the vma is populated but invisible to unmap_mapping_range(). Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240530045236.1005864-3-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 264 ++++++++------------------------------- include/linux/vfio_pci_core.h | 2 - 2 files changed, 55 insertions(+), 211 deletions(-) (limited to 'include/linux') diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 80cae87fff36..db31c27bf78b 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1610,100 +1610,20 @@ ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *bu } EXPORT_SYMBOL_GPL(vfio_pci_core_write); -/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ -static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try) +static void vfio_pci_zap_bars(struct vfio_pci_core_device *vdev) { - struct vfio_pci_mmap_vma *mmap_vma, *tmp; + struct vfio_device *core_vdev = &vdev->vdev; + loff_t start = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX); + loff_t end = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX); + loff_t len = end - start; - /* - * Lock ordering: - * vma_lock is nested under mmap_lock for vm_ops callback paths. - * The memory_lock semaphore is used by both code paths calling - * into this function to zap vmas and the vm_ops.fault callback - * to protect the memory enable state of the device. - * - * When zapping vmas we need to maintain the mmap_lock => vma_lock - * ordering, which requires using vma_lock to walk vma_list to - * acquire an mm, then dropping vma_lock to get the mmap_lock and - * reacquiring vma_lock. This logic is derived from similar - * requirements in uverbs_user_mmap_disassociate(). - * - * mmap_lock must always be the top-level lock when it is taken. - * Therefore we can only hold the memory_lock write lock when - * vma_list is empty, as we'd need to take mmap_lock to clear - * entries. vma_list can only be guaranteed empty when holding - * vma_lock, thus memory_lock is nested under vma_lock. - * - * This enables the vm_ops.fault callback to acquire vma_lock, - * followed by memory_lock read lock, while already holding - * mmap_lock without risk of deadlock. - */ - while (1) { - struct mm_struct *mm = NULL; - - if (try) { - if (!mutex_trylock(&vdev->vma_lock)) - return 0; - } else { - mutex_lock(&vdev->vma_lock); - } - while (!list_empty(&vdev->vma_list)) { - mmap_vma = list_first_entry(&vdev->vma_list, - struct vfio_pci_mmap_vma, - vma_next); - mm = mmap_vma->vma->vm_mm; - if (mmget_not_zero(mm)) - break; - - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - mm = NULL; - } - if (!mm) - return 1; - mutex_unlock(&vdev->vma_lock); - - if (try) { - if (!mmap_read_trylock(mm)) { - mmput(mm); - return 0; - } - } else { - mmap_read_lock(mm); - } - if (try) { - if (!mutex_trylock(&vdev->vma_lock)) { - mmap_read_unlock(mm); - mmput(mm); - return 0; - } - } else { - mutex_lock(&vdev->vma_lock); - } - list_for_each_entry_safe(mmap_vma, tmp, - &vdev->vma_list, vma_next) { - struct vm_area_struct *vma = mmap_vma->vma; - - if (vma->vm_mm != mm) - continue; - - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - - zap_vma_ptes(vma, vma->vm_start, - vma->vm_end - vma->vm_start); - } - mutex_unlock(&vdev->vma_lock); - mmap_read_unlock(mm); - mmput(mm); - } + unmap_mapping_range(core_vdev->inode->i_mapping, start, len, true); } void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev) { - vfio_pci_zap_and_vma_lock(vdev, false); down_write(&vdev->memory_lock); - mutex_unlock(&vdev->vma_lock); + vfio_pci_zap_bars(vdev); } u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev) @@ -1725,99 +1645,41 @@ void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 c up_write(&vdev->memory_lock); } -/* Caller holds vma_lock */ -static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev, - struct vm_area_struct *vma) -{ - struct vfio_pci_mmap_vma *mmap_vma; - - mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL_ACCOUNT); - if (!mmap_vma) - return -ENOMEM; - - mmap_vma->vma = vma; - list_add(&mmap_vma->vma_next, &vdev->vma_list); - - return 0; -} - -/* - * Zap mmaps on open so that we can fault them in on access and therefore - * our vma_list only tracks mappings accessed since last zap. - */ -static void vfio_pci_mmap_open(struct vm_area_struct *vma) -{ - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); -} - -static void vfio_pci_mmap_close(struct vm_area_struct *vma) +static unsigned long vma_to_pfn(struct vm_area_struct *vma) { struct vfio_pci_core_device *vdev = vma->vm_private_data; - struct vfio_pci_mmap_vma *mmap_vma; + int index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + u64 pgoff; - mutex_lock(&vdev->vma_lock); - list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { - if (mmap_vma->vma == vma) { - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - break; - } - } - mutex_unlock(&vdev->vma_lock); + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff; } static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct vfio_pci_core_device *vdev = vma->vm_private_data; - struct vfio_pci_mmap_vma *mmap_vma; - vm_fault_t ret = VM_FAULT_NOPAGE; + unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff; + vm_fault_t ret = VM_FAULT_SIGBUS; - mutex_lock(&vdev->vma_lock); - down_read(&vdev->memory_lock); + pfn = vma_to_pfn(vma); - /* - * Memory region cannot be accessed if the low power feature is engaged - * or memory access is disabled. - */ - if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) { - ret = VM_FAULT_SIGBUS; - goto up_out; - } + down_read(&vdev->memory_lock); - /* - * We populate the whole vma on fault, so we need to test whether - * the vma has already been mapped, such as for concurrent faults - * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if - * we ask it to fill the same range again. - */ - list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { - if (mmap_vma->vma == vma) - goto up_out; - } + if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) + goto out_disabled; - if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) { - ret = VM_FAULT_SIGBUS; - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); - goto up_out; - } + ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff); - if (__vfio_pci_add_vma(vdev, vma)) { - ret = VM_FAULT_OOM; - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); - } - -up_out: +out_disabled: up_read(&vdev->memory_lock); - mutex_unlock(&vdev->vma_lock); + return ret; } static const struct vm_operations_struct vfio_pci_mmap_ops = { - .open = vfio_pci_mmap_open, - .close = vfio_pci_mmap_close, .fault = vfio_pci_mmap_fault, }; @@ -1880,11 +1742,12 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma vma->vm_private_data = vdev; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); /* - * See remap_pfn_range(), called from vfio_pci_fault() but we can't - * change vm_flags within the fault handler. Set them now. + * Set vm_flags now, they should not be changed in the fault handler. + * We want the same flags and page protection (decrypted above) as + * io_remap_pfn_range() would set. * * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64, * allowing KVM stage 2 device mapping attributes to use Normal-NC @@ -2202,8 +2065,6 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) mutex_init(&vdev->ioeventfds_lock); INIT_LIST_HEAD(&vdev->dummy_resources_list); INIT_LIST_HEAD(&vdev->ioeventfds_list); - mutex_init(&vdev->vma_lock); - INIT_LIST_HEAD(&vdev->vma_list); INIT_LIST_HEAD(&vdev->sriov_pfs_item); init_rwsem(&vdev->memory_lock); xa_init(&vdev->ctx); @@ -2219,7 +2080,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev) mutex_destroy(&vdev->igate); mutex_destroy(&vdev->ioeventfds_lock); - mutex_destroy(&vdev->vma_lock); kfree(vdev->region); kfree(vdev->pm_save); } @@ -2497,26 +2357,15 @@ unwind: return ret; } -/* - * We need to get memory_lock for each device, but devices can share mmap_lock, - * therefore we need to zap and hold the vma_lock for each device, and only then - * get each memory_lock. - */ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, struct vfio_pci_group_info *groups, struct iommufd_ctx *iommufd_ctx) { - struct vfio_pci_core_device *cur_mem; - struct vfio_pci_core_device *cur_vma; - struct vfio_pci_core_device *cur; + struct vfio_pci_core_device *vdev; struct pci_dev *pdev; - bool is_mem = true; int ret; mutex_lock(&dev_set->lock); - cur_mem = list_first_entry(&dev_set->device_list, - struct vfio_pci_core_device, - vdev.dev_set_list); pdev = vfio_pci_dev_set_resettable(dev_set); if (!pdev) { @@ -2533,7 +2382,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, if (ret) goto err_unlock; - list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) { + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) { bool owned; /* @@ -2557,38 +2406,38 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, * Otherwise, reset is not allowed. */ if (iommufd_ctx) { - int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev, + int devid = vfio_iommufd_get_dev_id(&vdev->vdev, iommufd_ctx); owned = (devid > 0 || devid == -ENOENT); } else { - owned = vfio_dev_in_groups(&cur_vma->vdev, groups); + owned = vfio_dev_in_groups(&vdev->vdev, groups); } if (!owned) { ret = -EINVAL; - goto err_undo; + break; } /* - * Locking multiple devices is prone to deadlock, runaway and - * unwind if we hit contention. + * Take the memory write lock for each device and zap BAR + * mappings to prevent the user accessing the device while in + * reset. Locking multiple devices is prone to deadlock, + * runaway and unwind if we hit contention. */ - if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) { + if (!down_write_trylock(&vdev->memory_lock)) { ret = -EBUSY; - goto err_undo; + break; } + + vfio_pci_zap_bars(vdev); } - cur_vma = NULL; - list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) { - if (!down_write_trylock(&cur_mem->memory_lock)) { - ret = -EBUSY; - goto err_undo; - } - mutex_unlock(&cur_mem->vma_lock); + if (!list_entry_is_head(vdev, + &dev_set->device_list, vdev.dev_set_list)) { + vdev = list_prev_entry(vdev, vdev.dev_set_list); + goto err_undo; } - cur_mem = NULL; /* * The pci_reset_bus() will reset all the devices in the bus. @@ -2599,25 +2448,22 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, * cause the PCI config space reset without restoring the original * state (saved locally in 'vdev->pm_save'). */ - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) - vfio_pci_set_power_state(cur, PCI_D0); + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) + vfio_pci_set_power_state(vdev, PCI_D0); ret = pci_reset_bus(pdev); + vdev = list_last_entry(&dev_set->device_list, + struct vfio_pci_core_device, vdev.dev_set_list); + err_undo: - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { - if (cur == cur_mem) - is_mem = false; - if (cur == cur_vma) - break; - if (is_mem) - up_write(&cur->memory_lock); - else - mutex_unlock(&cur->vma_lock); - } + list_for_each_entry_from_reverse(vdev, &dev_set->device_list, + vdev.dev_set_list) + up_write(&vdev->memory_lock); + + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) + pm_runtime_put(&vdev->pdev->dev); - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) - pm_runtime_put(&cur->pdev->dev); err_unlock: mutex_unlock(&dev_set->lock); return ret; diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index a2c8b8bba711..f87067438ed4 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -93,8 +93,6 @@ struct vfio_pci_core_device { struct list_head sriov_pfs_item; struct vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; - struct mutex vma_lock; - struct list_head vma_list; struct rw_semaphore memory_lock; }; -- cgit v1.2.3 From 89e8a2366e3bce584b6c01549d5019c5cda1205e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 28 May 2024 12:25:28 +0800 Subject: iommu: Return right value in iommu_sva_bind_device() iommu_sva_bind_device() should return either a sva bond handle or an ERR_PTR value in error cases. Existing drivers (idxd and uacce) only check the return value with IS_ERR(). This could potentially lead to a kernel NULL pointer dereference issue if the function returns NULL instead of an error pointer. In reality, this doesn't cause any problems because iommu_sva_bind_device() only returns NULL when the kernel is not configured with CONFIG_IOMMU_SVA. In this case, iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA) will return an error, and the device drivers won't call iommu_sva_bind_device() at all. Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices") Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20240528042528.71396-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7bc8dff7cf6d..17b3f36ad843 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1533,7 +1533,7 @@ struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, static inline struct iommu_sva * iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) { - return NULL; + return ERR_PTR(-ENODEV); } static inline void iommu_sva_unbind_device(struct iommu_sva *handle) -- cgit v1.2.3 From f85d39dd7ed89ffdd622bc1de247ffba8d961504 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 27 May 2024 19:35:38 +0200 Subject: kcov, usb: disable interrupts in kcov_remote_start_usb_softirq After commit 8fea0c8fda30 ("usb: core: hcd: Convert from tasklet to BH workqueue"), usb_giveback_urb_bh() runs in the BH workqueue with interrupts enabled. Thus, the remote coverage collection section in usb_giveback_urb_bh()-> __usb_hcd_giveback_urb() might be interrupted, and the interrupt handler might invoke __usb_hcd_giveback_urb() again. This breaks KCOV, as it does not support nested remote coverage collection sections within the same context (neither in task nor in softirq). Update kcov_remote_start/stop_usb_softirq() to disable interrupts for the duration of the coverage collection section to avoid nested sections in the softirq context (in addition to such in the task context, which are already handled). Reported-by: Tetsuo Handa Closes: https://lore.kernel.org/linux-usb/0f4d1964-7397-485b-bc48-11c01e2fcbca@I-love.SAKURA.ne.jp/ Closes: https://syzkaller.appspot.com/bug?extid=0438378d6f157baae1a2 Suggested-by: Alan Stern Fixes: 8fea0c8fda30 ("usb: core: hcd: Convert from tasklet to BH workqueue") Cc: stable@vger.kernel.org Acked-by: Dmitry Vyukov Signed-off-by: Andrey Konovalov Link: https://lore.kernel.org/r/20240527173538.4989-1-andrey.konovalov@linux.dev Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 12 +++++++----- include/linux/kcov.h | 47 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 45 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index e3366f4d82b9..1ff7d901fede 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1623,6 +1623,7 @@ static void __usb_hcd_giveback_urb(struct urb *urb) struct usb_hcd *hcd = bus_to_hcd(urb->dev->bus); struct usb_anchor *anchor = urb->anchor; int status = urb->unlinked; + unsigned long flags; urb->hcpriv = NULL; if (unlikely((urb->transfer_flags & URB_SHORT_NOT_OK) && @@ -1640,13 +1641,14 @@ static void __usb_hcd_giveback_urb(struct urb *urb) /* pass ownership to the completion handler */ urb->status = status; /* - * This function can be called in task context inside another remote - * coverage collection section, but kcov doesn't support that kind of - * recursion yet. Only collect coverage in softirq context for now. + * Only collect coverage in the softirq context and disable interrupts + * to avoid scenarios with nested remote coverage collection sections + * that KCOV does not support. + * See the comment next to kcov_remote_start_usb_softirq() for details. */ - kcov_remote_start_usb_softirq((u64)urb->dev->bus->busnum); + flags = kcov_remote_start_usb_softirq((u64)urb->dev->bus->busnum); urb->complete(urb); - kcov_remote_stop_softirq(); + kcov_remote_stop_softirq(flags); usb_anchor_resume_wakeups(anchor); atomic_dec(&urb->use_count); diff --git a/include/linux/kcov.h b/include/linux/kcov.h index b851ba415e03..1068a7318d89 100644 --- a/include/linux/kcov.h +++ b/include/linux/kcov.h @@ -55,21 +55,47 @@ static inline void kcov_remote_start_usb(u64 id) /* * The softirq flavor of kcov_remote_*() functions is introduced as a temporary - * work around for kcov's lack of nested remote coverage sections support in - * task context. Adding support for nested sections is tracked in: - * https://bugzilla.kernel.org/show_bug.cgi?id=210337 + * workaround for KCOV's lack of nested remote coverage sections support. + * + * Adding support is tracked in https://bugzilla.kernel.org/show_bug.cgi?id=210337. + * + * kcov_remote_start_usb_softirq(): + * + * 1. Only collects coverage when called in the softirq context. This allows + * avoiding nested remote coverage collection sections in the task context. + * For example, USB/IP calls usb_hcd_giveback_urb() in the task context + * within an existing remote coverage collection section. Thus, KCOV should + * not attempt to start collecting coverage within the coverage collection + * section in __usb_hcd_giveback_urb() in this case. + * + * 2. Disables interrupts for the duration of the coverage collection section. + * This allows avoiding nested remote coverage collection sections in the + * softirq context (a softirq might occur during the execution of a work in + * the BH workqueue, which runs with in_serving_softirq() > 0). + * For example, usb_giveback_urb_bh() runs in the BH workqueue with + * interrupts enabled, so __usb_hcd_giveback_urb() might be interrupted in + * the middle of its remote coverage collection section, and the interrupt + * handler might invoke __usb_hcd_giveback_urb() again. */ -static inline void kcov_remote_start_usb_softirq(u64 id) +static inline unsigned long kcov_remote_start_usb_softirq(u64 id) { - if (in_serving_softirq()) + unsigned long flags = 0; + + if (in_serving_softirq()) { + local_irq_save(flags); kcov_remote_start_usb(id); + } + + return flags; } -static inline void kcov_remote_stop_softirq(void) +static inline void kcov_remote_stop_softirq(unsigned long flags) { - if (in_serving_softirq()) + if (in_serving_softirq()) { kcov_remote_stop(); + local_irq_restore(flags); + } } #ifdef CONFIG_64BIT @@ -103,8 +129,11 @@ static inline u64 kcov_common_handle(void) } static inline void kcov_remote_start_common(u64 id) {} static inline void kcov_remote_start_usb(u64 id) {} -static inline void kcov_remote_start_usb_softirq(u64 id) {} -static inline void kcov_remote_stop_softirq(void) {} +static inline unsigned long kcov_remote_start_usb_softirq(u64 id) +{ + return 0; +} +static inline void kcov_remote_stop_softirq(unsigned long flags) {} #endif /* CONFIG_KCOV */ #endif /* _LINUX_KCOV_H */ -- cgit v1.2.3 From 971187350602d03c4a27c0783ff412502b95720a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 4 Jul 2023 14:17:19 +0100 Subject: driver core: remove devm_device_add_groups() There is no more in-kernel users of this function, and no driver should ever be using it, so remove it from the kernel. Acked-by: Dmitry Torokhov Acked-by: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/20230704131715.44454-8-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 45 --------------------------------------------- include/linux/device.h | 2 -- 2 files changed, 47 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 131d96c6090b..2e776fcc31b6 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2845,15 +2845,6 @@ static void devm_attr_group_remove(struct device *dev, void *res) sysfs_remove_group(&dev->kobj, group); } -static void devm_attr_groups_remove(struct device *dev, void *res) -{ - union device_attr_group_devres *devres = res; - const struct attribute_group **groups = devres->groups; - - dev_dbg(dev, "%s: removing groups %p\n", __func__, groups); - sysfs_remove_groups(&dev->kobj, groups); -} - /** * devm_device_add_group - given a device, create a managed attribute group * @dev: The device to create the group for @@ -2886,42 +2877,6 @@ int devm_device_add_group(struct device *dev, const struct attribute_group *grp) } EXPORT_SYMBOL_GPL(devm_device_add_group); -/** - * devm_device_add_groups - create a bunch of managed attribute groups - * @dev: The device to create the group for - * @groups: The attribute groups to create, NULL terminated - * - * This function creates a bunch of managed attribute groups. If an error - * occurs when creating a group, all previously created groups will be - * removed, unwinding everything back to the original state when this - * function was called. It will explicitly warn and error if any of the - * attribute files being created already exist. - * - * Returns 0 on success or error code from sysfs_create_group on failure. - */ -int devm_device_add_groups(struct device *dev, - const struct attribute_group **groups) -{ - union device_attr_group_devres *devres; - int error; - - devres = devres_alloc(devm_attr_groups_remove, - sizeof(*devres), GFP_KERNEL); - if (!devres) - return -ENOMEM; - - error = sysfs_create_groups(&dev->kobj, groups); - if (error) { - devres_free(devres); - return error; - } - - devres->groups = groups; - devres_add(dev, devres); - return 0; -} -EXPORT_SYMBOL_GPL(devm_device_add_groups); - static int device_add_attrs(struct device *dev) { const struct class *class = dev->class; diff --git a/include/linux/device.h b/include/linux/device.h index fc3bd7116ab9..ace039151cb8 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1220,8 +1220,6 @@ static inline void device_remove_group(struct device *dev, return device_remove_groups(dev, groups); } -int __must_check devm_device_add_groups(struct device *dev, - const struct attribute_group **groups); int __must_check devm_device_add_group(struct device *dev, const struct attribute_group *grp); -- cgit v1.2.3 From 44a45be57f85165761fdabf072f9a97aa026ff61 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Thu, 23 May 2024 13:00:00 +0200 Subject: sysfs: Unbreak the build around sysfs_bin_attr_simple_read() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Günter reports build breakage for m68k "m5208evb_defconfig" plus CONFIG_BLK_DEV_INITRD=y caused by commit 66bc1a173328 ("treewide: Use sysfs_bin_attr_simple_read() helper"). The defconfig disables CONFIG_SYSFS, so sysfs_bin_attr_simple_read() is not compiled into the kernel. But init/initramfs.c references that function in the initializer of a struct bin_attribute. Add an empty static inline to avoid the build breakage. Fixes: 66bc1a173328 ("treewide: Use sysfs_bin_attr_simple_read() helper") Reported-by: Guenter Roeck Closes: https://lore.kernel.org/r/e12b0027-b199-4de7-b83d-668171447ccc@roeck-us.net Signed-off-by: Lukas Wunner Tested-by: Guenter Roeck Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/05f4290439a58730738a15b0c99cd8576c4aa0d9.1716461752.git.lukas@wunner.de Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index a7d725fbf739..c4e64dc11206 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -750,6 +750,15 @@ static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...) { return 0; } + +static inline ssize_t sysfs_bin_attr_simple_read(struct file *file, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return 0; +} #endif /* CONFIG_SYSFS */ static inline int __must_check sysfs_create_file(struct kobject *kobj, -- cgit v1.2.3 From c9d52fb313d3719d69a040f4ca78a3e2e95fba21 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 30 May 2024 18:04:24 -0700 Subject: PCI: Revert the cfg_access_lock lockdep mechanism While the experiment did reveal that there are additional places that are missing the lock during secondary bus reset, one of the places that needs to take cfg_access_lock (pci_bus_lock()) is not prepared for lockdep annotation. Specifically, pci_bus_lock() takes pci_dev_lock() recursively and is currently dependent on the fact that the device_lock() is marked lockdep_set_novalidate_class(&dev->mutex). Otherwise, without that annotation, pci_bus_lock() would need to use something like a new pci_dev_lock_nested() helper, a scheme to track a PCI device's depth in the topology, and a hope that the depth of a PCI tree never exceeds the max value for a lockdep subclass. The alternative to ripping out the lockdep coverage would be to deploy a dynamic lock key for every PCI device. Unfortunately, there is evidence that increasing the number of keys that lockdep needs to track to be per-PCI-device is prohibitively expensive for something like the cfg_access_lock. The main motivation for adding the annotation in the first place was to catch unlocked secondary bus resets, not necessarily catch lock ordering problems between cfg_access_lock and other locks. Solve that narrower problem with follow-on patches, and just due to targeted revert for now. Link: https://lore.kernel.org/r/171711746402.1628941.14575335981264103013.stgit@dwillia2-xfh.jf.intel.com Fixes: 7e89efc6e9e4 ("PCI: Lock upstream bridge for pci_reset_function()") Reported-by: Imre Deak Closes: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_134186v1/shard-dg2-1/igt@device_reset@unbind-reset-rebind.html Signed-off-by: Dan Williams Signed-off-by: Bjorn Helgaas Tested-by: Hans de Goede Tested-by: Kalle Valo Reviewed-by: Dave Jiang Cc: Jani Saarinen --- drivers/pci/access.c | 4 ---- drivers/pci/pci.c | 1 - drivers/pci/probe.c | 3 --- include/linux/lockdep.h | 5 ----- include/linux/pci.h | 2 -- 5 files changed, 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 30f031de9cfe..b123da16b63b 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -289,8 +289,6 @@ void pci_cfg_access_lock(struct pci_dev *dev) { might_sleep(); - lock_map_acquire(&dev->cfg_access_lock); - raw_spin_lock_irq(&pci_lock); if (dev->block_cfg_access) pci_wait_cfg(dev); @@ -345,8 +343,6 @@ void pci_cfg_access_unlock(struct pci_dev *dev) raw_spin_unlock_irqrestore(&pci_lock, flags); wake_up_all(&pci_cfg_wait); - - lock_map_release(&dev->cfg_access_lock); } EXPORT_SYMBOL_GPL(pci_cfg_access_unlock); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 59e0949fb079..35fb1f17a589 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4883,7 +4883,6 @@ void __weak pcibios_reset_secondary_bus(struct pci_dev *dev) */ int pci_bridge_secondary_bus_reset(struct pci_dev *dev) { - lock_map_assert_held(&dev->cfg_access_lock); pcibios_reset_secondary_bus(dev); return pci_bridge_wait_for_secondary_bus(dev, "bus reset"); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 8e696e547565..5fbabb4e3425 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2546,9 +2546,6 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) dev->dev.dma_mask = &dev->dma_mask; dev->dev.dma_parms = &dev->dma_parms; dev->dev.coherent_dma_mask = 0xffffffffull; - lockdep_register_key(&dev->cfg_access_key); - lockdep_init_map(&dev->cfg_access_lock, dev_name(&dev->dev), - &dev->cfg_access_key, 0); dma_set_max_seg_size(&dev->dev, 65536); dma_set_seg_boundary(&dev->dev, 0xffffffff); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 5e51b0de4c4b..08b0d1d9d78b 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -297,9 +297,6 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); .wait_type_inner = _wait_type, \ .lock_type = LD_LOCK_WAIT_OVERRIDE, } -#define lock_map_assert_held(l) \ - lockdep_assert(lock_is_held(l) != LOCK_STATE_NOT_HELD) - #else /* !CONFIG_LOCKDEP */ static inline void lockdep_init_task(struct task_struct *task) @@ -391,8 +388,6 @@ extern int lockdep_is_held(const void *); #define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ struct lockdep_map __maybe_unused _name = {} -#define lock_map_assert_held(l) do { (void)(l); } while (0) - #endif /* !LOCKDEP */ #ifdef CONFIG_PROVE_LOCKING diff --git a/include/linux/pci.h b/include/linux/pci.h index fb004fd4e889..cafc5ab1cbcb 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -413,8 +413,6 @@ struct pci_dev { struct resource driver_exclusive_resource; /* driver exclusive resource ranges */ bool match_driver; /* Skip attaching driver */ - struct lock_class_key cfg_access_key; - struct lockdep_map cfg_access_lock; unsigned int transparent:1; /* Subtractive decode bridge */ unsigned int io_window:1; /* Bridge has I/O window */ -- cgit v1.2.3 From f92a59f6d12e31ead999fee9585471b95a8ae8a3 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Wed, 15 May 2024 13:37:10 +0000 Subject: locking/atomic: scripts: fix ${atomic}_sub_and_test() kerneldoc For ${atomic}_sub_and_test() the @i parameter is the value to subtract, not add. Fix the typo in the kerneldoc template and generate the headers with this update. Fixes: ad8110706f38 ("locking/atomic: scripts: generate kerneldoc comments") Suggested-by: Mark Rutland Signed-off-by: Carlos Llamas Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Reviewed-by: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20240515133844.3502360-1-cmllamas@google.com --- include/linux/atomic/atomic-arch-fallback.h | 6 +++--- include/linux/atomic/atomic-instrumented.h | 8 ++++---- include/linux/atomic/atomic-long.h | 4 ++-- scripts/atomic/kerneldoc/sub_and_test | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h index 956bcba5dbf2..2f9d36b72bd8 100644 --- a/include/linux/atomic/atomic-arch-fallback.h +++ b/include/linux/atomic/atomic-arch-fallback.h @@ -2242,7 +2242,7 @@ raw_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) /** * raw_atomic_sub_and_test() - atomic subtract and test if zero with full ordering - * @i: int value to add + * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. @@ -4368,7 +4368,7 @@ raw_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) /** * raw_atomic64_sub_and_test() - atomic subtract and test if zero with full ordering - * @i: s64 value to add + * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. @@ -4690,4 +4690,4 @@ raw_atomic64_dec_if_positive(atomic64_t *v) } #endif /* _LINUX_ATOMIC_FALLBACK_H */ -// 14850c0b0db20c62fdc78ccd1d42b98b88d76331 +// b565db590afeeff0d7c9485ccbca5bb6e155749f diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h index debd487fe971..9409a6ddf3e0 100644 --- a/include/linux/atomic/atomic-instrumented.h +++ b/include/linux/atomic/atomic-instrumented.h @@ -1349,7 +1349,7 @@ atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) /** * atomic_sub_and_test() - atomic subtract and test if zero with full ordering - * @i: int value to add + * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. @@ -2927,7 +2927,7 @@ atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) /** * atomic64_sub_and_test() - atomic subtract and test if zero with full ordering - * @i: s64 value to add + * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. @@ -4505,7 +4505,7 @@ atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) /** * atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering - * @i: long value to add + * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. @@ -5050,4 +5050,4 @@ atomic_long_dec_if_positive(atomic_long_t *v) #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ -// ce5b65e0f1f8a276268b667194581d24bed219d4 +// 8829b337928e9508259079d32581775ececd415b diff --git a/include/linux/atomic/atomic-long.h b/include/linux/atomic/atomic-long.h index 3ef844b3ab8a..f86b29d90877 100644 --- a/include/linux/atomic/atomic-long.h +++ b/include/linux/atomic/atomic-long.h @@ -1535,7 +1535,7 @@ raw_atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) /** * raw_atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering - * @i: long value to add + * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. @@ -1809,4 +1809,4 @@ raw_atomic_long_dec_if_positive(atomic_long_t *v) } #endif /* _LINUX_ATOMIC_LONG_H */ -// 1c4a26fc77f345342953770ebe3c4d08e7ce2f9a +// eadf183c3600b8b92b91839dd3be6bcc560c752d diff --git a/scripts/atomic/kerneldoc/sub_and_test b/scripts/atomic/kerneldoc/sub_and_test index d3760f7749d4..96615e50836b 100644 --- a/scripts/atomic/kerneldoc/sub_and_test +++ b/scripts/atomic/kerneldoc/sub_and_test @@ -1,7 +1,7 @@ cat < Date: Thu, 23 May 2024 10:36:39 +0800 Subject: mm: drop the 'anon_' prefix for swap-out mTHP counters The mTHP swap related counters: 'anon_swpout' and 'anon_swpout_fallback' are confusing with an 'anon_' prefix, since the shmem can swap out non-anonymous pages. So drop the 'anon_' prefix to keep consistent with the old swap counter names. This is needed in 6.10-rcX to avoid having an inconsistent ABI out in the field. Link: https://lkml.kernel.org/r/7a8989c13299920d7589007a30065c3e2c19f0e0.1716431702.git.baolin.wang@linux.alibaba.com Fixes: d0f048ac39f6 ("mm: add per-order mTHP anon_swpout and anon_swpout_fallback counters") Fixes: 42248b9d34ea ("mm: add docs for per-order mTHP counters and transhuge_page ABI") Signed-off-by: Baolin Wang Suggested-by: "Huang, Ying" Acked-by: Barry Song Cc: David Hildenbrand Cc: Lance Yang Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 4 ++-- include/linux/huge_mm.h | 4 ++-- mm/huge_memory.c | 8 ++++---- mm/page_io.c | 2 +- mm/vmscan.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 076443cc10a6..d414d3f5592a 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -467,11 +467,11 @@ anon_fault_fallback_charge instead falls back to using huge pages with lower orders or small pages even though the allocation was successful. -anon_swpout +swpout is incremented every time a huge page is swapped out in one piece without splitting. -anon_swpout_fallback +swpout_fallback is incremented if a huge page has to be split before swapout. Usually because failed to allocate some continuous swap space for the huge page. diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index c8d3ec116e29..8c72d3786583 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -269,8 +269,8 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, - MTHP_STAT_ANON_SWPOUT, - MTHP_STAT_ANON_SWPOUT_FALLBACK, + MTHP_STAT_SWPOUT, + MTHP_STAT_SWPOUT_FALLBACK, __MTHP_STAT_COUNT }; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 317de2afd371..89932fd0f62e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -558,15 +558,15 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); -DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT); -DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK); +DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); +DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); static struct attribute *stats_attrs[] = { &anon_fault_alloc_attr.attr, &anon_fault_fallback_attr.attr, &anon_fault_fallback_charge_attr.attr, - &anon_swpout_attr.attr, - &anon_swpout_fallback_attr.attr, + &swpout_attr.attr, + &swpout_fallback_attr.attr, NULL, }; diff --git a/mm/page_io.c b/mm/page_io.c index 46c603dddf04..0a150c240bf4 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -217,7 +217,7 @@ static inline void count_swpout_vm_event(struct folio *folio) count_memcg_folio_events(folio, THP_SWPOUT, 1); count_vm_event(THP_SWPOUT); } - count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_SWPOUT); + count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT); #endif count_vm_events(PSWPOUT, folio_nr_pages(folio)); } diff --git a/mm/vmscan.c b/mm/vmscan.c index d55e8d07ffc4..2e34de9cd0d4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1227,7 +1227,7 @@ retry: THP_SWPOUT_FALLBACK, 1); count_vm_event(THP_SWPOUT_FALLBACK); } - count_mthp_stat(order, MTHP_STAT_ANON_SWPOUT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); #endif if (!add_to_swap(folio)) goto activate_locked_split; -- cgit v1.2.3 From 94d46bf17916965e918bd2f3d2eec057f7c5578d Mon Sep 17 00:00:00 2001 From: Barry Song Date: Fri, 24 May 2024 08:50:48 +1200 Subject: mm: huge_mm: fix undefined reference to `mthp_stats' for CONFIG_SYSFS=n if CONFIG_SYSFS is not enabled in config, we get the below error, All errors (new ones prefixed by >>): s390-linux-ld: mm/memory.o: in function `count_mthp_stat': >> include/linux/huge_mm.h:285:(.text+0x191c): undefined reference to `mthp_stats' s390-linux-ld: mm/huge_memory.o:(.rodata+0x10): undefined reference to `mthp_stats' vim +285 include/linux/huge_mm.h 279 280 static inline void count_mthp_stat(int order, enum mthp_stat_item item) 281 { 282 if (order <= 0 || order > PMD_ORDER) 283 return; 284 > 285 this_cpu_inc(mthp_stats.stats[order][item]); 286 } 287 Link: https://lkml.kernel.org/r/20240523210045.40444-1-21cnbao@gmail.com Fixes: ec33687c6749 ("mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback counters") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405231728.tCAogiSI-lkp@intel.com/ Signed-off-by: Barry Song Tested-by: Yujie Liu Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8c72d3786583..2aa986a5cd1b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -278,6 +278,7 @@ struct mthp_stat { unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT]; }; +#ifdef CONFIG_SYSFS DECLARE_PER_CPU(struct mthp_stat, mthp_stats); static inline void count_mthp_stat(int order, enum mthp_stat_item item) @@ -287,6 +288,11 @@ static inline void count_mthp_stat(int order, enum mthp_stat_item item) this_cpu_inc(mthp_stats.stats[order][item]); } +#else +static inline void count_mthp_stat(int order, enum mthp_stat_item item) +{ +} +#endif #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ -- cgit v1.2.3 From c2dc78b86e0821ecf9a9d0c35dba2618279a5bb6 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 28 May 2024 13:15:22 +0800 Subject: mm/ksm: fix ksm_zero_pages accounting We normally ksm_zero_pages++ in ksmd when page is merged with zero page, but ksm_zero_pages-- is done from page tables side, where there is no any accessing protection of ksm_zero_pages. So we can read very exceptional value of ksm_zero_pages in rare cases, such as -1, which is very confusing to users. Fix it by changing to use atomic_long_t, and the same case with the mm->ksm_zero_pages. Link: https://lkml.kernel.org/r/20240528-b4-ksm-counters-v3-2-34bb358fdc13@linux.dev Fixes: e2942062e01d ("ksm: count all zero pages placed by KSM") Fixes: 6080d19f0704 ("ksm: add ksm zero pages for each process") Signed-off-by: Chengming Zhou Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Ran Xiaokai Cc: Stefan Roesch Cc: xu xin Cc: Yang Yang Cc: Signed-off-by: Andrew Morton --- fs/proc/base.c | 2 +- include/linux/ksm.h | 17 ++++++++++++++--- include/linux/mm_types.h | 2 +- mm/ksm.c | 11 +++++------ 4 files changed, 21 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/base.c b/fs/proc/base.c index 18550c071d71..72a1acd03675 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3214,7 +3214,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); - seq_printf(m, "ksm_zero_pages %lu\n", mm->ksm_zero_pages); + seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm)); seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); mmput(mm); diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 52c63a9c5a9c..11690dacd986 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -33,16 +33,27 @@ void __ksm_exit(struct mm_struct *mm); */ #define is_ksm_zero_pte(pte) (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte)) -extern unsigned long ksm_zero_pages; +extern atomic_long_t ksm_zero_pages; + +static inline void ksm_map_zero_page(struct mm_struct *mm) +{ + atomic_long_inc(&ksm_zero_pages); + atomic_long_inc(&mm->ksm_zero_pages); +} static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { if (is_ksm_zero_pte(pte)) { - ksm_zero_pages--; - mm->ksm_zero_pages--; + atomic_long_dec(&ksm_zero_pages); + atomic_long_dec(&mm->ksm_zero_pages); } } +static inline long mm_ksm_zero_pages(struct mm_struct *mm) +{ + return atomic_long_read(&mm->ksm_zero_pages); +} + static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 24323c7d0bd4..af3a0256fa93 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -985,7 +985,7 @@ struct mm_struct { * Represent how many empty pages are merged with kernel zero * pages when enabling KSM use_zero_pages. */ - unsigned long ksm_zero_pages; + atomic_long_t ksm_zero_pages; #endif /* CONFIG_KSM */ #ifdef CONFIG_LRU_GEN_WALKS_MMU struct { diff --git a/mm/ksm.c b/mm/ksm.c index 9e99cb12d330..34c4820e0d3d 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -296,7 +296,7 @@ static bool ksm_use_zero_pages __read_mostly; static bool ksm_smart_scan = true; /* The number of zero pages which is placed by KSM */ -unsigned long ksm_zero_pages; +atomic_long_t ksm_zero_pages = ATOMIC_LONG_INIT(0); /* The number of pages that have been skipped due to "smart scanning" */ static unsigned long ksm_pages_skipped; @@ -1429,8 +1429,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * the dirty bit in zero page's PTE is set. */ newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot))); - ksm_zero_pages++; - mm->ksm_zero_pages++; + ksm_map_zero_page(mm); /* * We're replacing an anonymous page with a zero page, which is * not anonymous. We need to do proper accounting otherwise we @@ -3374,7 +3373,7 @@ static void wait_while_offlining(void) #ifdef CONFIG_PROC_FS long ksm_process_profit(struct mm_struct *mm) { - return (long)(mm->ksm_merging_pages + mm->ksm_zero_pages) * PAGE_SIZE - + return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE - mm->ksm_rmap_items * sizeof(struct ksm_rmap_item); } #endif /* CONFIG_PROC_FS */ @@ -3663,7 +3662,7 @@ KSM_ATTR_RO(pages_skipped); static ssize_t ksm_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sysfs_emit(buf, "%ld\n", ksm_zero_pages); + return sysfs_emit(buf, "%ld\n", atomic_long_read(&ksm_zero_pages)); } KSM_ATTR_RO(ksm_zero_pages); @@ -3672,7 +3671,7 @@ static ssize_t general_profit_show(struct kobject *kobj, { long general_profit; - general_profit = (ksm_pages_sharing + ksm_zero_pages) * PAGE_SIZE - + general_profit = (ksm_pages_sharing + atomic_long_read(&ksm_zero_pages)) * PAGE_SIZE - ksm_rmap_items * sizeof(struct ksm_rmap_item); return sysfs_emit(buf, "%ld\n", general_profit); -- cgit v1.2.3 From 144ba8580bcb82b2686c3d1a043299d844b9a682 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Mon, 10 Jun 2024 10:34:26 +0200 Subject: net: pse-pd: Use EOPNOTSUPP error code instead of ENOTSUPP ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP as reported by checkpatch script. Fixes: 18ff0bcda6d1 ("ethtool: add interface to interact with Ethernet Power Equipment") Reviewed-by: Andrew Lunn Acked-by: Oleksij Rempel Signed-off-by: Kory Maincent Link: https://lore.kernel.org/r/20240610083426.740660-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- include/linux/pse-pd/pse.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index 6d07c95dabb9..6eec24ffa866 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -167,14 +167,14 @@ static inline int pse_ethtool_get_status(struct pse_control *psec, struct netlink_ext_ack *extack, struct pse_control_status *status) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int pse_ethtool_set_config(struct pse_control *psec, struct netlink_ext_ack *extack, const struct pse_control_config *config) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline bool pse_has_podl(struct pse_control *psec) -- cgit v1.2.3 From e038ee6189842e9662d2fc59d09dbcf48350cf99 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Mon, 10 Jun 2024 16:41:44 +0530 Subject: block: unmap and free user mapped integrity via submitter The user mapped intergity is copied back and unpinned by bio_integrity_free which is a low-level routine. Do it via the submitter rather than doing it in the low-level block layer code, to split the submitter side from the consumer side of the bio. Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20240610111144.14647-1-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 26 ++++++++++++++++++++++++-- drivers/nvme/host/ioctl.c | 15 +++++++++++---- include/linux/bio.h | 4 ++++ 3 files changed, 39 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 2e3e8e04961e..8b528e12136f 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -144,16 +144,38 @@ void bio_integrity_free(struct bio *bio) struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_set *bs = bio->bi_pool; + if (bip->bip_flags & BIP_INTEGRITY_USER) + return; if (bip->bip_flags & BIP_BLOCK_INTEGRITY) kfree(bvec_virt(bip->bip_vec)); - else if (bip->bip_flags & BIP_INTEGRITY_USER) - bio_integrity_unmap_user(bip); __bio_integrity_free(bs, bip); bio->bi_integrity = NULL; bio->bi_opf &= ~REQ_INTEGRITY; } +/** + * bio_integrity_unmap_free_user - Unmap and free bio user integrity payload + * @bio: bio containing bip to be unmapped and freed + * + * Description: Used to unmap and free the user mapped integrity portion of a + * bio. Submitter attaching the user integrity buffer is responsible for + * unmapping and freeing it during completion. + */ +void bio_integrity_unmap_free_user(struct bio *bio) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct bio_set *bs = bio->bi_pool; + + if (WARN_ON_ONCE(!(bip->bip_flags & BIP_INTEGRITY_USER))) + return; + bio_integrity_unmap_user(bip); + __bio_integrity_free(bs, bip); + bio->bi_integrity = NULL; + bio->bi_opf &= ~REQ_INTEGRITY; +} +EXPORT_SYMBOL(bio_integrity_unmap_free_user); + /** * bio_integrity_add_page - Attach integrity metadata * @bio: bio to update diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 9d9d2a127c4e..8b69427a4476 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -111,6 +111,13 @@ static struct request *nvme_alloc_user_request(struct request_queue *q, return req; } +static void nvme_unmap_bio(struct bio *bio) +{ + if (bio_integrity(bio)) + bio_integrity_unmap_free_user(bio); + blk_rq_unmap_user(bio); +} + static int nvme_map_user_request(struct request *req, u64 ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, u32 meta_seed, struct io_uring_cmd *ioucmd, unsigned int flags) @@ -157,7 +164,7 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, out_unmap: if (bio) - blk_rq_unmap_user(bio); + nvme_unmap_bio(bio); out: blk_mq_free_request(req); return ret; @@ -195,7 +202,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, if (result) *result = le64_to_cpu(nvme_req(req)->result.u64); if (bio) - blk_rq_unmap_user(bio); + nvme_unmap_bio(bio); blk_mq_free_request(req); if (effects) @@ -406,7 +413,7 @@ static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd, struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); if (pdu->bio) - blk_rq_unmap_user(pdu->bio); + nvme_unmap_bio(pdu->bio); io_uring_cmd_done(ioucmd, pdu->status, pdu->result, issue_flags); } @@ -432,7 +439,7 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, */ if (blk_rq_is_poll(req)) { if (pdu->bio) - blk_rq_unmap_user(pdu->bio); + nvme_unmap_bio(pdu->bio); io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status); } else { io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); diff --git a/include/linux/bio.h b/include/linux/bio.h index d5379548d684..818e93612947 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -731,6 +731,7 @@ static inline bool bioset_initialized(struct bio_set *bs) bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed); +void bio_integrity_unmap_free_user(struct bio *bio); extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); extern bool bio_integrity_prep(struct bio *); @@ -807,6 +808,9 @@ static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, { return -EINVAL; } +static inline void bio_integrity_unmap_free_user(struct bio *bio) +{ +} #endif /* CONFIG_BLK_DEV_INTEGRITY */ -- cgit v1.2.3 From f4a1254f2a076afb0edd473589bf40f9b4d36b41 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 14 Jun 2024 01:04:29 +0100 Subject: io_uring: fix cancellation overwriting req->flags Only the current owner of a request is allowed to write into req->flags. Hence, the cancellation path should never touch it. Add a new field instead of the flag, move it into the 3rd cache line because it should always be initialised. poll_refs can move further as polling is an involved process anyway. It's a minimal patch, in the future we can and should find a better place for it and remove now unused REQ_F_CANCEL_SEQ. Fixes: 521223d7c229f ("io_uring/cancel: don't default to setting req->work.cancel_seq") Cc: stable@vger.kernel.org Reported-by: Li Shi Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6827b129f8f0ad76fa9d1f0a773de938b240ffab.1718323430.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 ++- io_uring/cancel.h | 4 ++-- io_uring/io_uring.c | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 7a6b190c7da7..b48570eaa449 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -648,7 +648,7 @@ struct io_kiocb { struct io_rsrc_node *rsrc_node; atomic_t refs; - atomic_t poll_refs; + bool cancel_seq_set; struct io_task_work io_task_work; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; @@ -657,6 +657,7 @@ struct io_kiocb { /* opcode allocated if it needs to store data for async defer */ void *async_data; /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ + atomic_t poll_refs; struct io_kiocb *link; /* custom credentials, valid IFF REQ_F_CREDS is set */ const struct cred *creds; diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 76b32e65c03c..b33995e00ba9 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -27,10 +27,10 @@ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) { - if ((req->flags & REQ_F_CANCEL_SEQ) && sequence == req->work.cancel_seq) + if (req->cancel_seq_set && sequence == req->work.cancel_seq) return true; - req->flags |= REQ_F_CANCEL_SEQ; + req->cancel_seq_set = true; req->work.cancel_seq = sequence; return false; } diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 816e93e7f949..154b25b8a613 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2058,6 +2058,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->file = NULL; req->rsrc_node = NULL; req->task = current; + req->cancel_seq_set = false; if (unlikely(opcode >= IORING_OP_LAST)) { req->opcode = 0; -- cgit v1.2.3