From 49b393af3130c7712c7e8f215f4126c9a8060fa6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Jun 2025 10:21:38 +0200 Subject: perf: Add comment to enum perf_event_state Better describe the event states. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Leo Yan Link: https://lkml.kernel.org/r/20250604135801.GK38114@noisy.programming.kicks-ass.net --- include/linux/perf_event.h | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 52dc7cfab0e0..ec9d96025683 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -635,8 +635,46 @@ struct perf_addr_filter_range { unsigned long size; }; -/** - * enum perf_event_state - the states of an event: +/* + * The normal states are: + * + * ACTIVE --. + * ^ | + * | | + * sched_{in,out}() | + * | | + * v | + * ,---> INACTIVE --+ <-. + * | | | + * | {dis,en}able() + * sched_in() | | + * | OFF <--' --+ + * | | + * `---> ERROR ------' + * + * That is: + * + * sched_in: INACTIVE -> {ACTIVE,ERROR} + * sched_out: ACTIVE -> INACTIVE + * disable: {ACTIVE,INACTIVE} -> OFF + * enable: {OFF,ERROR} -> INACTIVE + * + * Where {OFF,ERROR} are disabled states. + * + * Then we have the {EXIT,REVOKED,DEAD} states which are various shades of + * defunct events: + * + * - EXIT means task that the even was assigned to died, but child events + * still live, and further children can still be created. But the event + * itself will never be active again. It can only transition to + * {REVOKED,DEAD}; + * + * - REVOKED means the PMU the event was associated with is gone; all + * functionality is stopped but the event is still alive. Can only + * transition to DEAD; + * + * - DEAD event really is DYING tearing down state and freeing bits. + * */ enum perf_event_state { PERF_EVENT_STATE_DEAD = -5, -- cgit v1.2.3 From 33877220b8641b4cde474a4229ea92c0e3637883 Mon Sep 17 00:00:00 2001 From: Tasos Sahanidis Date: Mon, 19 May 2025 11:56:55 +0300 Subject: ata: libata-acpi: Do not assume 40 wire cable if no devices are enabled On at least an ASRock 990FX Extreme 4 with a VIA VT6330, the devices have not yet been enabled by the first time ata_acpi_cbl_80wire() is called. This means that the ata_for_each_dev loop is never entered, and a 40 wire cable is assumed. The VIA controller on this board does not report the cable in the PCI config space, thus having to fall back to ACPI even though no SATA bridge is present. The _GTM values are correctly reported by the firmware through ACPI, which has already set up faster transfer modes, but due to the above the controller is forced down to a maximum of UDMA/33. Resolve this by modifying ata_acpi_cbl_80wire() to directly return the cable type. First, an unknown cable is assumed which preserves the mode set by the firmware, and then on subsequent calls when the devices have been enabled, an 80 wire cable is correctly detected. Since the function now directly returns the cable type, it is renamed to ata_acpi_cbl_pata_type(). Signed-off-by: Tasos Sahanidis Link: https://lore.kernel.org/r/20250519085945.1399466-1-tasos@tasossah.com Signed-off-by: Niklas Cassel --- drivers/ata/libata-acpi.c | 24 ++++++++++++++++-------- drivers/ata/pata_via.c | 6 ++---- include/linux/libata.h | 7 +++---- 3 files changed, 21 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c index b7f0bf795521..f2140fc06ba0 100644 --- a/drivers/ata/libata-acpi.c +++ b/drivers/ata/libata-acpi.c @@ -514,15 +514,19 @@ unsigned int ata_acpi_gtm_xfermask(struct ata_device *dev, EXPORT_SYMBOL_GPL(ata_acpi_gtm_xfermask); /** - * ata_acpi_cbl_80wire - Check for 80 wire cable + * ata_acpi_cbl_pata_type - Return PATA cable type * @ap: Port to check - * @gtm: GTM data to use * - * Return 1 if the @gtm indicates the BIOS selected an 80wire mode. + * Return ATA_CBL_PATA* according to the transfer mode selected by BIOS */ -int ata_acpi_cbl_80wire(struct ata_port *ap, const struct ata_acpi_gtm *gtm) +int ata_acpi_cbl_pata_type(struct ata_port *ap) { struct ata_device *dev; + int ret = ATA_CBL_PATA_UNK; + const struct ata_acpi_gtm *gtm = ata_acpi_init_gtm(ap); + + if (!gtm) + return ATA_CBL_PATA40; ata_for_each_dev(dev, &ap->link, ENABLED) { unsigned int xfer_mask, udma_mask; @@ -530,13 +534,17 @@ int ata_acpi_cbl_80wire(struct ata_port *ap, const struct ata_acpi_gtm *gtm) xfer_mask = ata_acpi_gtm_xfermask(dev, gtm); ata_unpack_xfermask(xfer_mask, NULL, NULL, &udma_mask); - if (udma_mask & ~ATA_UDMA_MASK_40C) - return 1; + ret = ATA_CBL_PATA40; + + if (udma_mask & ~ATA_UDMA_MASK_40C) { + ret = ATA_CBL_PATA80; + break; + } } - return 0; + return ret; } -EXPORT_SYMBOL_GPL(ata_acpi_cbl_80wire); +EXPORT_SYMBOL_GPL(ata_acpi_cbl_pata_type); static void ata_acpi_gtf_to_tf(struct ata_device *dev, const struct ata_acpi_gtf *gtf, diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c index d82728a01832..bb80e7800dcb 100644 --- a/drivers/ata/pata_via.c +++ b/drivers/ata/pata_via.c @@ -201,11 +201,9 @@ static int via_cable_detect(struct ata_port *ap) { two drives */ if (ata66 & (0x10100000 >> (16 * ap->port_no))) return ATA_CBL_PATA80; + /* Check with ACPI so we can spot BIOS reported SATA bridges */ - if (ata_acpi_init_gtm(ap) && - ata_acpi_cbl_80wire(ap, ata_acpi_init_gtm(ap))) - return ATA_CBL_PATA80; - return ATA_CBL_PATA40; + return ata_acpi_cbl_pata_type(ap); } static int via_pre_reset(struct ata_link *link, unsigned long deadline) diff --git a/include/linux/libata.h b/include/linux/libata.h index 31be45fd47a6..1e5aec839041 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1352,7 +1352,7 @@ int ata_acpi_stm(struct ata_port *ap, const struct ata_acpi_gtm *stm); int ata_acpi_gtm(struct ata_port *ap, struct ata_acpi_gtm *stm); unsigned int ata_acpi_gtm_xfermask(struct ata_device *dev, const struct ata_acpi_gtm *gtm); -int ata_acpi_cbl_80wire(struct ata_port *ap, const struct ata_acpi_gtm *gtm); +int ata_acpi_cbl_pata_type(struct ata_port *ap); #else static inline const struct ata_acpi_gtm *ata_acpi_init_gtm(struct ata_port *ap) { @@ -1377,10 +1377,9 @@ static inline unsigned int ata_acpi_gtm_xfermask(struct ata_device *dev, return 0; } -static inline int ata_acpi_cbl_80wire(struct ata_port *ap, - const struct ata_acpi_gtm *gtm) +static inline int ata_acpi_cbl_pata_type(struct ata_port *ap) { - return 0; + return ATA_CBL_PATA40; } #endif -- cgit v1.2.3 From 0b0cae7119a0ec9449d7261b5e672a5fed765068 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Tue, 3 Jun 2025 14:14:43 +0300 Subject: x86/its: move its_pages array to struct mod_arch_specific The of pages with ITS thunks allocated for modules are tracked by an array in 'struct module'. Since this is very architecture specific data structure, move it to 'struct mod_arch_specific'. No functional changes. Fixes: 872df34d7c51 ("x86/its: Use dynamic thunks for indirect branches") Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250603111446.2609381-4-rppt@kernel.org --- arch/x86/include/asm/module.h | 8 ++++++++ arch/x86/kernel/alternative.c | 19 ++++++++++--------- include/linux/module.h | 5 ----- 3 files changed, 18 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index e988bac0a4a1..3c2de4ce3b10 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -5,12 +5,20 @@ #include #include +struct its_array { +#ifdef CONFIG_MITIGATION_ITS + void **pages; + int num; +#endif +}; + struct mod_arch_specific { #ifdef CONFIG_UNWINDER_ORC unsigned int num_orcs; int *orc_unwind_ip; struct orc_entry *orc_unwind; #endif + struct its_array its_pages; }; #endif /* _ASM_X86_MODULE_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ecfe7b497cad..b50fe6ce4655 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -173,8 +173,8 @@ void its_fini_mod(struct module *mod) its_page = NULL; mutex_unlock(&text_mutex); - for (int i = 0; i < mod->its_num_pages; i++) { - void *page = mod->its_page_array[i]; + for (int i = 0; i < mod->arch.its_pages.num; i++) { + void *page = mod->arch.its_pages.pages[i]; execmem_restore_rox(page, PAGE_SIZE); } } @@ -184,11 +184,11 @@ void its_free_mod(struct module *mod) if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) return; - for (int i = 0; i < mod->its_num_pages; i++) { - void *page = mod->its_page_array[i]; + for (int i = 0; i < mod->arch.its_pages.num; i++) { + void *page = mod->arch.its_pages.pages[i]; execmem_free(page); } - kfree(mod->its_page_array); + kfree(mod->arch.its_pages.pages); } #endif /* CONFIG_MODULES */ @@ -201,14 +201,15 @@ static void *its_alloc(void) #ifdef CONFIG_MODULES if (its_mod) { - void *tmp = krealloc(its_mod->its_page_array, - (its_mod->its_num_pages+1) * sizeof(void *), + struct its_array *pages = &its_mod->arch.its_pages; + void *tmp = krealloc(pages->pages, + (pages->num+1) * sizeof(void *), GFP_KERNEL); if (!tmp) return NULL; - its_mod->its_page_array = tmp; - its_mod->its_page_array[its_mod->its_num_pages++] = page; + pages->pages = tmp; + pages->pages[pages->num++] = page; execmem_make_temp_rw(page, PAGE_SIZE); } diff --git a/include/linux/module.h b/include/linux/module.h index 92e1420fccdf..5faa1fb1f4b4 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -586,11 +586,6 @@ struct module { atomic_t refcnt; #endif -#ifdef CONFIG_MITIGATION_ITS - int its_num_pages; - void **its_page_array; -#endif - #ifdef CONFIG_CONSTRUCTORS /* Constructor functions. */ ctor_fn_t *ctors; -- cgit v1.2.3 From 7cd9a11dd0c3d1dd225795ed1b5b53132888e7b5 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Tue, 3 Jun 2025 14:14:45 +0300 Subject: Revert "mm/execmem: Unify early execmem_cache behaviour" The commit d6d1e3e6580c ("mm/execmem: Unify early execmem_cache behaviour") changed early behaviour of execemem ROX cache to allow its usage in early x86 code that allocates text pages when CONFIG_MITGATION_ITS is enabled. The permission management of the pages allocated from execmem for ITS mitigation is now completely contained in arch/x86/kernel/alternatives.c and therefore there is no need to special case early allocations in execmem. This reverts commit d6d1e3e6580ca35071ad474381f053cbf1fb6414. Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250603111446.2609381-6-rppt@kernel.org --- arch/x86/mm/init_32.c | 3 --- arch/x86/mm/init_64.c | 3 --- include/linux/execmem.h | 8 +------- mm/execmem.c | 40 +++------------------------------------- 4 files changed, 4 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 607d6a2e66e2..8a34fff6ab2b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include @@ -749,8 +748,6 @@ void mark_rodata_ro(void) pr_info("Write protecting kernel text and read-only data: %luk\n", size >> 10); - execmem_cache_make_ro(); - kernel_set_to_readonly = 1; #ifdef CONFIG_CPA_DEBUG diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ee66fae9ebcc..fdb6cab524f0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include @@ -1392,8 +1391,6 @@ void mark_rodata_ro(void) (end - start) >> 10); set_memory_ro(start, (end - start) >> PAGE_SHIFT); - execmem_cache_make_ro(); - kernel_set_to_readonly = 1; /* diff --git a/include/linux/execmem.h b/include/linux/execmem.h index ca42d5e46ccc..3be35680a54f 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -54,7 +54,7 @@ enum execmem_range_flags { EXECMEM_ROX_CACHE = (1 << 1), }; -#if defined(CONFIG_ARCH_HAS_EXECMEM_ROX) && defined(CONFIG_EXECMEM) +#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX /** * execmem_fill_trapping_insns - set memory to contain instructions that * will trap @@ -94,15 +94,9 @@ int execmem_make_temp_rw(void *ptr, size_t size); * Return: 0 on success or negative error code on failure. */ int execmem_restore_rox(void *ptr, size_t size); - -/* - * Called from mark_readonly(), where the system transitions to ROX. - */ -void execmem_cache_make_ro(void); #else static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; } static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; } -static inline void execmem_cache_make_ro(void) { } #endif /** diff --git a/mm/execmem.c b/mm/execmem.c index 9720ac2dfa41..2b683e7d864d 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -254,34 +254,6 @@ out_unlock: return ptr; } -static bool execmem_cache_rox = false; - -void execmem_cache_make_ro(void) -{ - struct maple_tree *free_areas = &execmem_cache.free_areas; - struct maple_tree *busy_areas = &execmem_cache.busy_areas; - MA_STATE(mas_free, free_areas, 0, ULONG_MAX); - MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX); - struct mutex *mutex = &execmem_cache.mutex; - void *area; - - execmem_cache_rox = true; - - mutex_lock(mutex); - - mas_for_each(&mas_free, area, ULONG_MAX) { - unsigned long pages = mas_range_len(&mas_free) >> PAGE_SHIFT; - set_memory_ro(mas_free.index, pages); - } - - mas_for_each(&mas_busy, area, ULONG_MAX) { - unsigned long pages = mas_range_len(&mas_busy) >> PAGE_SHIFT; - set_memory_ro(mas_busy.index, pages); - } - - mutex_unlock(mutex); -} - static int execmem_cache_populate(struct execmem_range *range, size_t size) { unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; @@ -302,15 +274,9 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size) /* fill memory with instructions that will trap */ execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); - if (execmem_cache_rox) { - err = set_memory_rox((unsigned long)p, vm->nr_pages); - if (err) - goto err_free_mem; - } else { - err = set_memory_x((unsigned long)p, vm->nr_pages); - if (err) - goto err_free_mem; - } + err = set_memory_rox((unsigned long)p, vm->nr_pages); + if (err) + goto err_free_mem; err = execmem_cache_add(p, alloc_size); if (err) -- cgit v1.2.3 From 6bdd3a01fe4627ad7a562ba38eb759eba715b671 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Jun 2025 16:00:20 +0200 Subject: fs: add missing values to TRACE_IOCB_STRINGS Make sure all values are covered. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/20250610140020.2227932-1-hch@lst.de Signed-off-by: Christian Brauner --- include/linux/fs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 96c7925a6551..d27c402f1162 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -399,7 +399,9 @@ struct readahead_control; { IOCB_WAITQ, "WAITQ" }, \ { IOCB_NOIO, "NOIO" }, \ { IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \ - { IOCB_DIO_CALLER_COMP, "CALLER_COMP" } + { IOCB_DIO_CALLER_COMP, "CALLER_COMP" }, \ + { IOCB_AIO_RW, "AIO_RW" }, \ + { IOCB_HAS_METADATA, "AIO_HAS_METADATA" } struct kiocb { struct file *ki_filp; -- cgit v1.2.3 From ac90aad0e9bf7c37e706fdc08ce763a553890bdf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 12 Jun 2025 10:47:09 -0700 Subject: crypto: testmgr - reinstate kconfig control over full self-tests Commit 698de822780f ("crypto: testmgr - make it easier to enable the full set of tests") removed support for building kernels that run only the "fast" set of crypto self-tests by default. This assumed that nearly everyone actually wanted the full set of tests, *if* they had already chosen to enable the tests at all. Unfortunately, it turns out that both Debian and Fedora intentionally have the crypto self-tests enabled in their production kernels. And for production kernels we do need to keep the testing time down, which implies just running the "fast" tests, not the full set of tests. For Fedora, a reason for enabling the tests in production is that they are being (mis)used to meet the FIPS 140-3 pre-operational testing requirement. However, the other reason for enabling the tests in production, which applies to both distros, is that they provide some value in protecting users from buggy drivers. Unfortunately, the crypto/ subsystem has many buggy and untested drivers for off-CPU hardware accelerators on rare platforms. These broken drivers get shipped to users, and there have been multiple examples of the tests preventing these buggy drivers from being used. So effectively, the tests are being relied on in production kernels. I think this is kind of crazy (untested drivers should just not be enabled at all), but that seems to be how things work currently. Thus, reintroduce a kconfig option that controls the level of testing. Call it CRYPTO_SELFTESTS_FULL instead of the original name CRYPTO_MANAGER_EXTRA_TESTS, which was slightly misleading. Moreover, given the "production kernel" use case, make CRYPTO_SELFTESTS depend on EXPERT instead of DEBUG_KERNEL. I also haven't reinstated all the #ifdefs in crypto/testmgr.c. Instead, just rely on the compiler to optimize out unused code. Fixes: 40b9969796bf ("crypto: testmgr - replace CRYPTO_MANAGER_DISABLE_TESTS with CRYPTO_SELFTESTS") Fixes: 698de822780f ("crypto: testmgr - make it easier to enable the full set of tests") Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/Kconfig | 25 +++++++++++++++++++++---- crypto/testmgr.c | 15 ++++++++++++--- include/crypto/internal/simd.h | 6 ++++-- lib/crypto/Makefile | 2 +- 4 files changed, 38 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/crypto/Kconfig b/crypto/Kconfig index e9fee7818e27..e1cfd0d4cc8f 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -176,16 +176,33 @@ config CRYPTO_USER config CRYPTO_SELFTESTS bool "Enable cryptographic self-tests" - depends on DEBUG_KERNEL + depends on EXPERT help Enable the cryptographic self-tests. The cryptographic self-tests run at boot time, or at algorithm registration time if algorithms are dynamically loaded later. - This is primarily intended for developer use. It should not be - enabled in production kernels, unless you are trying to use these - tests to fulfill a FIPS testing requirement. + There are two main use cases for these tests: + + - Development and pre-release testing. In this case, also enable + CRYPTO_SELFTESTS_FULL to get the full set of tests. All crypto code + in the kernel is expected to pass the full set of tests. + + - Production kernels, to help prevent buggy drivers from being used + and/or meet FIPS 140-3 pre-operational testing requirements. In + this case, enable CRYPTO_SELFTESTS but not CRYPTO_SELFTESTS_FULL. + +config CRYPTO_SELFTESTS_FULL + bool "Enable the full set of cryptographic self-tests" + depends on CRYPTO_SELFTESTS + help + Enable the full set of cryptographic self-tests for each algorithm. + + The full set of tests should be enabled for development and + pre-release testing, but not in production kernels. + + All crypto code in the kernel is expected to pass the full tests. config CRYPTO_NULL tristate "Null algorithms" diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 72005074a5c2..32f753d6c430 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -45,6 +45,7 @@ static bool notests; module_param(notests, bool, 0644); MODULE_PARM_DESC(notests, "disable all crypto self-tests"); +#ifdef CONFIG_CRYPTO_SELFTESTS_FULL static bool noslowtests; module_param(noslowtests, bool, 0644); MODULE_PARM_DESC(noslowtests, "disable slow crypto self-tests"); @@ -52,6 +53,10 @@ MODULE_PARM_DESC(noslowtests, "disable slow crypto self-tests"); static unsigned int fuzz_iterations = 100; module_param(fuzz_iterations, uint, 0644); MODULE_PARM_DESC(fuzz_iterations, "number of fuzz test iterations"); +#else +#define noslowtests 1 +#define fuzz_iterations 0 +#endif #ifndef CONFIG_CRYPTO_SELFTESTS @@ -319,9 +324,9 @@ struct testvec_config { /* * The following are the lists of testvec_configs to test for each algorithm - * type when the fast crypto self-tests are enabled. They aim to provide good - * test coverage, while keeping the test time much shorter than the full tests - * so that the fast tests can be used to fulfill FIPS 140 testing requirements. + * type when the "fast" crypto self-tests are enabled. They aim to provide good + * test coverage, while keeping the test time much shorter than the "full" tests + * so that the "fast" tests can be enabled in a wider range of circumstances. */ /* Configs for skciphers and aeads */ @@ -1183,14 +1188,18 @@ static void generate_random_testvec_config(struct rnd_state *rng, static void crypto_disable_simd_for_test(void) { +#ifdef CONFIG_CRYPTO_SELFTESTS_FULL migrate_disable(); __this_cpu_write(crypto_simd_disabled_for_test, true); +#endif } static void crypto_reenable_simd_for_test(void) { +#ifdef CONFIG_CRYPTO_SELFTESTS_FULL __this_cpu_write(crypto_simd_disabled_for_test, false); migrate_enable(); +#endif } /* diff --git a/include/crypto/internal/simd.h b/include/crypto/internal/simd.h index 7e7f1ac3b7fd..9e338e7aafbd 100644 --- a/include/crypto/internal/simd.h +++ b/include/crypto/internal/simd.h @@ -44,9 +44,11 @@ void simd_unregister_aeads(struct aead_alg *algs, int count, * * This delegates to may_use_simd(), except that this also returns false if SIMD * in crypto code has been temporarily disabled on this CPU by the crypto - * self-tests, in order to test the no-SIMD fallback code. + * self-tests, in order to test the no-SIMD fallback code. This override is + * currently limited to configurations where the "full" self-tests are enabled, + * because it might be a bit too invasive to be part of the "fast" self-tests. */ -#ifdef CONFIG_CRYPTO_SELFTESTS +#ifdef CONFIG_CRYPTO_SELFTESTS_FULL DECLARE_PER_CPU(bool, crypto_simd_disabled_for_test); #define crypto_simd_usable() \ (may_use_simd() && !this_cpu_read(crypto_simd_disabled_for_test)) diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 3e79283b617d..f9e44aac6619 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -62,7 +62,7 @@ libsha256-generic-y := sha256-generic.o obj-$(CONFIG_MPILIB) += mpi/ -obj-$(CONFIG_CRYPTO_SELFTESTS) += simd.o +obj-$(CONFIG_CRYPTO_SELFTESTS_FULL) += simd.o obj-$(CONFIG_CRYPTO_LIB_SM3) += libsm3.o libsm3-y := sm3.o -- cgit v1.2.3 From 594902c986e269660302f09df9ec4bf1cf017b77 Mon Sep 17 00:00:00 2001 From: Qinyun Tan Date: Sat, 31 May 2025 02:20:53 +0800 Subject: x86,fs/resctrl: Remove inappropriate references to cacheinfo in the resctrl subsystem In the resctrl subsystem's Sub-NUMA Cluster (SNC) mode, the rdt_mon_domain structure representing a NUMA node relies on the cacheinfo interface (rdt_mon_domain::ci) to store L3 cache information (e.g., shared_cpu_map) for monitoring. The L3 cache information of a SNC NUMA node determines which domains are summed for the "top level" L3-scoped events. rdt_mon_domain::ci is initialized using the first online CPU of a NUMA node. When this CPU goes offline, its shared_cpu_map is cleared to contain only the offline CPU itself. Subsequently, attempting to read counters via smp_call_on_cpu(offline_cpu) fails (and error ignored), returning zero values for "top-level events" without any error indication. Replace the cacheinfo references in struct rdt_mon_domain and struct rmid_read with the cacheinfo ID (a unique identifier for the L3 cache). rdt_domain_hdr::cpu_mask contains the online CPUs associated with that domain. When reading "top-level events", select a CPU from rdt_domain_hdr::cpu_mask and utilize its L3 shared_cpu_map to determine valid CPUs for reading RMID counter via the MSR interface. Considering all CPUs associated with the L3 cache improves the chances of picking a housekeeping CPU on which the counter reading work can be queued, avoiding an unnecessary IPI. Fixes: 328ea68874642 ("x86/resctrl: Prepare for new Sub-NUMA Cluster (SNC) monitor files") Signed-off-by: Qinyun Tan Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Tony Luck Link: https://lore.kernel.org/20250530182053.37502-2-qinyuntan@linux.alibaba.com --- arch/x86/kernel/cpu/resctrl/core.c | 6 ++++-- fs/resctrl/ctrlmondata.c | 13 +++++++++---- fs/resctrl/internal.h | 4 ++-- fs/resctrl/monitor.c | 6 ++++-- fs/resctrl/rdtgroup.c | 6 +++--- include/linux/resctrl.h | 4 ++-- 6 files changed, 24 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 7109cbfcad4f..187d527ef73b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -498,6 +498,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) struct rdt_hw_mon_domain *hw_dom; struct rdt_domain_hdr *hdr; struct rdt_mon_domain *d; + struct cacheinfo *ci; int err; lockdep_assert_held(&domain_list_lock); @@ -525,12 +526,13 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d = &hw_dom->d_resctrl; d->hdr.id = id; d->hdr.type = RESCTRL_MON_DOMAIN; - d->ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); - if (!d->ci) { + ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); + if (!ci) { pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name); mon_domain_free(hw_dom); return; } + d->ci_id = ci->id; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); arch_mon_domain_online(r, d); diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 6ed2dfd4dbbd..d98e0d2de09f 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -594,9 +594,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) struct rmid_read rr = {0}; struct rdt_mon_domain *d; struct rdtgroup *rdtgrp; + int domid, cpu, ret = 0; struct rdt_resource *r; + struct cacheinfo *ci; struct mon_data *md; - int domid, ret = 0; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -623,10 +624,14 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) * one that matches this cache id. */ list_for_each_entry(d, &r->mon_domains, hdr.list) { - if (d->ci->id == domid) { - rr.ci = d->ci; + if (d->ci_id == domid) { + rr.ci_id = d->ci_id; + cpu = cpumask_any(&d->hdr.cpu_mask); + ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); + if (!ci) + continue; mon_event_read(&rr, r, NULL, rdtgrp, - &d->ci->shared_cpu_map, evtid, false); + &ci->shared_cpu_map, evtid, false); goto checkresult; } } diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 9a8cf6f11151..0a1eedba2b03 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -98,7 +98,7 @@ struct mon_data { * domains in @r sharing L3 @ci.id * @evtid: Which monitor event to read. * @first: Initialize MBM counter when true. - * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. + * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains. * @err: Error encountered when reading counter. * @val: Returned value of event counter. If @rgrp is a parent resource group, * @val includes the sum of event counts from its child resource groups. @@ -112,7 +112,7 @@ struct rmid_read { struct rdt_mon_domain *d; enum resctrl_event_id evtid; bool first; - struct cacheinfo *ci; + unsigned int ci_id; int err; u64 val; void *arch_mon_ctx; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index bde2801289d3..f5637855c3ac 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -361,6 +361,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) { int cpu = smp_processor_id(); struct rdt_mon_domain *d; + struct cacheinfo *ci; struct mbm_state *m; int err, ret; u64 tval = 0; @@ -388,7 +389,8 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) } /* Summing domains that share a cache, must be on a CPU for that cache. */ - if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) + ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); + if (!ci || ci->id != rr->ci_id) return -EINVAL; /* @@ -400,7 +402,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) */ ret = -EINVAL; list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { - if (d->ci->id != rr->ci->id) + if (d->ci_id != rr->ci_id) continue; err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, rr->evtid, &tval, rr->arch_mon_ctx); diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 1beb124e25f6..77d08229d855 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -3036,7 +3036,7 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, char name[32]; snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id); if (snc_mode) sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); @@ -3061,7 +3061,7 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, return -EPERM; list_for_each_entry(mevt, &r->evt_list, list) { - domid = do_sum ? d->ci->id : d->hdr.id; + domid = do_sum ? d->ci_id : d->hdr.id; priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); if (WARN_ON_ONCE(!priv)) return -EINVAL; @@ -3089,7 +3089,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, lockdep_assert_held(&rdtgroup_mutex); snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id); kn = kernfs_find_and_get(parent_kn, name); if (kn) { /* diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 9ba771f2ddea..6fb4894b8cfd 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -159,7 +159,7 @@ struct rdt_ctrl_domain { /** * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource * @hdr: common header for different domain types - * @ci: cache info for this domain + * @ci_id: cache info id for this domain * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold * @mbm_total: saved state for MBM total bandwidth * @mbm_local: saved state for MBM local bandwidth @@ -170,7 +170,7 @@ struct rdt_ctrl_domain { */ struct rdt_mon_domain { struct rdt_domain_hdr hdr; - struct cacheinfo *ci; + unsigned int ci_id; unsigned long *rmid_busy_llc; struct mbm_state *mbm_total; struct mbm_state *mbm_local; -- cgit v1.2.3 From a85b8544d46390469b6ca72d6bfd3ecb7be985ff Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 14 Jun 2025 00:30:37 +0200 Subject: wifi: remove zero-length arrays All of these are really meant to be variable-length, and in the case of s1g_beacon it's actually accessed. Make that one in particular, and a couple of others (that aren't used as arrays now), actually variable. Reported-by: syzbot+fd222bb38e916df26fa4@syzkaller.appspotmail.com Fixes: 1e1f706fc2ce ("wifi: cfg80211/mac80211: correctly parse S1G beacon optional elements") Link: https://patch.msgid.link/20250614003037.a3e82e882251.I2e8b58e56ff2a9f8b06c66f036578b7c1d4e4685@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index ce377f7fb912..22f39e5e2ff1 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1278,7 +1278,7 @@ struct ieee80211_ext { u8 sa[ETH_ALEN]; __le32 timestamp; u8 change_seq; - u8 variable[0]; + u8 variable[]; } __packed s1g_beacon; } u; } __packed __aligned(2); @@ -1536,7 +1536,7 @@ struct ieee80211_mgmt { u8 action_code; u8 dialog_token; __le16 capability; - u8 variable[0]; + u8 variable[]; } __packed tdls_discover_resp; struct { u8 action_code; @@ -1721,35 +1721,35 @@ struct ieee80211_tdls_data { struct { u8 dialog_token; __le16 capability; - u8 variable[0]; + u8 variable[]; } __packed setup_req; struct { __le16 status_code; u8 dialog_token; __le16 capability; - u8 variable[0]; + u8 variable[]; } __packed setup_resp; struct { __le16 status_code; u8 dialog_token; - u8 variable[0]; + u8 variable[]; } __packed setup_cfm; struct { __le16 reason_code; - u8 variable[0]; + u8 variable[]; } __packed teardown; struct { u8 dialog_token; - u8 variable[0]; + u8 variable[]; } __packed discover_req; struct { u8 target_channel; u8 oper_class; - u8 variable[0]; + u8 variable[]; } __packed chan_switch_req; struct { __le16 status_code; - u8 variable[0]; + u8 variable[]; } __packed chan_switch_resp; } u; } __packed; -- cgit v1.2.3 From fc92099902fbf21000554678a47654b029c15a4d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 Jun 2025 12:36:06 -0300 Subject: tools headers: Synchronize linux/bits.h with the kernel sources To pick up the changes in this cset: 1e7933a575ed8af4 ("uapi: Revert "bitops: avoid integer overflow in GENMASK(_ULL)"") 5b572e8a9f3dcd6e ("bits: introduce fixed-type BIT_U*()") 19408200c094858d ("bits: introduce fixed-type GENMASK_U*()") 31299a5e02112411 ("bits: add comments and newlines to #if, #else and #endif directives") This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/linux/bits.h include/linux/bits.h Please see tools/include/uapi/README for further details. Acked-by: Vincent Mailhol Cc: I Hsin Cheng Cc: Yury Norov Cc: Adrian Hunter Cc: Ian Rogers Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Lucas De Marchi Cc: Namhyung Kim Cc: Yury Norov Link: https://lore.kernel.org/r/aEr0ZJ60EbshEy6p@x1 Signed-off-by: Arnaldo Carvalho de Melo --- include/uapi/linux/bits.h | 4 ++-- tools/include/linux/bits.h | 57 ++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 57 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bits.h b/include/uapi/linux/bits.h index a04afef9efca..682b406e1067 100644 --- a/include/uapi/linux/bits.h +++ b/include/uapi/linux/bits.h @@ -4,9 +4,9 @@ #ifndef _UAPI_LINUX_BITS_H #define _UAPI_LINUX_BITS_H -#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (__BITS_PER_LONG - 1 - (h)))) +#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (BITS_PER_LONG - 1 - (h)))) -#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h)))) +#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h)))) #define __GENMASK_U128(h, l) \ ((_BIT128((h)) << 1) - (_BIT128(l))) diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h index 14fd0ca9a6cd..7ad056219115 100644 --- a/tools/include/linux/bits.h +++ b/tools/include/linux/bits.h @@ -12,6 +12,7 @@ #define BIT_ULL_MASK(nr) (ULL(1) << ((nr) % BITS_PER_LONG_LONG)) #define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG) #define BITS_PER_BYTE 8 +#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) /* * Create a contiguous bitmask starting at bit position @l and ending at @@ -19,16 +20,68 @@ * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. */ #if !defined(__ASSEMBLY__) + +/* + * Missing asm support + * + * GENMASK_U*() and BIT_U*() depend on BITS_PER_TYPE() which relies on sizeof(), + * something not available in asm. Nevertheless, fixed width integers is a C + * concept. Assembly code can rely on the long and long long versions instead. + */ + #include #include +#include + #define GENMASK_INPUT_CHECK(h, l) BUILD_BUG_ON_ZERO(const_true((l) > (h))) -#else + +/* + * Generate a mask for the specified type @t. Additional checks are made to + * guarantee the value returned fits in that type, relying on + * -Wshift-count-overflow compiler check to detect incompatible arguments. + * For example, all these create build errors or warnings: + * + * - GENMASK(15, 20): wrong argument order + * - GENMASK(72, 15): doesn't fit unsigned long + * - GENMASK_U32(33, 15): doesn't fit in a u32 + */ +#define GENMASK_TYPE(t, h, l) \ + ((t)(GENMASK_INPUT_CHECK(h, l) + \ + (type_max(t) << (l) & \ + type_max(t) >> (BITS_PER_TYPE(t) - 1 - (h))))) + +#define GENMASK_U8(h, l) GENMASK_TYPE(u8, h, l) +#define GENMASK_U16(h, l) GENMASK_TYPE(u16, h, l) +#define GENMASK_U32(h, l) GENMASK_TYPE(u32, h, l) +#define GENMASK_U64(h, l) GENMASK_TYPE(u64, h, l) + +/* + * Fixed-type variants of BIT(), with additional checks like GENMASK_TYPE(). The + * following examples generate compiler warnings due to -Wshift-count-overflow: + * + * - BIT_U8(8) + * - BIT_U32(-1) + * - BIT_U32(40) + */ +#define BIT_INPUT_CHECK(type, nr) \ + BUILD_BUG_ON_ZERO(const_true((nr) >= BITS_PER_TYPE(type))) + +#define BIT_TYPE(type, nr) ((type)(BIT_INPUT_CHECK(type, nr) + BIT_ULL(nr))) + +#define BIT_U8(nr) BIT_TYPE(u8, nr) +#define BIT_U16(nr) BIT_TYPE(u16, nr) +#define BIT_U32(nr) BIT_TYPE(u32, nr) +#define BIT_U64(nr) BIT_TYPE(u64, nr) + +#else /* defined(__ASSEMBLY__) */ + /* * BUILD_BUG_ON_ZERO is not available in h files included from asm files, * disable the input check if that is the case. */ #define GENMASK_INPUT_CHECK(h, l) 0 -#endif + +#endif /* !defined(__ASSEMBLY__) */ #define GENMASK(h, l) \ (GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l)) -- cgit v1.2.3 From 7851263998d4269125fd6cb3fdbfc7c6db853859 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 16 Jun 2025 11:21:15 -0700 Subject: atm: Revert atm_account_tx() if copy_from_iter_full() fails. In vcc_sendmsg(), we account skb->truesize to sk->sk_wmem_alloc by atm_account_tx(). It is expected to be reverted by atm_pop_raw() later called by vcc->dev->ops->send(vcc, skb). However, vcc_sendmsg() misses the same revert when copy_from_iter_full() fails, and then we will leak a socket. Let's factorise the revert part as atm_return_tx() and call it in the failure path. Note that the corresponding sk_wmem_alloc operation can be found in alloc_tx() as of the blamed commit. $ git blame -L:alloc_tx net/atm/common.c c55fa3cccbc2c~ Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Simon Horman Closes: https://lore.kernel.org/netdev/20250614161959.GR414686@horms.kernel.org/ Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250616182147.963333-3-kuni1840@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/atmdev.h | 6 ++++++ net/atm/common.c | 1 + net/atm/raw.c | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h index 9b02961d65ee..45f2f278b50a 100644 --- a/include/linux/atmdev.h +++ b/include/linux/atmdev.h @@ -249,6 +249,12 @@ static inline void atm_account_tx(struct atm_vcc *vcc, struct sk_buff *skb) ATM_SKB(skb)->atm_options = vcc->atm_options; } +static inline void atm_return_tx(struct atm_vcc *vcc, struct sk_buff *skb) +{ + WARN_ON_ONCE(refcount_sub_and_test(ATM_SKB(skb)->acct_truesize, + &sk_atm(vcc)->sk_wmem_alloc)); +} + static inline void atm_force_charge(struct atm_vcc *vcc,int truesize) { atomic_add(truesize, &sk_atm(vcc)->sk_rmem_alloc); diff --git a/net/atm/common.c b/net/atm/common.c index 9b75699992ff..d7f7976ea13a 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -635,6 +635,7 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size) skb->dev = NULL; /* for paths shared with net_device interfaces */ if (!copy_from_iter_full(skb_put(skb, size), size, &m->msg_iter)) { + atm_return_tx(vcc, skb); kfree_skb(skb); error = -EFAULT; goto out; diff --git a/net/atm/raw.c b/net/atm/raw.c index 2b5f78a7ec3e..1e6511ec842c 100644 --- a/net/atm/raw.c +++ b/net/atm/raw.c @@ -36,7 +36,7 @@ static void atm_pop_raw(struct atm_vcc *vcc, struct sk_buff *skb) pr_debug("(%d) %d -= %d\n", vcc->vci, sk_wmem_alloc_get(sk), ATM_SKB(skb)->acct_truesize); - WARN_ON(refcount_sub_and_test(ATM_SKB(skb)->acct_truesize, &sk->sk_wmem_alloc)); + atm_return_tx(vcc, skb); dev_kfree_skb_any(skb); sk->sk_write_space(sk); } -- cgit v1.2.3 From 30b58444807c93bffeaba7d776110f2a909d2f9a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 17 Jun 2025 13:40:56 +0800 Subject: erofs: remove unused trace event erofs_destroy_inode The trace event `erofs_destroy_inode` was added but remains unused. This unused event contributes approximately 5KB to the kernel module size. Reported-by: Steven Rostedt Closes: https://lore.kernel.org/r/20250612224906.15000244@batman.local.home Fixes: 13f06f48f7bf ("staging: erofs: support tracepoint") Cc: stable@vger.kernel.org Reviewed-by: Hongbo Li Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20250617054056.3232365-1-hsiangkao@linux.alibaba.com --- include/trace/events/erofs.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include') diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index a5f4b9234f46..dad7360f42f9 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -211,24 +211,6 @@ TRACE_EVENT(erofs_map_blocks_exit, show_mflags(__entry->mflags), __entry->ret) ); -TRACE_EVENT(erofs_destroy_inode, - TP_PROTO(struct inode *inode), - - TP_ARGS(inode), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( erofs_nid_t, nid ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->nid = EROFS_I(inode)->nid; - ), - - TP_printk("dev = (%d,%d), nid = %llu", show_dev_nid(__entry)) -); - #endif /* _TRACE_EROFS_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 635e118317ffa773f6d25ec6a71b7927d7e8886a Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 18 Jun 2025 10:54:44 +0200 Subject: Revert "mtd: core: always create master device" The idea behind this patch was to always let a "master" mtd device available to anchor runtime PM. Historically, there was no mtd device representing the whole storage as soon as partitions were coming into play. The introduction of CONFIG_MTD_PARTITIONED_MASTER allowed to keep this "master" device, but was not enabled by default to avoid breaking existing users (otherwise the mtd device numbering would be totally messed up with an off by 1, at least). The approach of adding an mtd_master class on top of partitioned mtd devices is breaking the mtd core in many creative ways, so better think again this approach and revert the faulty changes for now. This reverts commit 0aa7b390fc40a871267a2328bbbefca8b37ad307. Fixes: 0aa7b390fc40 ("mtd: core: always create master device") Tested-by: Guenter Roeck Acked-by: Guenter Roeck Signed-off-by: Miquel Raynal --- drivers/mtd/mtdchar.c | 2 +- drivers/mtd/mtdcore.c | 152 +++++++++++------------------------------ drivers/mtd/mtdcore.h | 2 +- drivers/mtd/mtdpart.c | 16 ++--- include/linux/mtd/partitions.h | 2 +- 5 files changed, 51 insertions(+), 123 deletions(-) (limited to 'include') diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 391d81ad960c..8dc4f5c493fc 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -559,7 +559,7 @@ static int mtdchar_blkpg_ioctl(struct mtd_info *mtd, /* Sanitize user input */ p.devname[BLKPG_DEVNAMELTH - 1] = '\0'; - return mtd_add_partition(mtd, p.devname, p.start, p.length, NULL); + return mtd_add_partition(mtd, p.devname, p.start, p.length); case BLKPG_DEL_PARTITION: diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 429d8c16baf0..5ba9a741f5ac 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -68,13 +68,7 @@ static struct class mtd_class = { .pm = MTD_CLS_PM_OPS, }; -static struct class mtd_master_class = { - .name = "mtd_master", - .pm = MTD_CLS_PM_OPS, -}; - static DEFINE_IDR(mtd_idr); -static DEFINE_IDR(mtd_master_idr); /* These are exported solely for the purpose of mtd_blkdevs.c. You should not use them for _anything_ else */ @@ -89,9 +83,8 @@ EXPORT_SYMBOL_GPL(__mtd_next_device); static LIST_HEAD(mtd_notifiers); -#define MTD_MASTER_DEVS 255 + #define MTD_DEVT(index) MKDEV(MTD_CHAR_MAJOR, (index)*2) -static dev_t mtd_master_devt; /* REVISIT once MTD uses the driver model better, whoever allocates * the mtd_info will probably want to use the release() hook... @@ -111,17 +104,6 @@ static void mtd_release(struct device *dev) device_destroy(&mtd_class, index + 1); } -static void mtd_master_release(struct device *dev) -{ - struct mtd_info *mtd = dev_get_drvdata(dev); - - idr_remove(&mtd_master_idr, mtd->index); - of_node_put(mtd_get_of_node(mtd)); - - if (mtd_is_partition(mtd)) - release_mtd_partition(mtd); -} - static void mtd_device_release(struct kref *kref) { struct mtd_info *mtd = container_of(kref, struct mtd_info, refcnt); @@ -385,11 +367,6 @@ static const struct device_type mtd_devtype = { .release = mtd_release, }; -static const struct device_type mtd_master_devtype = { - .name = "mtd_master", - .release = mtd_master_release, -}; - static bool mtd_expert_analysis_mode; #ifdef CONFIG_DEBUG_FS @@ -657,13 +634,13 @@ exit_parent: /** * add_mtd_device - register an MTD device * @mtd: pointer to new MTD device info structure - * @partitioned: create partitioned device * * Add a device to the list of MTD devices present in the system, and * notify each currently active MTD 'user' of its arrival. Returns * zero on success or non-zero on failure. */ -int add_mtd_device(struct mtd_info *mtd, bool partitioned) + +int add_mtd_device(struct mtd_info *mtd) { struct device_node *np = mtd_get_of_node(mtd); struct mtd_info *master = mtd_get_master(mtd); @@ -710,17 +687,10 @@ int add_mtd_device(struct mtd_info *mtd, bool partitioned) ofidx = -1; if (np) ofidx = of_alias_get_id(np, "mtd"); - if (partitioned) { - if (ofidx >= 0) - i = idr_alloc(&mtd_idr, mtd, ofidx, ofidx + 1, GFP_KERNEL); - else - i = idr_alloc(&mtd_idr, mtd, 0, 0, GFP_KERNEL); - } else { - if (ofidx >= 0) - i = idr_alloc(&mtd_master_idr, mtd, ofidx, ofidx + 1, GFP_KERNEL); - else - i = idr_alloc(&mtd_master_idr, mtd, 0, 0, GFP_KERNEL); - } + if (ofidx >= 0) + i = idr_alloc(&mtd_idr, mtd, ofidx, ofidx + 1, GFP_KERNEL); + else + i = idr_alloc(&mtd_idr, mtd, 0, 0, GFP_KERNEL); if (i < 0) { error = i; goto fail_locked; @@ -768,18 +738,10 @@ int add_mtd_device(struct mtd_info *mtd, bool partitioned) /* Caller should have set dev.parent to match the * physical device, if appropriate. */ - if (partitioned) { - mtd->dev.type = &mtd_devtype; - mtd->dev.class = &mtd_class; - mtd->dev.devt = MTD_DEVT(i); - dev_set_name(&mtd->dev, "mtd%d", i); - error = dev_set_name(&mtd->dev, "mtd%d", i); - } else { - mtd->dev.type = &mtd_master_devtype; - mtd->dev.class = &mtd_master_class; - mtd->dev.devt = MKDEV(MAJOR(mtd_master_devt), i); - error = dev_set_name(&mtd->dev, "mtd_master%d", i); - } + mtd->dev.type = &mtd_devtype; + mtd->dev.class = &mtd_class; + mtd->dev.devt = MTD_DEVT(i); + error = dev_set_name(&mtd->dev, "mtd%d", i); if (error) goto fail_devname; dev_set_drvdata(&mtd->dev, mtd); @@ -787,7 +749,6 @@ int add_mtd_device(struct mtd_info *mtd, bool partitioned) of_node_get(mtd_get_of_node(mtd)); error = device_register(&mtd->dev); if (error) { - pr_err("mtd: %s device_register fail %d\n", mtd->name, error); put_device(&mtd->dev); goto fail_added; } @@ -799,13 +760,10 @@ int add_mtd_device(struct mtd_info *mtd, bool partitioned) mtd_debugfs_populate(mtd); - if (partitioned) { - device_create(&mtd_class, mtd->dev.parent, MTD_DEVT(i) + 1, NULL, - "mtd%dro", i); - } + device_create(&mtd_class, mtd->dev.parent, MTD_DEVT(i) + 1, NULL, + "mtd%dro", i); - pr_debug("mtd: Giving out %spartitioned device %d to %s\n", - partitioned ? "" : "un-", i, mtd->name); + pr_debug("mtd: Giving out device %d to %s\n", i, mtd->name); /* No need to get a refcount on the module containing the notifier, since we hold the mtd_table_mutex */ list_for_each_entry(not, &mtd_notifiers, list) @@ -813,16 +771,13 @@ int add_mtd_device(struct mtd_info *mtd, bool partitioned) mutex_unlock(&mtd_table_mutex); - if (partitioned) { - if (of_property_read_bool(mtd_get_of_node(mtd), "linux,rootfs")) { - if (IS_BUILTIN(CONFIG_MTD)) { - pr_info("mtd: setting mtd%d (%s) as root device\n", - mtd->index, mtd->name); - ROOT_DEV = MKDEV(MTD_BLOCK_MAJOR, mtd->index); - } else { - pr_warn("mtd: can't set mtd%d (%s) as root device - mtd must be builtin\n", - mtd->index, mtd->name); - } + if (of_property_read_bool(mtd_get_of_node(mtd), "linux,rootfs")) { + if (IS_BUILTIN(CONFIG_MTD)) { + pr_info("mtd: setting mtd%d (%s) as root device\n", mtd->index, mtd->name); + ROOT_DEV = MKDEV(MTD_BLOCK_MAJOR, mtd->index); + } else { + pr_warn("mtd: can't set mtd%d (%s) as root device - mtd must be builtin\n", + mtd->index, mtd->name); } } @@ -838,10 +793,7 @@ fail_nvmem_add: fail_added: of_node_put(mtd_get_of_node(mtd)); fail_devname: - if (partitioned) - idr_remove(&mtd_idr, i); - else - idr_remove(&mtd_master_idr, i); + idr_remove(&mtd_idr, i); fail_locked: mutex_unlock(&mtd_table_mutex); return error; @@ -859,14 +811,12 @@ fail_locked: int del_mtd_device(struct mtd_info *mtd) { - struct mtd_notifier *not; - struct idr *idr; int ret; + struct mtd_notifier *not; mutex_lock(&mtd_table_mutex); - idr = mtd->dev.class == &mtd_class ? &mtd_idr : &mtd_master_idr; - if (idr_find(idr, mtd->index) != mtd) { + if (idr_find(&mtd_idr, mtd->index) != mtd) { ret = -ENODEV; goto out_error; } @@ -1106,7 +1056,6 @@ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types, const struct mtd_partition *parts, int nr_parts) { - struct mtd_info *parent; int ret, err; mtd_set_dev_defaults(mtd); @@ -1115,30 +1064,25 @@ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types, if (ret) goto out; - ret = add_mtd_device(mtd, false); - if (ret) - goto out; - if (IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER)) { - ret = mtd_add_partition(mtd, mtd->name, 0, MTDPART_SIZ_FULL, &parent); + ret = add_mtd_device(mtd); if (ret) goto out; - - } else { - parent = mtd; } /* Prefer parsed partitions over driver-provided fallback */ - ret = parse_mtd_partitions(parent, types, parser_data); + ret = parse_mtd_partitions(mtd, types, parser_data); if (ret == -EPROBE_DEFER) goto out; if (ret > 0) ret = 0; else if (nr_parts) - ret = add_mtd_partitions(parent, parts, nr_parts); - else if (!IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER)) - ret = mtd_add_partition(parent, mtd->name, 0, MTDPART_SIZ_FULL, NULL); + ret = add_mtd_partitions(mtd, parts, nr_parts); + else if (!device_is_registered(&mtd->dev)) + ret = add_mtd_device(mtd); + else + ret = 0; if (ret) goto out; @@ -1158,14 +1102,13 @@ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types, register_reboot_notifier(&mtd->reboot_notifier); } - return 0; out: - nvmem_unregister(mtd->otp_user_nvmem); - nvmem_unregister(mtd->otp_factory_nvmem); - - del_mtd_partitions(mtd); + if (ret) { + nvmem_unregister(mtd->otp_user_nvmem); + nvmem_unregister(mtd->otp_factory_nvmem); + } - if (device_is_registered(&mtd->dev)) { + if (ret && device_is_registered(&mtd->dev)) { err = del_mtd_device(mtd); if (err) pr_err("Error when deleting MTD device (%d)\n", err); @@ -1324,7 +1267,8 @@ int __get_mtd_device(struct mtd_info *mtd) mtd = mtd->parent; } - kref_get(&master->refcnt); + if (IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER)) + kref_get(&master->refcnt); return 0; } @@ -1418,7 +1362,8 @@ void __put_mtd_device(struct mtd_info *mtd) mtd = parent; } - kref_put(&master->refcnt, mtd_device_release); + if (IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER)) + kref_put(&master->refcnt, mtd_device_release); module_put(master->owner); @@ -2585,16 +2530,6 @@ static int __init init_mtd(void) if (ret) goto err_reg; - ret = class_register(&mtd_master_class); - if (ret) - goto err_reg2; - - ret = alloc_chrdev_region(&mtd_master_devt, 0, MTD_MASTER_DEVS, "mtd_master"); - if (ret < 0) { - pr_err("unable to allocate char dev region\n"); - goto err_chrdev; - } - mtd_bdi = mtd_bdi_init("mtd"); if (IS_ERR(mtd_bdi)) { ret = PTR_ERR(mtd_bdi); @@ -2619,10 +2554,6 @@ out_procfs: bdi_unregister(mtd_bdi); bdi_put(mtd_bdi); err_bdi: - unregister_chrdev_region(mtd_master_devt, MTD_MASTER_DEVS); -err_chrdev: - class_unregister(&mtd_master_class); -err_reg2: class_unregister(&mtd_class); err_reg: pr_err("Error registering mtd class or bdi: %d\n", ret); @@ -2636,12 +2567,9 @@ static void __exit cleanup_mtd(void) if (proc_mtd) remove_proc_entry("mtd", NULL); class_unregister(&mtd_class); - class_unregister(&mtd_master_class); - unregister_chrdev_region(mtd_master_devt, MTD_MASTER_DEVS); bdi_unregister(mtd_bdi); bdi_put(mtd_bdi); idr_destroy(&mtd_idr); - idr_destroy(&mtd_master_idr); } module_init(init_mtd); diff --git a/drivers/mtd/mtdcore.h b/drivers/mtd/mtdcore.h index 2258d31c5aa6..b014861a06a6 100644 --- a/drivers/mtd/mtdcore.h +++ b/drivers/mtd/mtdcore.h @@ -8,7 +8,7 @@ extern struct mutex mtd_table_mutex; extern struct backing_dev_info *mtd_bdi; struct mtd_info *__mtd_next_device(int i); -int __must_check add_mtd_device(struct mtd_info *mtd, bool partitioned); +int __must_check add_mtd_device(struct mtd_info *mtd); int del_mtd_device(struct mtd_info *mtd); int add_mtd_partitions(struct mtd_info *, const struct mtd_partition *, int); int del_mtd_partitions(struct mtd_info *); diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 5a3db36d734e..994e8c51e674 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -86,7 +86,8 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent, * parent conditional on that option. Note, this is a way to * distinguish between the parent and its partitions in sysfs. */ - child->dev.parent = &parent->dev; + child->dev.parent = IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER) || mtd_is_partition(parent) ? + &parent->dev : parent->dev.parent; child->dev.of_node = part->of_node; child->parent = parent; child->part.offset = part->offset; @@ -242,7 +243,7 @@ static int mtd_add_partition_attrs(struct mtd_info *new) } int mtd_add_partition(struct mtd_info *parent, const char *name, - long long offset, long long length, struct mtd_info **out) + long long offset, long long length) { struct mtd_info *master = mtd_get_master(parent); u64 parent_size = mtd_is_partition(parent) ? @@ -275,15 +276,12 @@ int mtd_add_partition(struct mtd_info *parent, const char *name, list_add_tail(&child->part.node, &parent->partitions); mutex_unlock(&master->master.partitions_lock); - ret = add_mtd_device(child, true); + ret = add_mtd_device(child); if (ret) goto err_remove_part; mtd_add_partition_attrs(child); - if (out) - *out = child; - return 0; err_remove_part: @@ -415,7 +413,7 @@ int add_mtd_partitions(struct mtd_info *parent, list_add_tail(&child->part.node, &parent->partitions); mutex_unlock(&master->master.partitions_lock); - ret = add_mtd_device(child, true); + ret = add_mtd_device(child); if (ret) { mutex_lock(&master->master.partitions_lock); list_del(&child->part.node); @@ -592,6 +590,9 @@ static int mtd_part_of_parse(struct mtd_info *master, int ret, err = 0; dev = &master->dev; + /* Use parent device (controller) if the top level MTD is not registered */ + if (!IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER) && !mtd_is_partition(master)) + dev = master->dev.parent; np = mtd_get_of_node(master); if (mtd_is_partition(master)) @@ -710,7 +711,6 @@ int parse_mtd_partitions(struct mtd_info *master, const char *const *types, if (ret < 0 && !err) err = ret; } - return err; } diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h index 5daf80df9e89..b74a539ec581 100644 --- a/include/linux/mtd/partitions.h +++ b/include/linux/mtd/partitions.h @@ -108,7 +108,7 @@ extern void deregister_mtd_parser(struct mtd_part_parser *parser); deregister_mtd_parser) int mtd_add_partition(struct mtd_info *master, const char *name, - long long offset, long long length, struct mtd_info **part); + long long offset, long long length); int mtd_del_partition(struct mtd_info *master, int partno); uint64_t mtd_get_device_size(const struct mtd_info *mtd); -- cgit v1.2.3 From dba90f5a79c13936de4273a19e67908a0c296afe Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 18 Jun 2025 10:48:00 +0200 Subject: mtd: spinand: winbond: Prevent unsupported frequencies on dual/quad I/O variants Dual and quad capable chips natively support dual and quad I/O variants at up to 104MHz (1-2-2 and 1-4-4 operations). Reaching the maximum speed of 166MHz is theoretically possible (while still unsupported in the field) by adding a few more dummy cycles. Let's be accurate and clearly state this limit. Setting a maximum frequency implies adding the frequency parameter to the macro, which is done using a variadic argument to avoid impacting all the other drivers which already make use of this macro. Fixes: 1ea808b4d15b ("mtd: spinand: winbond: Update the *JW chip definitions") Signed-off-by: Miquel Raynal --- drivers/mtd/nand/spi/winbond.c | 4 ++-- include/linux/mtd/spinand.h | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/mtd/nand/spi/winbond.c b/drivers/mtd/nand/spi/winbond.c index 0756966b4e3c..b7a28f001a38 100644 --- a/drivers/mtd/nand/spi/winbond.c +++ b/drivers/mtd/nand/spi/winbond.c @@ -42,11 +42,11 @@ static SPINAND_OP_VARIANTS(update_cache_octal_variants, static SPINAND_OP_VARIANTS(read_cache_dual_quad_dtr_variants, SPINAND_PAGE_READ_FROM_CACHE_1S_4D_4D_OP(0, 8, NULL, 0, 80 * HZ_PER_MHZ), SPINAND_PAGE_READ_FROM_CACHE_1S_1D_4D_OP(0, 2, NULL, 0, 80 * HZ_PER_MHZ), - SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 2, NULL, 0), + SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 2, NULL, 0, 104 * HZ_PER_MHZ), SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0), SPINAND_PAGE_READ_FROM_CACHE_1S_2D_2D_OP(0, 4, NULL, 0, 80 * HZ_PER_MHZ), SPINAND_PAGE_READ_FROM_CACHE_1S_1D_2D_OP(0, 2, NULL, 0, 80 * HZ_PER_MHZ), - SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0), + SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0, 104 * HZ_PER_MHZ), SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0), SPINAND_PAGE_READ_FROM_CACHE_1S_1D_1D_OP(0, 2, NULL, 0, 80 * HZ_PER_MHZ), SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0), diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 811a0f356315..15eaa09da998 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -113,11 +113,12 @@ SPI_MEM_DTR_OP_DATA_IN(len, buf, 2), \ SPI_MEM_OP_MAX_FREQ(freq)) -#define SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(addr, ndummy, buf, len) \ +#define SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(addr, ndummy, buf, len, ...) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1), \ SPI_MEM_OP_ADDR(2, addr, 2), \ SPI_MEM_OP_DUMMY(ndummy, 2), \ - SPI_MEM_OP_DATA_IN(len, buf, 2)) + SPI_MEM_OP_DATA_IN(len, buf, 2), \ + SPI_MEM_OP_MAX_FREQ(__VA_ARGS__ + 0)) #define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_2S_2S_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1), \ @@ -151,11 +152,12 @@ SPI_MEM_DTR_OP_DATA_IN(len, buf, 4), \ SPI_MEM_OP_MAX_FREQ(freq)) -#define SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(addr, ndummy, buf, len) \ +#define SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(addr, ndummy, buf, len, ...) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0xeb, 1), \ SPI_MEM_OP_ADDR(2, addr, 4), \ SPI_MEM_OP_DUMMY(ndummy, 4), \ - SPI_MEM_OP_DATA_IN(len, buf, 4)) + SPI_MEM_OP_DATA_IN(len, buf, 4), \ + SPI_MEM_OP_MAX_FREQ(__VA_ARGS__ + 0)) #define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_4S_4S_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0xeb, 1), \ -- cgit v1.2.3 From c6d732c38f93c4aebd204a5656583142289c3a2e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Jun 2025 13:22:40 -0700 Subject: net: ethtool: remove duplicate defines for family info Commit under fixes switched to uAPI generation from the YAML spec. A number of custom defines were left behind, mostly for commands very hard to express in YAML spec. Among what was left behind was the name and version of the generic netlink family. Problem is that the codegen always outputs those values so we ended up with a duplicated, differently named set of defines. Provide naming info in YAML and remove the incorrect defines. Fixes: 8d0580c6ebdd ("ethtool: regenerate uapi header from the spec") Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250617202240.811179-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 3 +++ include/uapi/linux/ethtool_netlink.h | 4 ---- include/uapi/linux/ethtool_netlink_generated.h | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 9f98715a6512..72a076b0e1b5 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -7,6 +7,9 @@ protocol: genetlink-legacy doc: Partial family for Ethtool Netlink. uapi-header: linux/ethtool_netlink_generated.h +c-family-name: ethtool-genl-name +c-version-name: ethtool-genl-version + definitions: - name: udp-tunnel-type diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 9ff72cfb2e98..09a75bdb6560 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -208,10 +208,6 @@ enum { ETHTOOL_A_STATS_PHY_MAX = (__ETHTOOL_A_STATS_PHY_CNT - 1) }; -/* generic netlink info */ -#define ETHTOOL_GENL_NAME "ethtool" -#define ETHTOOL_GENL_VERSION 1 - #define ETHTOOL_MCGRP_MONITOR_NAME "monitor" #endif /* _UAPI_LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 9a02f579de22..aa8ab5227c1e 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -6,8 +6,8 @@ #ifndef _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H #define _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H -#define ETHTOOL_FAMILY_NAME "ethtool" -#define ETHTOOL_FAMILY_VERSION 1 +#define ETHTOOL_GENL_NAME "ethtool" +#define ETHTOOL_GENL_VERSION 1 enum { ETHTOOL_UDP_TUNNEL_TYPE_VXLAN, -- cgit v1.2.3 From cf207eac06f661fb692f405d5ab8230df884ee52 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Tue, 10 Jun 2025 10:14:20 +0800 Subject: KVM: TDX: Handle TDG.VP.VMCALL Handle TDVMCALL for GetQuote to generate a TD-Quote. GetQuote is a doorbell-like interface used by TDX guests to request VMM to generate a TD-Quote signed by a service hosting TD-Quoting Enclave operating on the host. A TDX guest passes a TD Report (TDREPORT_STRUCT) in a shared-memory area as parameter. Host VMM can access it and queue the operation for a service hosting TD-Quoting enclave. When completed, the Quote is returned via the same shared-memory area. KVM only checks the GPA from the TDX guest has the shared-bit set and drops the shared-bit before exiting to userspace to avoid bleeding the shared-bit into KVM's exit ABI. KVM forwards the request to userspace VMM (e.g. QEMU) and userspace VMM queues the operation asynchronously. KVM sets the return code according to the 'ret' field set by userspace to notify the TDX guest whether the request has been queued successfully or not. When the request has been queued successfully, the TDX guest can poll the status field in the shared-memory area to check whether the Quote generation is completed or not. When completed, the generated Quote is returned via the same buffer. Add KVM_EXIT_TDX as a new exit reason to userspace. Userspace is required to handle the KVM exit reason as the initial support for TDX, by reentering KVM to ensure that the TDVMCALL is complete. While at it, add a note that KVM_EXIT_HYPERCALL also requires reentry with KVM_RUN. Signed-off-by: Binbin Wu Tested-by: Mikko Ylinen Acked-by: Kai Huang [Adjust userspace API. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 49 +++++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/vmx/tdx.c | 32 +++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 17 +++++++++++++++ 3 files changed, 97 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 1bd2d42e6424..115ec3c2b641 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6645,7 +6645,8 @@ to the byte array. .. note:: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, - KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding + KVM_EXIT_EPR, KVM_EXIT_HYPERCALL, KVM_EXIT_TDX, + KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding operations are complete (and guest state is consistent) only after userspace has re-entered the kernel with KVM_RUN. The kernel side will first finish incomplete operations and then check for pending signals. @@ -7174,6 +7175,52 @@ The valid value for 'flags' is: - KVM_NOTIFY_CONTEXT_INVALID -- the VM context is corrupted and not valid in VMCS. It would run into unknown result if resume the target VM. +:: + + /* KVM_EXIT_TDX */ + struct { + __u64 flags; + __u64 nr; + union { + struct { + u64 ret; + u64 data[5]; + } unknown; + struct { + u64 ret; + u64 gpa; + u64 size; + } get_quote; + }; + } tdx; + +Process a TDVMCALL from the guest. KVM forwards select TDVMCALL based +on the Guest-Hypervisor Communication Interface (GHCI) specification; +KVM bridges these requests to the userspace VMM with minimal changes, +placing the inputs in the union and copying them back to the guest +on re-entry. + +Flags are currently always zero, whereas ``nr`` contains the TDVMCALL +number from register R11. The remaining field of the union provide the +inputs and outputs of the TDVMCALL. Currently the following values of +``nr`` are defined: + +* ``TDVMCALL_GET_QUOTE``: the guest has requested to generate a TD-Quote +signed by a service hosting TD-Quoting Enclave operating on the host. +Parameters and return value are in the ``get_quote`` field of the union. +The ``gpa`` field and ``size`` specify the guest physical address +(without the shared bit set) and the size of a shared-memory buffer, in +which the TDX guest passes a TD Report. The ``ret`` field represents +the return value of the GetQuote request. When the request has been +queued successfully, the TDX guest can poll the status field in the +shared-memory area to check whether the Quote generation is completed or +not. When completed, the generated Quote is returned via the same buffer. + +KVM may add support for more values in the future that may cause a userspace +exit, even without calls to ``KVM_ENABLE_CAP`` or similar. In this case, +it will enter with output fields already valid; in the common case, the +``unknown.ret`` field of the union will be ``TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED``. +Userspace need not do anything if it does not wish to support a TDVMCALL. :: /* Fix the size of the union. */ diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 5d100c240ab3..b619a3478983 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1465,6 +1465,36 @@ static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) return 1; } +static int tdx_complete_simple(struct kvm_vcpu *vcpu) +{ + tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret); + return 1; +} + +static int tdx_get_quote(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 gpa = tdx->vp_enter_args.r12; + u64 size = tdx->vp_enter_args.r13; + + /* The gpa of buffer must have shared bit set. */ + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } + + vcpu->run->exit_reason = KVM_EXIT_TDX; + vcpu->run->tdx.flags = 0; + vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE; + vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; + vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); + vcpu->run->tdx.get_quote.size = size; + + vcpu->arch.complete_userspace_io = tdx_complete_simple; + + return 0; +} + static int handle_tdvmcall(struct kvm_vcpu *vcpu) { switch (tdvmcall_leaf(vcpu)) { @@ -1474,6 +1504,8 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu) return tdx_report_fatal_error(vcpu); case TDVMCALL_GET_TD_VM_CALL_INFO: return tdx_get_td_vm_call_info(vcpu); + case TDVMCALL_GET_QUOTE: + return tdx_get_quote(vcpu); default: break; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d00b85cb168c..e23e7286ad1a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -178,6 +178,7 @@ struct kvm_xen_exit { #define KVM_EXIT_NOTIFY 37 #define KVM_EXIT_LOONGARCH_IOCSR 38 #define KVM_EXIT_MEMORY_FAULT 39 +#define KVM_EXIT_TDX 40 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -447,6 +448,22 @@ struct kvm_run { __u64 gpa; __u64 size; } memory_fault; + /* KVM_EXIT_TDX */ + struct { + __u64 flags; + __u64 nr; + union { + struct { + __u64 ret; + __u64 data[5]; + } unknown; + struct { + __u64 ret; + __u64 gpa; + __u64 size; + } get_quote; + }; + } tdx; /* Fix the size of the union. */ char padding[256]; }; -- cgit v1.2.3 From 25e8b1dd4883e6c251c3db5b347f3c8ae4ade921 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Tue, 10 Jun 2025 10:14:21 +0800 Subject: KVM: TDX: Exit to userspace for GetTdVmCallInfo Exit to userspace for TDG.VP.VMCALL via KVM_EXIT_TDX, to allow userspace to provide information about the support of TDVMCALLs when r12 is 1 for the TDVMCALLs beyond the GHCI base API. GHCI spec defines the GHCI base TDVMCALLs: , , , , <#VE.RequestMMIO>, , , and . They must be supported by VMM to support TDX guests. For GetTdVmCallInfo - When leaf (r12) to enumerate TDVMCALL functionality is set to 0, successful execution indicates all GHCI base TDVMCALLs listed above are supported. Update the KVM TDX document with the set of the GHCI base APIs. - When leaf (r12) to enumerate TDVMCALL functionality is set to 1, it indicates the TDX guest is querying the supported TDVMCALLs beyond the GHCI base TDVMCALLs. Exit to userspace to let userspace set the TDVMCALL sub-function bit(s) accordingly to the leaf outputs. KVM could set the TDVMCALL bit(s) supported by itself when the TDVMCALLs don't need support from userspace after returning from userspace and before entering guest. Currently, no such TDVMCALLs implemented, KVM just sets the values returned from userspace. Suggested-by: Paolo Bonzini Signed-off-by: Binbin Wu [Adjust userspace API. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 10 ++++++++++ arch/x86/kvm/vmx/tdx.c | 43 ++++++++++++++++++++++++++++++++++++++---- include/uapi/linux/kvm.h | 5 +++++ 3 files changed, 54 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 115ec3c2b641..9abf93ee5f65 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7191,6 +7191,11 @@ The valid value for 'flags' is: u64 gpa; u64 size; } get_quote; + struct { + u64 ret; + u64 leaf; + u64 r11, r12, r13, r14; + } get_tdvmcall_info; }; } tdx; @@ -7216,6 +7221,11 @@ queued successfully, the TDX guest can poll the status field in the shared-memory area to check whether the Quote generation is completed or not. When completed, the generated Quote is returned via the same buffer. +* ``TDVMCALL_GET_TD_VM_CALL_INFO``: the guest has requested the support +status of TDVMCALLs. The output values for the given leaf should be +placed in fields from ``r11`` to ``r14`` of the ``get_tdvmcall_info`` +field of the union. + KVM may add support for more values in the future that may cause a userspace exit, even without calls to ``KVM_ENABLE_CAP`` or similar. In this case, it will enter with output fields already valid; in the common case, the diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index b619a3478983..1ad20c273f3b 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1451,18 +1451,53 @@ error: return 1; } +static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret); + + /* + * For now, there is no TDVMCALL beyond GHCI base API supported by KVM + * directly without the support from userspace, just set the value + * returned from userspace. + */ + tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11; + tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12; + tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13; + tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14; + + return 1; +} + static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) { struct vcpu_tdx *tdx = to_tdx(vcpu); - if (tdx->vp_enter_args.r12) - tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); - else { + switch (tdx->vp_enter_args.r12) { + case 0: tdx->vp_enter_args.r11 = 0; + tdx->vp_enter_args.r12 = 0; tdx->vp_enter_args.r13 = 0; tdx->vp_enter_args.r14 = 0; + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS); + return 1; + case 1: + vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12; + vcpu->run->exit_reason = KVM_EXIT_TDX; + vcpu->run->tdx.flags = 0; + vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO; + vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS; + vcpu->run->tdx.get_tdvmcall_info.r11 = 0; + vcpu->run->tdx.get_tdvmcall_info.r12 = 0; + vcpu->run->tdx.get_tdvmcall_info.r13 = 0; + vcpu->run->tdx.get_tdvmcall_info.r14 = 0; + vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info; + return 0; + default: + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; } - return 1; } static int tdx_complete_simple(struct kvm_vcpu *vcpu) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index e23e7286ad1a..37891580d05d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -462,6 +462,11 @@ struct kvm_run { __u64 gpa; __u64 size; } get_quote; + struct { + __u64 ret; + __u64 leaf; + __u64 r11, r12, r13, r14; + } get_tdvmcall_info; }; } tdx; /* Fix the size of the union. */ -- cgit v1.2.3 From 64f7548aad63d2fbca2eeb6eb33361c218ebd5a5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Jun 2025 21:19:40 +0200 Subject: lib/crypto: sha256: Mark sha256_choose_blocks as __always_inline When the compiler chooses to not inline sha256_choose_blocks() in the purgatory code, it fails to link against the missing CPU specific version: x86_64-linux-ld: arch/x86/purgatory/purgatory.ro: in function `sha256_choose_blocks.part.0': sha256.c:(.text+0x6a6): undefined reference to `irq_fpu_usable' sha256.c:(.text+0x6c7): undefined reference to `sha256_blocks_arch' sha256.c:(.text+0x6cc): undefined reference to `sha256_blocks_simd' Mark this function as __always_inline to prevent this, same as sha256_finup(). Fixes: 5b90a779bc54 ("crypto: lib/sha256 - Add helpers for block-based shash") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20250620191952.1867578-1-arnd@kernel.org Signed-off-by: Eric Biggers --- include/crypto/internal/sha2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/crypto/internal/sha2.h b/include/crypto/internal/sha2.h index b9bccd3ff57f..21a27fd5e198 100644 --- a/include/crypto/internal/sha2.h +++ b/include/crypto/internal/sha2.h @@ -25,7 +25,7 @@ void sha256_blocks_arch(u32 state[SHA256_STATE_WORDS], void sha256_blocks_simd(u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks); -static inline void sha256_choose_blocks( +static __always_inline void sha256_choose_blocks( u32 state[SHA256_STATE_WORDS], const u8 *data, size_t nblocks, bool force_generic, bool force_simd) { -- cgit v1.2.3 From b872f562c8cef59743993b48eb458c2d87c1651e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 23 Jun 2025 19:11:50 +0800 Subject: dm-crypt: Extend state buffer size in crypt_iv_lmk_one Add a macro CRYPTO_MD5_STATESIZE for the Crypto API export state size of md5 and use that in dm-crypt instead of relying on the size of struct md5_state (the latter is currently undergoing a transition and may shrink). This commit fixes a crash on 32-bit machines: Oops: Oops: 0000 [#1] SMP CPU: 1 UID: 0 PID: 12 Comm: kworker/u16:0 Not tainted 6.16.0-rc2+ #993 PREEMPT(full) Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 11/12/2020 Workqueue: kcryptd-254:0-1 kcryptd_crypt [dm_crypt] EIP: __crypto_shash_export+0xf/0x90 Code: 4a c1 c7 40 20 a0 b4 4a c1 81 cf 0e 00 04 08 89 78 50 e9 2b ff ff ff 8d 74 26 00 55 89 e5 57 56 53 89 c3 89 d6 8b 00 8b 40 14 <8b> 50 fc f6 40 13 01 74 04 4a 2b 50 14 85 c9 74 10 89 f2 89 d8 ff EAX: 303a3435 EBX: c3007c90 ECX: 00000000 EDX: c3007c38 ESI: c3007c38 EDI: c3007c90 EBP: c3007bfc ESP: c3007bf0 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 EFLAGS: 00010216 CR0: 80050033 CR2: 303a3431 CR3: 04fbe000 CR4: 00350e90 Call Trace: crypto_shash_export+0x65/0xc0 crypt_iv_lmk_one+0x106/0x1a0 [dm_crypt] Fixes: efd62c85525e ("crypto: md5-generic - Use API partial block handling") Reported-by: Milan Broz Signed-off-by: Herbert Xu Tested-by: Milan Broz Closes: https://lore.kernel.org/linux-crypto/f1625ddc-e82e-4b77-80c2-dc8e45b54848@gmail.com/T/ Signed-off-by: Mikulas Patocka --- drivers/md/dm-crypt.c | 11 +++++++---- include/crypto/hash.h | 2 ++ include/crypto/md5.h | 4 ++++ 3 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 9dfdb63220d7..17157c4216a5 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -517,7 +517,10 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, { struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; SHASH_DESC_ON_STACK(desc, lmk->hash_tfm); - struct md5_state md5state; + union { + struct md5_state md5state; + u8 state[CRYPTO_MD5_STATESIZE]; + } u; __le32 buf[4]; int i, r; @@ -548,13 +551,13 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, return r; /* No MD5 padding here */ - r = crypto_shash_export(desc, &md5state); + r = crypto_shash_export(desc, &u.md5state); if (r) return r; for (i = 0; i < MD5_HASH_WORDS; i++) - __cpu_to_le32s(&md5state.hash[i]); - memcpy(iv, &md5state.hash, cc->iv_size); + __cpu_to_le32s(&u.md5state.hash[i]); + memcpy(iv, &u.md5state.hash, cc->iv_size); return 0; } diff --git a/include/crypto/hash.h b/include/crypto/hash.h index 6f6b9de12cd3..db294d452e8c 100644 --- a/include/crypto/hash.h +++ b/include/crypto/hash.h @@ -202,6 +202,8 @@ struct shash_desc { #define HASH_REQUEST_CLONE(name, gfp) \ hash_request_clone(name, sizeof(__##name##_req), gfp) +#define CRYPTO_HASH_STATESIZE(coresize, blocksize) (coresize + blocksize + 1) + /** * struct shash_alg - synchronous message digest definition * @init: see struct ahash_alg diff --git a/include/crypto/md5.h b/include/crypto/md5.h index 198b5d69b92f..28ee533a0507 100644 --- a/include/crypto/md5.h +++ b/include/crypto/md5.h @@ -2,6 +2,7 @@ #ifndef _CRYPTO_MD5_H #define _CRYPTO_MD5_H +#include #include #define MD5_DIGEST_SIZE 16 @@ -15,6 +16,9 @@ #define MD5_H2 0x98badcfeUL #define MD5_H3 0x10325476UL +#define CRYPTO_MD5_STATESIZE \ + CRYPTO_HASH_STATESIZE(MD5_STATE_SIZE, MD5_HMAC_BLOCK_SIZE) + extern const u8 md5_zero_message_hash[MD5_DIGEST_SIZE]; struct md5_state { -- cgit v1.2.3 From a24cc6ce1933eade12aa2b9859de0fcd2dac2c06 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 23 Jun 2025 10:34:08 +0200 Subject: futex: Initialize futex_phash_new during fork(). During a hash resize operation the new private hash is stored in mm_struct::futex_phash_new if the current hash can not be immediately replaced. The new hash must not be copied during fork() into the new task. Doing so will lead to a double-free of the memory by the two tasks. Initialize the mm_struct::futex_phash_new during fork(). Closes: https://lore.kernel.org/all/aFBQ8CBKmRzEqIfS@mozart.vkv.me/ Fixes: bd54df5ea7cad ("futex: Allow to resize the private local hash") Reported-by: Calvin Owens Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Tested-by: Calvin Owens Link: https://lkml.kernel.org/r/20250623083408.jTiJiC6_@linutronix.de --- include/linux/futex.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/futex.h b/include/linux/futex.h index 005b040c4791..b37193653e6b 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -89,6 +89,7 @@ void futex_hash_free(struct mm_struct *mm); static inline void futex_mm_init(struct mm_struct *mm) { RCU_INIT_POINTER(mm->futex_phash, NULL); + mm->futex_phash_new = NULL; mutex_init(&mm->futex_hash_lock); } -- cgit v1.2.3 From 1d6123102e9fbedc8d25bf4731da6d513173e49e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 17 Jun 2025 09:58:13 -0700 Subject: Bluetooth: hci_core: Fix use-after-free in vhci_flush() syzbot reported use-after-free in vhci_flush() without repro. [0] From the splat, a thread close()d a vhci file descriptor while its device was being used by iotcl() on another thread. Once the last fd refcnt is released, vhci_release() calls hci_unregister_dev(), hci_free_dev(), and kfree() for struct vhci_data, which is set to hci_dev->dev->driver_data. The problem is that there is no synchronisation after unlinking hdev from hci_dev_list in hci_unregister_dev(). There might be another thread still accessing the hdev which was fetched before the unlink operation. We can use SRCU for such synchronisation. Let's run hci_dev_reset() under SRCU and wait for its completion in hci_unregister_dev(). Another option would be to restore hci_dev->destruct(), which was removed in commit 587ae086f6e4 ("Bluetooth: Remove unused hci-destruct cb"). However, this would not be a good solution, as we should not run hci_unregister_dev() while there are in-flight ioctl() requests, which could lead to another data-race KCSAN splat. Note that other drivers seem to have the same problem, for exmaple, virtbt_remove(). [0]: BUG: KASAN: slab-use-after-free in skb_queue_empty_lockless include/linux/skbuff.h:1891 [inline] BUG: KASAN: slab-use-after-free in skb_queue_purge_reason+0x99/0x360 net/core/skbuff.c:3937 Read of size 8 at addr ffff88807cb8d858 by task syz.1.219/6718 CPU: 1 UID: 0 PID: 6718 Comm: syz.1.219 Not tainted 6.16.0-rc1-syzkaller-00196-g08207f42d3ff #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025 Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:408 [inline] print_report+0xd2/0x2b0 mm/kasan/report.c:521 kasan_report+0x118/0x150 mm/kasan/report.c:634 skb_queue_empty_lockless include/linux/skbuff.h:1891 [inline] skb_queue_purge_reason+0x99/0x360 net/core/skbuff.c:3937 skb_queue_purge include/linux/skbuff.h:3368 [inline] vhci_flush+0x44/0x50 drivers/bluetooth/hci_vhci.c:69 hci_dev_do_reset net/bluetooth/hci_core.c:552 [inline] hci_dev_reset+0x420/0x5c0 net/bluetooth/hci_core.c:592 sock_do_ioctl+0xd9/0x300 net/socket.c:1190 sock_ioctl+0x576/0x790 net/socket.c:1311 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl+0xf9/0x170 fs/ioctl.c:893 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fcf5b98e929 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fcf5c7b9038 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007fcf5bbb6160 RCX: 00007fcf5b98e929 RDX: 0000000000000000 RSI: 00000000400448cb RDI: 0000000000000009 RBP: 00007fcf5ba10b39 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00007fcf5bbb6160 R15: 00007ffd6353d528 Allocated by task 6535: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __kmalloc_cache_noprof+0x230/0x3d0 mm/slub.c:4359 kmalloc_noprof include/linux/slab.h:905 [inline] kzalloc_noprof include/linux/slab.h:1039 [inline] vhci_open+0x57/0x360 drivers/bluetooth/hci_vhci.c:635 misc_open+0x2bc/0x330 drivers/char/misc.c:161 chrdev_open+0x4c9/0x5e0 fs/char_dev.c:414 do_dentry_open+0xdf0/0x1970 fs/open.c:964 vfs_open+0x3b/0x340 fs/open.c:1094 do_open fs/namei.c:3887 [inline] path_openat+0x2ee5/0x3830 fs/namei.c:4046 do_filp_open+0x1fa/0x410 fs/namei.c:4073 do_sys_openat2+0x121/0x1c0 fs/open.c:1437 do_sys_open fs/open.c:1452 [inline] __do_sys_openat fs/open.c:1468 [inline] __se_sys_openat fs/open.c:1463 [inline] __x64_sys_openat+0x138/0x170 fs/open.c:1463 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 6535: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:576 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x62/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2381 [inline] slab_free mm/slub.c:4643 [inline] kfree+0x18e/0x440 mm/slub.c:4842 vhci_release+0xbc/0xd0 drivers/bluetooth/hci_vhci.c:671 __fput+0x44c/0xa70 fs/file_table.c:465 task_work_run+0x1d1/0x260 kernel/task_work.c:227 exit_task_work include/linux/task_work.h:40 [inline] do_exit+0x6ad/0x22e0 kernel/exit.c:955 do_group_exit+0x21c/0x2d0 kernel/exit.c:1104 __do_sys_exit_group kernel/exit.c:1115 [inline] __se_sys_exit_group kernel/exit.c:1113 [inline] __x64_sys_exit_group+0x3f/0x40 kernel/exit.c:1113 x64_sys_call+0x21ba/0x21c0 arch/x86/include/generated/asm/syscalls_64.h:232 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f The buggy address belongs to the object at ffff88807cb8d800 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 88 bytes inside of freed 1024-byte region [ffff88807cb8d800, ffff88807cb8dc00) Fixes: bf18c7118cf8 ("Bluetooth: vhci: Free driver_data on file release") Reported-by: syzbot+2faa4825e556199361f9@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=f62d64848fc4c7c30cd6 Signed-off-by: Kuniyuki Iwashima Acked-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 2 ++ net/bluetooth/hci_core.c | 34 ++++++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a760f05fa3fb..9fc8f544e20e 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -347,6 +348,7 @@ struct adv_monitor { struct hci_dev { struct list_head list; + struct srcu_struct srcu; struct mutex lock; struct ida unset_handle_ida; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 07a8b4281a39..14d7221b8ac0 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -64,7 +64,7 @@ static DEFINE_IDA(hci_index_ida); /* Get HCI device by index. * Device is held on return. */ -struct hci_dev *hci_dev_get(int index) +static struct hci_dev *__hci_dev_get(int index, int *srcu_index) { struct hci_dev *hdev = NULL, *d; @@ -77,6 +77,8 @@ struct hci_dev *hci_dev_get(int index) list_for_each_entry(d, &hci_dev_list, list) { if (d->id == index) { hdev = hci_dev_hold(d); + if (srcu_index) + *srcu_index = srcu_read_lock(&d->srcu); break; } } @@ -84,6 +86,22 @@ struct hci_dev *hci_dev_get(int index) return hdev; } +struct hci_dev *hci_dev_get(int index) +{ + return __hci_dev_get(index, NULL); +} + +static struct hci_dev *hci_dev_get_srcu(int index, int *srcu_index) +{ + return __hci_dev_get(index, srcu_index); +} + +static void hci_dev_put_srcu(struct hci_dev *hdev, int srcu_index) +{ + srcu_read_unlock(&hdev->srcu, srcu_index); + hci_dev_put(hdev); +} + /* ---- Inquiry support ---- */ bool hci_discovery_active(struct hci_dev *hdev) @@ -568,9 +586,9 @@ static int hci_dev_do_reset(struct hci_dev *hdev) int hci_dev_reset(__u16 dev) { struct hci_dev *hdev; - int err; + int err, srcu_index; - hdev = hci_dev_get(dev); + hdev = hci_dev_get_srcu(dev, &srcu_index); if (!hdev) return -ENODEV; @@ -592,7 +610,7 @@ int hci_dev_reset(__u16 dev) err = hci_dev_do_reset(hdev); done: - hci_dev_put(hdev); + hci_dev_put_srcu(hdev, srcu_index); return err; } @@ -2433,6 +2451,11 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) if (!hdev) return NULL; + if (init_srcu_struct(&hdev->srcu)) { + kfree(hdev); + return NULL; + } + hdev->pkt_type = (HCI_DM1 | HCI_DH1 | HCI_HV1); hdev->esco_type = (ESCO_HV1); hdev->link_mode = (HCI_LM_ACCEPT); @@ -2678,6 +2701,9 @@ void hci_unregister_dev(struct hci_dev *hdev) list_del(&hdev->list); write_unlock(&hci_dev_list_lock); + synchronize_srcu(&hdev->srcu); + cleanup_srcu_struct(&hdev->srcu); + disable_work_sync(&hdev->rx_work); disable_work_sync(&hdev->cmd_work); disable_work_sync(&hdev->tx_work); -- cgit v1.2.3 From 7484e15dbb016d9d40f8c6e0475810212ae181db Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 17 Jun 2025 00:09:51 -0400 Subject: replace collect_mounts()/drop_collected_mounts() with a safer variant collect_mounts() has several problems - one can't iterate over the results directly, so it has to be done with callback passed to iterate_mounts(); it has an oopsable race with d_invalidate(); it creates temporary clones of mounts invisibly for sync umount (IOW, you can have non-lazy umount succeed leaving filesystem not mounted anywhere and yet still busy). A saner approach is to give caller an array of struct path that would pin every mount in a subtree, without cloning any mounts. * collect_mounts()/drop_collected_mounts()/iterate_mounts() is gone * collect_paths(where, preallocated, size) gives either ERR_PTR(-E...) or a pointer to array of struct path, one for each chunk of tree visible under 'where' (i.e. the first element is a copy of where, followed by (mount,root) for everything mounted under it - the same set collect_mounts() would give). Unlike collect_mounts(), the mounts are *not* cloned - we just get pinning references to the roots of subtrees in the caller's namespace. Array is terminated by {NULL, NULL} struct path. If it fits into preallocated array (on-stack, normally), that's where it goes; otherwise it's allocated by kmalloc_array(). Passing 0 as size means that 'preallocated' is ignored (and expected to be NULL). * drop_collected_paths(paths, preallocated) is given the array returned by an earlier call of collect_paths() and the preallocated array passed to that call. All mount/dentry references are dropped and array is kfree'd if it's not equal to 'preallocated'. * instead of iterate_mounts(), users should just iterate over array of struct path - nothing exotic is needed for that. Existing users (all in audit_tree.c) are converted. [folded a fix for braino reported by Venkat Rao Bagalkote ] Fixes: 80b5dce8c59b0 ("vfs: Add a function to lazily unmount all mounts from any dentry") Tested-by: Venkat Rao Bagalkote Signed-off-by: Al Viro --- Documentation/filesystems/porting.rst | 9 ++++ fs/namespace.c | 97 +++++++++++++++++++++-------------- fs/pnode.h | 2 - include/linux/mount.h | 6 +-- kernel/audit_tree.c | 63 ++++++++++++----------- 5 files changed, 104 insertions(+), 73 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 3616d7161dab..a5734bdd1cc7 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1249,3 +1249,12 @@ Using try_lookup_noperm() will require linux/namei.h to be included. Calling conventions for ->d_automount() have changed; we should *not* grab an extra reference to new mount - it should be returned with refcount 1. + +--- + +collect_mounts()/drop_collected_mounts()/iterate_mounts() are gone now. +Replacement is collect_paths()/drop_collected_path(), with no special +iterator needed. Instead of a cloned mount tree, the new interface returns +an array of struct path, one for each mount collect_mounts() would've +created. These struct path point to locations in the caller's namespace +that would be roots of the cloned mounts. diff --git a/fs/namespace.c b/fs/namespace.c index e13d9ab4f564..14601ec4c2c5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2310,21 +2310,62 @@ out: return dst_mnt; } -/* Caller should check returned pointer for errors */ +static inline bool extend_array(struct path **res, struct path **to_free, + unsigned n, unsigned *count, unsigned new_count) +{ + struct path *p; + + if (likely(n < *count)) + return true; + p = kmalloc_array(new_count, sizeof(struct path), GFP_KERNEL); + if (p && *count) + memcpy(p, *res, *count * sizeof(struct path)); + *count = new_count; + kfree(*to_free); + *to_free = *res = p; + return p; +} -struct vfsmount *collect_mounts(const struct path *path) +struct path *collect_paths(const struct path *path, + struct path *prealloc, unsigned count) { - struct mount *tree; - namespace_lock(); - if (!check_mnt(real_mount(path->mnt))) - tree = ERR_PTR(-EINVAL); - else - tree = copy_tree(real_mount(path->mnt), path->dentry, - CL_COPY_ALL | CL_PRIVATE); - namespace_unlock(); - if (IS_ERR(tree)) - return ERR_CAST(tree); - return &tree->mnt; + struct mount *root = real_mount(path->mnt); + struct mount *child; + struct path *res = prealloc, *to_free = NULL; + unsigned n = 0; + + guard(rwsem_read)(&namespace_sem); + + if (!check_mnt(root)) + return ERR_PTR(-EINVAL); + if (!extend_array(&res, &to_free, 0, &count, 32)) + return ERR_PTR(-ENOMEM); + res[n++] = *path; + list_for_each_entry(child, &root->mnt_mounts, mnt_child) { + if (!is_subdir(child->mnt_mountpoint, path->dentry)) + continue; + for (struct mount *m = child; m; m = next_mnt(m, child)) { + if (!extend_array(&res, &to_free, n, &count, 2 * count)) + return ERR_PTR(-ENOMEM); + res[n].mnt = &m->mnt; + res[n].dentry = m->mnt.mnt_root; + n++; + } + } + if (!extend_array(&res, &to_free, n, &count, count + 1)) + return ERR_PTR(-ENOMEM); + memset(res + n, 0, (count - n) * sizeof(struct path)); + for (struct path *p = res; p->mnt; p++) + path_get(p); + return res; +} + +void drop_collected_paths(struct path *paths, struct path *prealloc) +{ + for (struct path *p = paths; p->mnt; p++) + path_put(p); + if (paths != prealloc) + kfree(paths); } static void free_mnt_ns(struct mnt_namespace *); @@ -2401,15 +2442,6 @@ void dissolve_on_fput(struct vfsmount *mnt) free_mnt_ns(ns); } -void drop_collected_mounts(struct vfsmount *mnt) -{ - namespace_lock(); - lock_mount_hash(); - umount_tree(real_mount(mnt), 0); - unlock_mount_hash(); - namespace_unlock(); -} - static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) { struct mount *child; @@ -2511,21 +2543,6 @@ struct vfsmount *clone_private_mount(const struct path *path) } EXPORT_SYMBOL_GPL(clone_private_mount); -int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, - struct vfsmount *root) -{ - struct mount *mnt; - int res = f(root, arg); - if (res) - return res; - list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { - res = f(&mnt->mnt, arg); - if (res) - return res; - } - return 0; -} - static void lock_mnt_tree(struct mount *mnt) { struct mount *p; @@ -6262,7 +6279,11 @@ void put_mnt_ns(struct mnt_namespace *ns) { if (!refcount_dec_and_test(&ns->ns.count)) return; - drop_collected_mounts(&ns->root->mnt); + namespace_lock(); + lock_mount_hash(); + umount_tree(ns->root, 0); + unlock_mount_hash(); + namespace_unlock(); free_mnt_ns(ns); } diff --git a/fs/pnode.h b/fs/pnode.h index 34b6247af01d..2d026fb98b18 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -28,8 +28,6 @@ #define CL_SHARED_TO_SLAVE 0x20 #define CL_COPY_MNT_NS_FILE 0x40 -#define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE) - static inline void set_mnt_shared(struct mount *mnt) { mnt->mnt.mnt_flags &= ~MNT_SHARED_MASK; diff --git a/include/linux/mount.h b/include/linux/mount.h index 4880f434c021..1a508beba446 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -116,10 +116,8 @@ extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); int do_mount(const char *, const char __user *, const char *, unsigned long, void *); -extern struct vfsmount *collect_mounts(const struct path *); -extern void drop_collected_mounts(struct vfsmount *); -extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, - struct vfsmount *); +extern struct path *collect_paths(const struct path *, struct path *, unsigned); +extern void drop_collected_paths(struct path *, struct path *); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); extern int cifs_root_data(char **dev, char **opts); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index f2f38903b2fe..b0eae2a3c895 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -668,12 +668,6 @@ int audit_remove_tree_rule(struct audit_krule *rule) return 0; } -static int compare_root(struct vfsmount *mnt, void *arg) -{ - return inode_to_key(d_backing_inode(mnt->mnt_root)) == - (unsigned long)arg; -} - void audit_trim_trees(void) { struct list_head cursor; @@ -683,8 +677,9 @@ void audit_trim_trees(void) while (cursor.next != &tree_list) { struct audit_tree *tree; struct path path; - struct vfsmount *root_mnt; struct audit_node *node; + struct path *paths; + struct path array[16]; int err; tree = container_of(cursor.next, struct audit_tree, list); @@ -696,9 +691,9 @@ void audit_trim_trees(void) if (err) goto skip_it; - root_mnt = collect_mounts(&path); + paths = collect_paths(&path, array, 16); path_put(&path); - if (IS_ERR(root_mnt)) + if (IS_ERR(paths)) goto skip_it; spin_lock(&hash_lock); @@ -706,14 +701,17 @@ void audit_trim_trees(void) struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... */ node->index |= 1U<<31; - if (iterate_mounts(compare_root, - (void *)(chunk->key), - root_mnt)) - node->index &= ~(1U<<31); + for (struct path *p = paths; p->dentry; p++) { + struct inode *inode = p->dentry->d_inode; + if (inode_to_key(inode) == chunk->key) { + node->index &= ~(1U<<31); + break; + } + } } spin_unlock(&hash_lock); trim_marked(tree); - drop_collected_mounts(root_mnt); + drop_collected_paths(paths, array); skip_it: put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -742,9 +740,14 @@ void audit_put_tree(struct audit_tree *tree) put_tree(tree); } -static int tag_mount(struct vfsmount *mnt, void *arg) +static int tag_mounts(struct path *paths, struct audit_tree *tree) { - return tag_chunk(d_backing_inode(mnt->mnt_root), arg); + for (struct path *p = paths; p->dentry; p++) { + int err = tag_chunk(p->dentry->d_inode, tree); + if (err) + return err; + } + return 0; } /* @@ -801,7 +804,8 @@ int audit_add_tree_rule(struct audit_krule *rule) { struct audit_tree *seed = rule->tree, *tree; struct path path; - struct vfsmount *mnt; + struct path array[16]; + struct path *paths; int err; rule->tree = NULL; @@ -828,16 +832,16 @@ int audit_add_tree_rule(struct audit_krule *rule) err = kern_path(tree->pathname, 0, &path); if (err) goto Err; - mnt = collect_mounts(&path); + paths = collect_paths(&path, array, 16); path_put(&path); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); + if (IS_ERR(paths)) { + err = PTR_ERR(paths); goto Err; } get_tree(tree); - err = iterate_mounts(tag_mount, tree, mnt); - drop_collected_mounts(mnt); + err = tag_mounts(paths, tree); + drop_collected_paths(paths, array); if (!err) { struct audit_node *node; @@ -872,20 +876,21 @@ int audit_tag_tree(char *old, char *new) struct list_head cursor, barrier; int failed = 0; struct path path1, path2; - struct vfsmount *tagged; + struct path array[16]; + struct path *paths; int err; err = kern_path(new, 0, &path2); if (err) return err; - tagged = collect_mounts(&path2); + paths = collect_paths(&path2, array, 16); path_put(&path2); - if (IS_ERR(tagged)) - return PTR_ERR(tagged); + if (IS_ERR(paths)) + return PTR_ERR(paths); err = kern_path(old, 0, &path1); if (err) { - drop_collected_mounts(tagged); + drop_collected_paths(paths, array); return err; } @@ -914,7 +919,7 @@ int audit_tag_tree(char *old, char *new) continue; } - failed = iterate_mounts(tag_mount, tree, tagged); + failed = tag_mounts(paths, tree); if (failed) { put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -955,7 +960,7 @@ int audit_tag_tree(char *old, char *new) list_del(&cursor); mutex_unlock(&audit_filter_mutex); path_put(&path1); - drop_collected_mounts(tagged); + drop_collected_paths(paths, array); return failed; } -- cgit v1.2.3 From 22bbc1dcd0d6785fb390c41f0dd5b5e218d23bdd Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 23 Jun 2025 12:00:53 +0200 Subject: vsock/uapi: fix linux/vm_sockets.h userspace compilation errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a userspace application just include will fail to build with the following errors: /usr/include/linux/vm_sockets.h:182:39: error: invalid application of ‘sizeof’ to incomplete type ‘struct sockaddr’ 182 | unsigned char svm_zero[sizeof(struct sockaddr) - | ^~~~~~ /usr/include/linux/vm_sockets.h:183:39: error: ‘sa_family_t’ undeclared here (not in a function) 183 | sizeof(sa_family_t) - | Include for userspace (guarded by ifndef __KERNEL__) where `struct sockaddr` and `sa_family_t` are defined. We already do something similar in and . Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Reported-by: Daan De Meyer Signed-off-by: Stefano Garzarella Link: https://patch.msgid.link/20250623100053.40979-1-sgarzare@redhat.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/vm_sockets.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h index ed07181d4eff..e05280e41522 100644 --- a/include/uapi/linux/vm_sockets.h +++ b/include/uapi/linux/vm_sockets.h @@ -17,6 +17,10 @@ #ifndef _UAPI_VM_SOCKETS_H #define _UAPI_VM_SOCKETS_H +#ifndef __KERNEL__ +#include /* for struct sockaddr and sa_family_t */ +#endif + #include #include -- cgit v1.2.3 From 67caa528ae08cd05e485c0ea6aea0baaf6579b06 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Sat, 21 Jun 2025 10:28:41 -0600 Subject: ublk: fix narrowing warnings in UAPI header When a C++ file compiled with -Wc++11-narrowing includes the UAPI header linux/ublk_cmd.h, ublk_sqe_addr_to_auto_buf_reg()'s assignments of u64 values to u8, u16, and u32 fields result in compiler warnings. Add explicit casts to the intended types to avoid these warnings. Drop the unnecessary bitmasks. Reported-by: Uday Shankar Signed-off-by: Caleb Sander Mateos Fixes: 99c1e4eb6a3f ("ublk: register buffer to local io_uring with provided buf index via UBLK_F_AUTO_BUF_REG") Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250621162842.337452-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/uapi/linux/ublk_cmd.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 77d9d6af46da..c062109cb686 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -450,10 +450,10 @@ static inline struct ublk_auto_buf_reg ublk_sqe_addr_to_auto_buf_reg( __u64 sqe_addr) { struct ublk_auto_buf_reg reg = { - .index = sqe_addr & 0xffff, - .flags = (sqe_addr >> 16) & 0xff, - .reserved0 = (sqe_addr >> 24) & 0xff, - .reserved1 = sqe_addr >> 32, + .index = (__u16)sqe_addr, + .flags = (__u8)(sqe_addr >> 16), + .reserved0 = (__u8)(sqe_addr >> 24), + .reserved1 = (__u32)(sqe_addr >> 32), }; return reg; -- cgit v1.2.3 From 81b4d1a1d03301dcca8af5c58eded9e535f1f6ed Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Sat, 21 Jun 2025 11:10:14 -0600 Subject: ublk: update UBLK_F_SUPPORT_ZERO_COPY comment in UAPI header UBLK_F_SUPPORT_ZERO_COPY has a very old comment describing the initial idea for how zero-copy would be implemented. The actual implementation added in commit 1f6540e2aabb ("ublk: zc register/unregister bvec") uses io_uring registered buffers rather than shared memory mapping. Remove the inaccurate remarks about mapping ublk request memory into the ublk server's address space and requiring 4K block size. Replace them with a description of the current zero-copy mechanism. Signed-off-by: Caleb Sander Mateos Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250621171015.354932-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/uapi/linux/ublk_cmd.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index c062109cb686..c9751bdfd937 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -135,8 +135,28 @@ #define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS) /* - * zero copy requires 4k block size, and can remap ublk driver's io - * request into ublksrv's vm space + * ublk server can register data buffers for incoming I/O requests with a sparse + * io_uring buffer table. The request buffer can then be used as the data buffer + * for io_uring operations via the fixed buffer index. + * Note that the ublk server can never directly access the request data memory. + * + * To use this feature, the ublk server must first register a sparse buffer + * table on an io_uring instance. + * When an incoming ublk request is received, the ublk server submits a + * UBLK_U_IO_REGISTER_IO_BUF command to that io_uring instance. The + * ublksrv_io_cmd's q_id and tag specify the request whose buffer to register + * and addr is the index in the io_uring's buffer table to install the buffer. + * SQEs can now be submitted to the io_uring to read/write the request's buffer + * by enabling fixed buffers (e.g. using IORING_OP_{READ,WRITE}_FIXED or + * IORING_URING_CMD_FIXED) and passing the registered buffer index in buf_index. + * Once the last io_uring operation using the request's buffer has completed, + * the ublk server submits a UBLK_U_IO_UNREGISTER_IO_BUF command with q_id, tag, + * and addr again specifying the request buffer to unregister. + * The ublk request is completed when its buffer is unregistered from all + * io_uring instances and the ublk server issues UBLK_U_IO_COMMIT_AND_FETCH_REQ. + * + * Not available for UBLK_F_UNPRIVILEGED_DEV, as a ublk server can leak + * uninitialized kernel memory by not reading into the full request buffer. */ #define UBLK_F_SUPPORT_ZERO_COPY (1ULL << 0) -- cgit v1.2.3 From c8dc579169738a3546f57ecb38e62d3872a3cc04 Mon Sep 17 00:00:00 2001 From: Pratap Nirujogi Date: Mon, 9 Jun 2025 11:53:56 -0400 Subject: i2c: amd-isp: Initialize unique adapter name Initialize unique name for amdisp i2c adapter, which is used in the platform driver to detect the matching adapter for i2c_client creation. Add definition of amdisp i2c adapter name in a new header file (include/linux/soc/amd/isp4_misc.h) as it is referred in different driver modules. Tested-by: Randy Dunlap Signed-off-by: Pratap Nirujogi Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/20250609155601.1477055-3-pratap.nirujogi@amd.com --- MAINTAINERS | 1 + drivers/i2c/busses/i2c-designware-amdisp.c | 2 ++ include/linux/soc/amd/isp4_misc.h | 12 ++++++++++++ 3 files changed, 15 insertions(+) create mode 100644 include/linux/soc/amd/isp4_misc.h (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index c3f7fbd0d67a..8719f097aae3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24063,6 +24063,7 @@ M: Bin Du L: linux-i2c@vger.kernel.org S: Maintained F: drivers/i2c/busses/i2c-designware-amdisp.c +F: include/linux/soc/amd/isp4_misc.h SYNOPSYS DESIGNWARE MMC/SD/SDIO DRIVER M: Jaehoon Chung diff --git a/drivers/i2c/busses/i2c-designware-amdisp.c b/drivers/i2c/busses/i2c-designware-amdisp.c index ad6f08338124..450793d5f839 100644 --- a/drivers/i2c/busses/i2c-designware-amdisp.c +++ b/drivers/i2c/busses/i2c-designware-amdisp.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "i2c-designware-core.h" @@ -62,6 +63,7 @@ static int amd_isp_dw_i2c_plat_probe(struct platform_device *pdev) adap = &isp_i2c_dev->adapter; adap->owner = THIS_MODULE; + scnprintf(adap->name, sizeof(adap->name), AMDISP_I2C_ADAP_NAME); ACPI_COMPANION_SET(&adap->dev, ACPI_COMPANION(&pdev->dev)); adap->dev.of_node = pdev->dev.of_node; /* use dynamically allocated adapter id */ diff --git a/include/linux/soc/amd/isp4_misc.h b/include/linux/soc/amd/isp4_misc.h new file mode 100644 index 000000000000..6738796986a7 --- /dev/null +++ b/include/linux/soc/amd/isp4_misc.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0+ + +/* + * Copyright (C) 2025 Advanced Micro Devices, Inc. + */ + +#ifndef __SOC_ISP4_MISC_H +#define __SOC_ISP4_MISC_H + +#define AMDISP_I2C_ADAP_NAME "AMDISP DesignWare I2C adapter" + +#endif -- cgit v1.2.3 From 9e6dd4c256d0774701637b958ba682eff4991277 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:09:59 -0700 Subject: netlink: specs: mptcp: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen, if used, replaces special chars in names) but it gives more uniform naming in Python. Fixes: bc8aeb2045e2 ("Documentation: netlink: add a YAML spec for mptcp") Reviewed-by: Davide Caratti Reviewed-by: Donald Hunter Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250624211002.3475021-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 8 ++++---- include/uapi/linux/mptcp_pm.h | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index dfd017780d2f..fb57860fe778 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -57,21 +57,21 @@ definitions: doc: >- A new subflow has been established. 'error' should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | - daddr6, sport, dport, backup, if_idx [, error]. + daddr6, sport, dport, backup, if-idx [, error]. - name: sub-closed doc: >- A subflow has been closed. An error (copy of sk_err) could be set if an error has been detected for this subflow. Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | - daddr6, sport, dport, backup, if_idx [, error]. + daddr6, sport, dport, backup, if-idx [, error]. - name: sub-priority value: 13 doc: >- The priority of a subflow has changed. 'error' should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | - daddr6, sport, dport, backup, if_idx [, error]. + daddr6, sport, dport, backup, if-idx [, error]. - name: listener-created value: 15 @@ -255,7 +255,7 @@ attribute-sets: name: timeout type: u32 - - name: if_idx + name: if-idx type: u32 - name: reset-reason diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 84fa8a21dfd0..6ac84b2f636c 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -27,14 +27,14 @@ * token, rem_id. * @MPTCP_EVENT_SUB_ESTABLISHED: A new subflow has been established. 'error' * should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | - * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error]. + * saddr6, daddr4 | daddr6, sport, dport, backup, if-idx [, error]. * @MPTCP_EVENT_SUB_CLOSED: A subflow has been closed. An error (copy of * sk_err) could be set if an error has been detected for this subflow. * Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | - * daddr6, sport, dport, backup, if_idx [, error]. + * daddr6, sport, dport, backup, if-idx [, error]. * @MPTCP_EVENT_SUB_PRIORITY: The priority of a subflow has changed. 'error' * should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | - * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error]. + * saddr6, daddr4 | daddr6, sport, dport, backup, if-idx [, error]. * @MPTCP_EVENT_LISTENER_CREATED: A new PM listener is created. Attributes: * family, sport, saddr4 | saddr6. * @MPTCP_EVENT_LISTENER_CLOSED: A PM listener is closed. Attributes: family, -- cgit v1.2.3 From f5769359c5b241978e6933672bb78b3adc36aa18 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Fri, 20 Jun 2025 02:31:54 +0800 Subject: mm/alloc_tag: fix the kmemleak false positive issue in the allocation of the percpu variable tag->counters When loading a module, as long as the module has memory allocation operations, kmemleak produces a false positive report that resembles the following: unreferenced object (percpu) 0x7dfd232a1650 (size 16): comm "modprobe", pid 1301, jiffies 4294940249 hex dump (first 16 bytes on cpu 2): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace (crc 0): kmemleak_alloc_percpu+0xb4/0xd0 pcpu_alloc_noprof+0x700/0x1098 load_module+0xd4/0x348 codetag_module_init+0x20c/0x450 codetag_load_module+0x70/0xb8 load_module+0xef8/0x1608 init_module_from_file+0xec/0x158 idempotent_init_module+0x354/0x608 __arm64_sys_finit_module+0xbc/0x150 invoke_syscall+0xd4/0x258 el0_svc_common.constprop.0+0xb4/0x240 do_el0_svc+0x48/0x68 el0_svc+0x40/0xf8 el0t_64_sync_handler+0x10c/0x138 el0t_64_sync+0x1ac/0x1b0 This is because the module can only indirectly reference alloc_tag_counters through the alloc_tag section, which misleads kmemleak. However, we don't have a kmemleak ignore interface for percpu allocations yet. So let's create one and invoke it for tag->counters. [gehao@kylinos.cn: fix build error when CONFIG_DEBUG_KMEMLEAK=n, s/igonore/ignore/] Link: https://lkml.kernel.org/r/20250620093102.2416767-1-hao.ge@linux.dev Link: https://lkml.kernel.org/r/20250619183154.2122608-1-hao.ge@linux.dev Fixes: 12ca42c23775 ("alloc_tag: allocate percpu counters for module tags dynamically") Signed-off-by: Hao Ge Reviewed-by: Catalin Marinas Acked-by: Suren Baghdasaryan [lib/alloc_tag.c] Cc: Kent Overstreet Signed-off-by: Andrew Morton --- include/linux/kmemleak.h | 4 ++++ lib/alloc_tag.c | 8 +++++++- mm/kmemleak.c | 14 ++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 93a73c076d16..fbd424b2abb1 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -28,6 +28,7 @@ extern void kmemleak_update_trace(const void *ptr) __ref; extern void kmemleak_not_leak(const void *ptr) __ref; extern void kmemleak_transient_leak(const void *ptr) __ref; extern void kmemleak_ignore(const void *ptr) __ref; +extern void kmemleak_ignore_percpu(const void __percpu *ptr) __ref; extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref; extern void kmemleak_no_scan(const void *ptr) __ref; extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size, @@ -97,6 +98,9 @@ static inline void kmemleak_not_leak(const void *ptr) static inline void kmemleak_transient_leak(const void *ptr) { } +static inline void kmemleak_ignore_percpu(const void __percpu *ptr) +{ +} static inline void kmemleak_ignore(const void *ptr) { } diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index d48b80f3f007..3a74d63a959e 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -10,6 +10,7 @@ #include #include #include +#include #define ALLOCINFO_FILE_NAME "allocinfo" #define MODULE_ALLOC_TAG_VMAP_SIZE (100000UL * sizeof(struct alloc_tag)) @@ -632,8 +633,13 @@ static int load_module(struct module *mod, struct codetag *start, struct codetag mod->name); return -ENOMEM; } - } + /* + * Avoid a kmemleak false positive. The pointer to the counters is stored + * in the alloc_tag section of the module and cannot be directly accessed. + */ + kmemleak_ignore_percpu(tag->counters); + } return 0; } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index da9cee34ee1b..8d588e685311 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1246,6 +1246,20 @@ void __ref kmemleak_transient_leak(const void *ptr) } EXPORT_SYMBOL(kmemleak_transient_leak); +/** + * kmemleak_ignore_percpu - similar to kmemleak_ignore but taking a percpu + * address argument + * @ptr: percpu address of the object + */ +void __ref kmemleak_ignore_percpu(const void __percpu *ptr) +{ + pr_debug("%s(0x%px)\n", __func__, ptr); + + if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr)) + make_black_object((unsigned long)ptr, OBJECT_PERCPU); +} +EXPORT_SYMBOL_GPL(kmemleak_ignore_percpu); + /** * kmemleak_ignore - ignore an allocated object * @ptr: pointer to beginning of the object -- cgit v1.2.3