From 1e5b50c78d10119be08bf8f7a11d8ea333dd113a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 6 May 2026 14:43:23 +0200 Subject: tty: add missing tty_driver include to tty_port.h Include the definition of struct tty_driver in tty_port.h to keep the header self-contained and avoid build breakage in case anyone includes it before tty_driver.h. Fixes: eb3b0d92c9c3 ("tty: tty_port: add workqueue to flip TTY buffer") Cc: Xin Zhao Signed-off-by: Johan Hovold Link: https://patch.msgid.link/20260506124323.186703-1-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_port.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h index d2a7882c0b58..23cad403bb8f 100644 --- a/include/linux/tty_port.h +++ b/include/linux/tty_port.h @@ -6,10 +6,10 @@ #include #include #include +#include #include struct attribute_group; -struct tty_driver; struct tty_port; struct tty_struct; -- cgit v1.2.3 From dd2147375a8fe7c5bc3f1f1b1d3a9567c26faefa Mon Sep 17 00:00:00 2001 From: Liu Kai Date: Thu, 7 May 2026 16:32:04 +0800 Subject: HID: remove duplicate hid_warn_ratelimited definition The hid_warn_ratelimited macro is defined twice in include/linux/hid.h: - first one added by commit 4051ead99888 ("HID: rate-limit hid_warn to prevent log flooding") - second one added by commit 1d64624243af ("HID: core: Add printk_ratelimited variants to hid_warn() etc")). The second definition is correctly grouped with other ratelimited macros. Remove the duplicate definition. Fixes: 1d64624243af ("HID: core: Add printk_ratelimited variants to hid_warn() etc") Signed-off-by: Liu Kai [bentiss: edited commit message] Signed-off-by: Benjamin Tissoires --- include/linux/hid.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index bfb9859f391e..47dc0bc89fa4 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -1316,8 +1316,6 @@ void hid_quirks_exit(__u16 bus); dev_notice(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_warn(hid, fmt, ...) \ dev_warn(&(hid)->dev, fmt, ##__VA_ARGS__) -#define hid_warn_ratelimited(hid, fmt, ...) \ - dev_warn_ratelimited(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_info(hid, fmt, ...) \ dev_info(&(hid)->dev, fmt, ##__VA_ARGS__) #define hid_dbg(hid, fmt, ...) \ -- cgit v1.2.3 From 83f9efcce93f8574be2279090ee2aec58b86cda7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 May 2026 17:06:43 +0100 Subject: Revert "mm/hugetlbfs: update hugetlbfs to use mmap_prepare" This reverts commit ea52cb24cd3f ("mm/hugetlbfs: update hugetlbfs to use mmap_prepare") with conflict resolution to account for changes in commit ea52cb24cd3f ("mm/hugetlbfs: update hugetlbfs to use mmap_prepare"). The patch incorrectly handled hugetlb VMA lock allocation at the mmap_prepare stage, where a failed allocation occurring after mmap_prepare is called might result in the lock leaking. There is no risk of a merge causing a similar issues, as VMA_DONTEXPAND_BIT is set for hugetlb mappings. As a first step in addressing this issue, simply revert the change so we can rework how we do this having corrected the underlying issues. We maintain the VMA flags changes as best we can, accounting for the fact that we were working with a VMA descriptor previously and propagating like-for-like changes for this. Note that we invoke vma_set_flags() and do not call vma_start_write() as vm_flags_set() does. This is OK as it's being done in an .mmap hook where the VMA is not yet linked into the tree so nobody else can be accessing it. Link: https://lore.kernel.org/20260512160643.266960-1-ljs@kernel.org Fixes: ea52cb24cd3f ("mm/hugetlbfs: update hugetlbfs to use mmap_prepare") Signed-off-by: Lorenzo Stoakes Reported-by: Mingyu Wang <25181214217@stu.xidian.edu.cn> Closes: https://lore.kernel.org/linux-mm/20260425070700.562229-1-25181214217@stu.xidian.edu.cn/ Acked-by: Muchun Song Acked-by: Oscar Salvador Cc: David Hildenbrand Cc: Liam R. Howlett Cc: Pedro Falcato Cc: Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 46 ++++++++------------------- include/linux/hugetlb.h | 8 +---- include/linux/hugetlb_inline.h | 14 ++------- mm/hugetlb.c | 71 +++++++++++++++++------------------------- 4 files changed, 45 insertions(+), 94 deletions(-) (limited to 'include/linux') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8b05bec08e04..78d61bf2bd9b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -96,15 +96,8 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = { #define PGOFF_LOFFT_MAX \ (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) -static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma) +static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { - /* Unfortunate we have to reassign vma->vm_private_data. */ - return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma); -} - -static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) -{ - struct file *file = desc->file; struct inode *inode = file_inode(file); loff_t len, vma_len; int ret; @@ -119,8 +112,8 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) * way when do_mmap unwinds (may be important on powerpc * and ia64). */ - vma_desc_set_flags(desc, VMA_HUGETLB_BIT, VMA_DONTEXPAND_BIT); - desc->vm_ops = &hugetlb_vm_ops; + vma_set_flags(vma, VMA_HUGETLB_BIT, VMA_DONTEXPAND_BIT); + vma->vm_ops = &hugetlb_vm_ops; /* * page based offset in vm_pgoff could be sufficiently large to @@ -129,16 +122,16 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) * sizeof(unsigned long). So, only check in those instances. */ if (sizeof(unsigned long) == sizeof(loff_t)) { - if (desc->pgoff & PGOFF_LOFFT_MAX) + if (vma->vm_pgoff & PGOFF_LOFFT_MAX) return -EINVAL; } /* must be huge page aligned */ - if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) + if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; - vma_len = (loff_t)vma_desc_size(desc); - len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT); + vma_len = (loff_t)(vma->vm_end - vma->vm_start); + len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); /* check for overflow */ if (len < vma_len) return -EINVAL; @@ -148,7 +141,7 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) ret = -ENOMEM; - vma_flags = desc->vma_flags; + vma_flags = vma->flags; /* * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip * reserving here. Note: only for SHM hugetlbfs file, the inode @@ -158,30 +151,17 @@ static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) vma_flags_set(&vma_flags, VMA_NORESERVE_BIT); if (hugetlb_reserve_pages(inode, - desc->pgoff >> huge_page_order(h), - len >> huge_page_shift(h), desc, - vma_flags) < 0) + vma->vm_pgoff >> huge_page_order(h), + len >> huge_page_shift(h), vma, + vma_flags) < 0) goto out; ret = 0; - if (vma_desc_test(desc, VMA_WRITE_BIT) && inode->i_size < len) + if (vma_test(vma, VMA_WRITE_BIT) && inode->i_size < len) i_size_write(inode, len); out: inode_unlock(inode); - if (!ret) { - /* Allocate the VMA lock after we set it up. */ - desc->action.success_hook = hugetlb_file_mmap_prepare_success; - /* - * We cannot permit the rmap finding this VMA in the time - * between the VMA being inserted into the VMA tree and the - * completion/success hook being invoked. - * - * This is because we establish a per-VMA hugetlb lock which can - * be raced by rmap. - */ - desc->action.hide_from_rmap_until_complete = true; - } return ret; } @@ -1227,7 +1207,7 @@ static void init_once(void *foo) static const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, - .mmap_prepare = hugetlbfs_file_mmap_prepare, + .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, .llseek = default_llseek, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 93418625d3c5..5957bc25efa8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -148,7 +148,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct folio **foliop); #endif /* CONFIG_USERFAULTFD */ long hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_desc *desc, vma_flags_t vma_flags); + struct vm_area_struct *vma, vma_flags_t vma_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); @@ -276,7 +276,6 @@ long hugetlb_change_protection(struct vm_area_struct *vma, void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); void fixup_hugetlb_reservations(struct vm_area_struct *vma); void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); -int hugetlb_vma_lock_alloc(struct vm_area_struct *vma); unsigned int arch_hugetlb_cma_order(void); @@ -469,11 +468,6 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {} -static inline int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) -{ - return 0; -} - #endif /* !CONFIG_HUGETLB_PAGE */ #ifndef pgd_write diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 565b473fd135..5c29cd3223a1 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -6,23 +6,13 @@ #ifdef CONFIG_HUGETLB_PAGE -static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) -{ - return !!(vm_flags & VM_HUGETLB); -} - static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags) { - return vma_flags_test_any(flags, VMA_HUGETLB_BIT); + return vma_flags_test(flags, VMA_HUGETLB_BIT); } #else -static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) -{ - return false; -} - static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags) { return false; @@ -32,7 +22,7 @@ static inline bool is_vma_hugetlb_flags(const vma_flags_t *flags) static inline bool is_vm_hugetlb_page(const struct vm_area_struct *vma) { - return is_vm_hugetlb_flags(vma->vm_flags); + return is_vma_hugetlb_flags(&vma->flags); } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f24bf49be047..4b80b167cc9c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -116,6 +116,7 @@ struct mutex *hugetlb_fault_mutex_table __ro_after_init; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool take_locks); @@ -413,21 +414,17 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma) } } -/* - * vma specific semaphore used for pmd sharing and fault/truncation - * synchronization - */ -int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) { struct hugetlb_vma_lock *vma_lock; /* Only establish in (flags) sharable vmas */ if (!vma || !(vma->vm_flags & VM_MAYSHARE)) - return 0; + return; /* Should never get here with non-NULL vm_private_data */ if (vma->vm_private_data) - return -EINVAL; + return; vma_lock = kmalloc_obj(*vma_lock); if (!vma_lock) { @@ -442,15 +439,13 @@ int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) * allocation failure. */ pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); - return -EINVAL; + return; } kref_init(&vma_lock->refs); init_rwsem(&vma_lock->rw_sema); vma_lock->vma = vma; vma->vm_private_data = vma_lock; - - return 0; } /* Helper that removes a struct file_region from the resv_map cache and returns @@ -1147,28 +1142,20 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) } } -static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) +static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) { VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma); - VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma); + VM_WARN_ON_ONCE_VMA(vma_test(vma, VMA_MAYSHARE_BIT), vma); - set_vma_private_data(vma, get_vma_private_data(vma) | flags); + set_vma_private_data(vma, (unsigned long)map); } -static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) -{ - VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_MAYSHARE_BIT)); - - desc->private_data = map; -} - -static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) +static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { - VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - VM_WARN_ON_ONCE(vma_desc_test(desc, VMA_MAYSHARE_BIT)); + VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma); + VM_WARN_ON_ONCE_VMA(vma_test(vma, VMA_MAYSHARE_BIT), vma); - desc->private_data = (void *)((unsigned long)desc->private_data | flags); + set_vma_private_data(vma, get_vma_private_data(vma) | flags); } static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) @@ -1178,13 +1165,6 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) return (get_vma_private_data(vma) & flag) != 0; } -static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag) -{ - VM_WARN_ON_ONCE(!is_vma_hugetlb_flags(&desc->vma_flags)); - - return ((unsigned long)desc->private_data) & flag; -} - bool __vma_private_lock(struct vm_area_struct *vma) { return !(vma->vm_flags & VM_MAYSHARE) && @@ -6553,7 +6533,7 @@ next: long hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_desc *desc, + struct vm_area_struct *vma, vma_flags_t vma_flags) { long chg = -1, add = -1, spool_resv, gbl_resv; @@ -6570,6 +6550,12 @@ long hugetlb_reserve_pages(struct inode *inode, return -EINVAL; } + /* + * vma specific semaphore used for pmd sharing and fault/truncation + * synchronization + */ + hugetlb_vma_lock_alloc(vma); + /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page @@ -6582,9 +6568,9 @@ long hugetlb_reserve_pages(struct inode *inode, * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need * to reserve the full area even if read-only as mprotect() may be - * called to make the mapping read-write. Assume !desc is a shm mapping + * called to make the mapping read-write. Assume !vma is a shm mapping */ - if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) { + if (!vma || vma_test(vma, VMA_MAYSHARE_BIT)) { /* * resv_map can not be NULL as hugetlb_reserve_pages is only * called for inodes for which resv_maps were created (see @@ -6603,8 +6589,8 @@ long hugetlb_reserve_pages(struct inode *inode, chg = to - from; - set_vma_desc_resv_map(desc, resv_map); - set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER); + set_vma_resv_map(vma, resv_map); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); } if (chg < 0) { @@ -6618,7 +6604,7 @@ long hugetlb_reserve_pages(struct inode *inode, if (err < 0) goto out_err; - if (desc && !vma_desc_test(desc, VMA_MAYSHARE_BIT) && h_cg) { + if (vma && !vma_test(vma, VMA_MAYSHARE_BIT) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs * of the resv_map. */ @@ -6655,7 +6641,7 @@ long hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. Hence, nothing * else has to be done for private mappings here */ - if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) { + if (!vma || vma_test(vma, VMA_MAYSHARE_BIT)) { add = region_add(resv_map, from, to, regions_needed, h, h_cg); if (unlikely(add < 0)) { @@ -6719,15 +6705,16 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: - if (!desc || vma_desc_test(desc, VMA_MAYSHARE_BIT)) + hugetlb_vma_lock_free(vma); + if (!vma || vma_test(vma, VMA_MAYSHARE_BIT)) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. */ if (chg >= 0 && add < 0) region_abort(resv_map, from, to, regions_needed); - if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) { + if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { kref_put(&resv_map->refs, resv_map_release); - set_vma_desc_resv_map(desc, NULL); + set_vma_resv_map(vma, NULL); } return err; } -- cgit v1.2.3 From 54cf41c969da6637cce790b7400da1451609db9b Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 15 May 2026 12:47:01 +0900 Subject: Revert "mm: introduce a new page type for page pool in page type" This reverts commit db359fccf212 ("mm: introduce a new page type for page pool in page type") and a part of 735a309b4bfb9e ("net: add net_iov_init() and use it to initialize ->page_type"). Netpp page_type'ed pages might be used in mapping so as to use @_mapcount. However, since @page_type and @_mapcount are union'ed in struct page, these two can't be used at the same time. Revert the commit introducing page_type for Netpp for now. The patch will be retried once @page_type and @_mapcount get allowed to be used at the same time. The revert also includes removal of @page_type initialization part introduced by commit 735a309b4bfb9e ("net: add net_iov_init() and use it to initialize ->page_type"), which will be restored on the retry. Link: https://lore.kernel.org/20260515034701.17027-1-byungchul@sk.com Fixes: db359fccf212 ("mm: introduce a new page type for page pool in page type") Signed-off-by: Byungchul Park Reported-by: Dragos Tatulea Closes: https://lore.kernel.org/all/982b9bc1-0a0a-4fc5-8e3a-3672db2b29a1@nvidia.com Acked-by: Jakub Kicinski Acked-by: David Hildenbrand (Arm) Acked-by: Harry Yoo (Oracle) Reviewed-by: Lorenzo Stoakes Cc: Alexei Starovoitov Cc: Baolin Wang Cc: Brendan Jackman Cc: David S. Miller Cc: Eric Dumazet Cc: Ilias Apalodimas Cc: Jesper Dangaard Brouer Cc: Johannes Weiner Cc: John Fastabend Cc: Leon Romanovsky Cc: Liam R. Howlett Cc: Mark Bloch Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Paolo Abeni Cc: Pavel Begunkov Cc: Saeed Mahameed Cc: Simon Horman Cc: Stanislav Fomichev Cc: Suren Baghdasaryan Cc: Tariq Toukan Cc: Toke Hoiland-Jorgensen Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 2 +- include/linux/mm.h | 27 +++++++++++++++++++++--- include/linux/page-flags.h | 6 ------ include/net/netmem.h | 19 ++--------------- mm/page_alloc.c | 13 ++++-------- net/core/netmem_priv.h | 23 +++++++++++--------- net/core/page_pool.c | 24 ++------------------- 7 files changed, 46 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 190b8b66b3ce..d3bab198c99c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -708,7 +708,7 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, xdpi = mlx5e_xdpi_fifo_pop(xdpi_fifo); page = xdpi.page.page; - /* No need to check PageNetpp() as we + /* No need to check page_pool_page_is_pp() as we * know this is a page_pool page. */ page_pool_recycle_direct(pp_page_to_nmdesc(page)->pp, diff --git a/include/linux/mm.h b/include/linux/mm.h index af23453e9dbd..06bbe9eba636 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -5174,9 +5174,10 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); * DMA mapping IDs for page_pool * * When DMA-mapping a page, page_pool allocates an ID (from an xarray) and - * stashes it in the upper bits of page->pp_magic. Non-PP pages can have - * arbitrary kernel pointers stored in the same field as pp_magic (since - * it overlaps with page->lru.next), so we must ensure that we cannot + * stashes it in the upper bits of page->pp_magic. We always want to be able to + * unambiguously identify page pool pages (using page_pool_page_is_pp()). Non-PP + * pages can have arbitrary kernel pointers stored in the same field as pp_magic + * (since it overlaps with page->lru.next), so we must ensure that we cannot * mistake a valid kernel pointer with any of the values we write into this * field. * @@ -5211,6 +5212,26 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); #define PP_DMA_INDEX_MASK GENMASK(PP_DMA_INDEX_BITS + PP_DMA_INDEX_SHIFT - 1, \ PP_DMA_INDEX_SHIFT) +/* Mask used for checking in page_pool_page_is_pp() below. page->pp_magic is + * OR'ed with PP_SIGNATURE after the allocation in order to preserve bit 0 for + * the head page of compound page and bit 1 for pfmemalloc page, as well as the + * bits used for the DMA index. page_is_pfmemalloc() is checked in + * __page_pool_put_page() to avoid recycling the pfmemalloc page. + */ +#define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) + +#ifdef CONFIG_PAGE_POOL +static inline bool page_pool_page_is_pp(const struct page *page) +{ + return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; +} +#else +static inline bool page_pool_page_is_pp(const struct page *page) +{ + return false; +} +#endif + #define PAGE_SNAPSHOT_FAITHFUL (1 << 0) #define PAGE_SNAPSHOT_PG_BUDDY (1 << 1) #define PAGE_SNAPSHOT_PG_IDLE (1 << 2) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0e03d816e8b9..7223f6f4e2b4 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -923,7 +923,6 @@ enum pagetype { PGTY_zsmalloc = 0xf6, PGTY_unaccepted = 0xf7, PGTY_large_kmalloc = 0xf8, - PGTY_netpp = 0xf9, PGTY_mapcount_underflow = 0xff }; @@ -1056,11 +1055,6 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc) -/* - * Marks page_pool allocated pages. - */ -PAGE_TYPE_OPS(Netpp, netpp, netpp) - /** * PageHuge - Determine if the page belongs to hugetlbfs * @page: The page to test. diff --git a/include/net/netmem.h b/include/net/netmem.h index 78fe51e5756b..bccacd21b6c3 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -94,20 +94,10 @@ enum net_iov_type { */ struct net_iov { struct netmem_desc desc; - unsigned int page_type; enum net_iov_type type; struct net_iov_area *owner; }; -/* Make sure 'the offset of page_type in struct page == the offset of - * type in struct net_iov'. - */ -#define NET_IOV_ASSERT_OFFSET(pg, iov) \ - static_assert(offsetof(struct page, pg) == \ - offsetof(struct net_iov, iov)) -NET_IOV_ASSERT_OFFSET(page_type, page_type); -#undef NET_IOV_ASSERT_OFFSET - struct net_iov_area { /* Array of net_iovs for this area. */ struct net_iov *niovs; @@ -127,11 +117,7 @@ static inline unsigned int net_iov_idx(const struct net_iov *niov) return niov - net_iov_owner(niov)->niovs; } -/* Initialize a niov: stamp the owning area, the memory provider type, - * and the page_type "no type" sentinel expected by the page-type API - * (see PAGE_TYPE_OPS in ) so that - * page_pool_set_pp_info() can later call __SetPageNetpp() on a niov - * cast to struct page. +/* Initialize a niov: stamp the owning area, the memory provider type. */ static inline void net_iov_init(struct net_iov *niov, struct net_iov_area *owner, @@ -139,7 +125,6 @@ static inline void net_iov_init(struct net_iov *niov, { niov->owner = owner; niov->type = type; - niov->page_type = UINT_MAX; } /* netmem */ @@ -245,7 +230,7 @@ static inline unsigned long netmem_pfn_trace(netmem_ref netmem) */ #define pp_page_to_nmdesc(p) \ ({ \ - DEBUG_NET_WARN_ON_ONCE(!PageNetpp(p)); \ + DEBUG_NET_WARN_ON_ONCE(!page_pool_page_is_pp(p)); \ __pp_page_to_nmdesc(p); \ }) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 23c7298d3be2..d49c254174da 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1035,6 +1035,7 @@ static inline bool page_expected_state(struct page *page, #ifdef CONFIG_MEMCG page->memcg_data | #endif + page_pool_page_is_pp(page) | (page->flags.f & check_flags))) return false; @@ -1061,6 +1062,8 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; #endif + if (unlikely(page_pool_page_is_pp(page))) + bad_reason = "page_pool leak"; return bad_reason; } @@ -1377,17 +1380,9 @@ __always_inline bool __free_pages_prepare(struct page *page, mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1); folio->mapping = NULL; } - if (unlikely(page_has_type(page))) { - /* networking expects to clear its page type before releasing */ - if (is_check_pages_enabled()) { - if (unlikely(PageNetpp(page))) { - bad_page(page, "page_pool leak"); - return false; - } - } + if (unlikely(page_has_type(page))) /* Reset the page_type (which overlays _mapcount) */ page->page_type = UINT_MAX; - } if (is_check_pages_enabled()) { if (free_page_is_bad(page)) diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index 3e6fde8f1726..23175cb2bd86 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -8,18 +8,21 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) return netmem_to_nmdesc(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; } -static inline bool netmem_is_pp(netmem_ref netmem) +static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) +{ + netmem_to_nmdesc(netmem)->pp_magic |= pp_magic; +} + +static inline void netmem_clear_pp_magic(netmem_ref netmem) { - struct page *page; + WARN_ON_ONCE(netmem_to_nmdesc(netmem)->pp_magic & PP_DMA_INDEX_MASK); - /* XXX: Now that the offset of page_type is shared between - * struct page and net_iov, just cast the netmem to struct page - * unconditionally by clearing NET_IOV if any, no matter whether - * it comes from struct net_iov or struct page. This should be - * adjusted once the offset is no longer shared. - */ - page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); - return PageNetpp(page); + netmem_to_nmdesc(netmem)->pp_magic = 0; +} + +static inline bool netmem_is_pp(netmem_ref netmem) +{ + return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE; } static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 6e576dec80db..8171d1173221 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -707,18 +707,8 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict) void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) { - struct page *page; - netmem_set_pp(netmem, pool); - - /* XXX: Now that the offset of page_type is shared between - * struct page and net_iov, just cast the netmem to struct page - * unconditionally by clearing NET_IOV if any, no matter whether - * it comes from struct net_iov or struct page. This should be - * adjusted once the offset is no longer shared. - */ - page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); - __SetPageNetpp(page); + netmem_or_pp_magic(netmem, PP_SIGNATURE); /* Ensuring all pages have been split into one fragment initially: * page_pool_set_pp_info() is only called once for every page when it @@ -733,17 +723,7 @@ void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) void page_pool_clear_pp_info(netmem_ref netmem) { - struct page *page; - - /* XXX: Now that the offset of page_type is shared between - * struct page and net_iov, just cast the netmem to struct page - * unconditionally by clearing NET_IOV if any, no matter whether - * it comes from struct net_iov or struct page. This should be - * adjusted once the offset is no longer shared. - */ - page = (struct page *)((__force unsigned long)netmem & ~NET_IOV); - __ClearPageNetpp(page); - + netmem_clear_pp_magic(netmem); netmem_set_pp(netmem, NULL); } -- cgit v1.2.3 From c3cce2e67bb22a223f5b8ef05db0fcde70994068 Mon Sep 17 00:00:00 2001 From: Jacques Nilo Date: Wed, 13 May 2026 15:30:23 +0200 Subject: serial: core: introduce guard(uart_port_lock_check_sysrq_irqsave) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit uart_handle_break() and uart_prepare_sysrq_char() (in include/linux/serial_core.h) capture a SysRq character into port->sysrq_ch while the port lock is held and rely on the unlock helper -- uart_unlock_and_check_sysrq_irqrestore() -- to dispatch the captured character to handle_sysrq() on scope exit. The existing guard(uart_port_lock_irqsave) cannot be used by IRQ handlers that process RX, because its destructor calls plain uart_port_unlock_irqrestore() and silently drops port->sysrq_ch. Add a dedicated guard(uart_port_lock_check_sysrq_irqsave) variant whose destructor is the sysrq-aware unlock helper. The lock side is identical to uart_port_lock_irqsave -- only the unlock-time behaviour differs. Callers that may capture SysRq characters must use guard(uart_port_lock_check_sysrq_irqsave); the existing guard(uart_port_lock_irqsave) keeps its current plain-unlock semantics for the many callers that do not process RX. The new macro is placed after the CONFIG_MAGIC_SYSRQ_SERIAL block so both definitions of uart_unlock_and_check_sysrq_irqrestore() (sysrq enabled and disabled) are visible at expansion time. When CONFIG_MAGIC_SYSRQ_SERIAL=n the destructor degenerates to plain uart_port_unlock_irqrestore(), so there is no overhead. No functional change on its own; users are converted in the following patches. Cc: stable@vger.kernel.org Signed-off-by: Jacques Nilo Reviewed-by: Ilpo Järvinen Link: https://patch.msgid.link/3849af4bc55d5d2a424fa850844e94d641b2f8a6.1778675349.git.jnilo@free.fr Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 666430b47899..110ad4e2aef9 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -1274,6 +1274,18 @@ static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port } #endif /* CONFIG_MAGIC_SYSRQ_SERIAL */ +/* + * Variant of guard(uart_port_lock_irqsave) for IRQ handlers that may capture + * a SysRq character via uart_prepare_sysrq_char(). The destructor uses the + * sysrq-aware unlock helper so that a captured port->sysrq_ch is dispatched + * to handle_sysrq() on scope exit. The plain guard variant silently drops + * sysrq_ch and must not be used by callers that process RX. + */ +DEFINE_LOCK_GUARD_1(uart_port_lock_check_sysrq_irqsave, struct uart_port, + uart_port_lock_irqsave(_T->lock, &_T->flags), + uart_unlock_and_check_sysrq_irqrestore(_T->lock, _T->flags), + unsigned long flags); + /* * We do the SysRQ and SAK checking like this... */ -- cgit v1.2.3 From ef15ccbb3e8640a723c42ad90eaf81d66ae02017 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 5 May 2026 20:45:12 +0200 Subject: parport: Fix race between port and client registration The parport subsystem registers port devices before they are fully initialised, resulting in a race condition where client drivers such as lp can attach to ports that are not completely initialised or even being torn down. When the port and client drivers are built as modules and loaded around the same time during boot, this occasionally results in a crash. I was able to make this happen reliably in a VM with a PC-style parallel port by patching parport_pc to fail probing: > --- a/drivers/parport/parport_pc.c > +++ b/drivers/parport/parport_pc.c > @@ -2069,7 +2069,7 @@ static struct parport *__parport_pc_probe_port(unsigned long int base, > if (!p) > goto out3; > > - base_res = request_region(base, 3, p->name); > + base_res = NULL; > if (!base_res) > goto out4; > and then running: while true; do modprobe lp & modprobe parport_pc wait rmmod lp parport_pc done for a few seconds. In the long term I think port registration should be changed to put the call to device_add() inside parport_announce_port(), but since the latter currently cannot fail this will require changing all port drivers. For now, add a flag to indicate whether a port has been "announced" and only try to attach client drivers to ports when the flag is set. Fixes: 6fa45a226897 ("parport: add device-model to parport subsystem") Closes: https://bugs.debian.org/1130365 Closes: https://lore.kernel.org/all/6ba903ad-9897-42bb-8c2d-337385cc3746@molgen.mpg.de/ Cc: stable Signed-off-by: Ben Hutchings Acked-by: Sudip Mukherjee Link: https://patch.msgid.link/afo6uBv68GDevbMD@decadent.org.uk Signed-off-by: Greg Kroah-Hartman --- drivers/parport/share.c | 11 +++++++++-- include/linux/parport.h | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/parport/share.c b/drivers/parport/share.c index ba5292828703..eb0977ca1605 100644 --- a/drivers/parport/share.c +++ b/drivers/parport/share.c @@ -214,10 +214,14 @@ static void get_lowlevel_driver(void) static int port_check(struct device *dev, void *dev_drv) { struct parport_driver *drv = dev_drv; + struct parport *port; /* only send ports, do not send other devices connected to bus */ - if (is_parport(dev)) - drv->match_port(to_parport_dev(dev)); + if (is_parport(dev)) { + port = to_parport_dev(dev); + if (test_bit(PARPORT_ANNOUNCED, &port->devflags)) + drv->match_port(port); + } return 0; } @@ -532,6 +536,7 @@ void parport_announce_port(struct parport *port) if (slave) attach_driver_chain(slave); } + set_bit(PARPORT_ANNOUNCED, &port->devflags); mutex_unlock(®istration_lock); } EXPORT_SYMBOL(parport_announce_port); @@ -561,6 +566,8 @@ void parport_remove_port(struct parport *port) mutex_lock(®istration_lock); + clear_bit(PARPORT_ANNOUNCED, &port->devflags); + /* Spread the word. */ detach_driver_chain(port); diff --git a/include/linux/parport.h b/include/linux/parport.h index 464c2ad28039..f64cb0676e3b 100644 --- a/include/linux/parport.h +++ b/include/linux/parport.h @@ -240,6 +240,7 @@ struct parport { unsigned long devflags; #define PARPORT_DEVPROC_REGISTERED 0 +#define PARPORT_ANNOUNCED 1 struct pardevice *proc_device; /* Currently register proc device */ struct list_head full_list; -- cgit v1.2.3 From 47980b6dbf83961eec1c1363ea986e9c06ff8054 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 14 May 2026 14:21:57 +0200 Subject: netfilter: nf_conntrack_gre: fix gre keymap list corruption Quoting reporter: A race between GRE keymap insertion and destruction can corrupt the kernel list or use a freed object. `nf_ct_gre_keymap_add()` publishes a new keymap pointer before the embedded `list_head` is linked, while `nf_ct_gre_keymap_destroy()` can concurrently delete and free that same object. An unprivileged user can reach this through the PPTP conntrack helper by racing PPTP control messages or helper teardown, leading to KASAN-detectable list corruption/UAF in kernel context. ## Root Cause Analysis `exp_gre()` installs GRE expectations for a PPTP control flow and then adds two GRE keymap entries [..] The add path publishes `ct_pptp_info->keymap[dir]` before linking the embedded list node [..] Concurrent teardown deletes that partially initialized object. Make add/destroy symmetric: install both, destroy both while under lock. Furthermore, we should refuse to publish a new mapping in case ct is going away, else we may leak the allocation. The "retrans" detection is strange: existing mapping is checked for key equality with the new mapping, then for "is on the list" via list walk. But I can't see how an existing keymap entry can be NOT on list. Change this to only check if we're asked to map same tuple again -- if so, skip re-install, else signal failure. Last, add a bug trap for the keymap list; it has to be empty when namespace is going away. Reported-by: Leo Lin Signed-off-by: Florian Westphal --- include/linux/netfilter/nf_conntrack_proto_gre.h | 7 +- net/netfilter/nf_conntrack_core.c | 8 ++ net/netfilter/nf_conntrack_pptp.c | 8 +- net/netfilter/nf_conntrack_proto_gre.c | 106 +++++++++++++++++------ 4 files changed, 95 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_proto_gre.h b/include/linux/netfilter/nf_conntrack_proto_gre.h index 9ee7014400e8..ad5563f0f864 100644 --- a/include/linux/netfilter/nf_conntrack_proto_gre.h +++ b/include/linux/netfilter/nf_conntrack_proto_gre.h @@ -18,9 +18,10 @@ struct nf_ct_gre_keymap { struct rcu_head rcu; }; -/* add new tuple->key_reply pair to keymap */ -int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, - struct nf_conntrack_tuple *t); +/* add tuple->key_reply pairs to keymap */ +bool nf_ct_gre_keymap_add(struct nf_conn *ct, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl); /* delete keymap entries */ void nf_ct_gre_keymap_destroy(struct nf_conn *ct); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 8ba5b22a1eef..b521b5ebd664 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -568,6 +568,13 @@ static void destroy_gre_conntrack(struct nf_conn *ct) #endif } +static void warn_on_keymap_list_leak(const struct net *net) +{ +#ifdef CONFIG_NF_CT_PROTO_GRE + WARN_ON_ONCE(!list_empty(&net->ct.nf_ct_proto.gre.keymap_list)); +#endif +} + void nf_ct_destroy(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; @@ -2510,6 +2517,7 @@ i_see_dead_people: } list_for_each_entry(net, net_exit_list, exit_list) { + warn_on_keymap_list_leak(net); nf_conntrack_ecache_pernet_fini(net); nf_conntrack_expect_pernet_fini(net); free_percpu(net->ct.stat); diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 4c679638df06..dc23e4181618 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -225,13 +225,9 @@ static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid) if (nf_ct_expect_related(exp_reply, 0) != 0) goto out_unexpect_orig; - /* Add GRE keymap entries */ - if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_ORIGINAL, &exp_orig->tuple) != 0) + if (!nf_ct_gre_keymap_add(ct, &exp_orig->tuple, + &exp_reply->tuple)) goto out_unexpect_both; - if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_REPLY, &exp_reply->tuple) != 0) { - nf_ct_gre_keymap_destroy(ct); - goto out_unexpect_both; - } ret = 0; out_put_both: diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 94c19bc4edc5..35e22082d65a 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -87,41 +87,97 @@ static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t) return key; } -/* add a single keymap entry, associate with specified master ct */ -int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, - struct nf_conntrack_tuple *t) +enum nf_ct_gre_km_act { + NF_CT_GRE_KM_NEW, + NF_CT_GRE_KM_BAD, + NF_CT_GRE_KM_DUP +}; + +static enum nf_ct_gre_km_act +nf_ct_gre_km_acceptable(const struct nf_ct_pptp_master *ct_pptp_info, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl) +{ + struct nf_ct_gre_keymap *km_orig, *km_repl; + + lockdep_assert_held(&keymap_lock); + + km_orig = ct_pptp_info->keymap[IP_CT_DIR_ORIGINAL]; + km_repl = ct_pptp_info->keymap[IP_CT_DIR_REPLY]; + + if (km_orig && km_repl) { + if (!gre_key_cmpfn(km_orig, orig)) + return NF_CT_GRE_KM_BAD; + + if (!gre_key_cmpfn(km_repl, repl)) + return NF_CT_GRE_KM_BAD; + + return NF_CT_GRE_KM_DUP; + } + + DEBUG_NET_WARN_ON_ONCE(km_orig); + DEBUG_NET_WARN_ON_ONCE(km_repl); + return NF_CT_GRE_KM_NEW; +} + +/* add keymap entries, associate with specified master ct */ +bool nf_ct_gre_keymap_add(struct nf_conn *ct, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl) { struct net *net = nf_ct_net(ct); struct nf_gre_net *net_gre = gre_pernet(net); struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct); - struct nf_ct_gre_keymap **kmp, *km; - - kmp = &ct_pptp_info->keymap[dir]; - if (*kmp) { - /* check whether it's a retransmission */ - list_for_each_entry_rcu(km, &net_gre->keymap_list, list) { - if (gre_key_cmpfn(km, t) && km == *kmp) - return 0; - } - pr_debug("trying to override keymap_%s for ct %p\n", - dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct); - return -EEXIST; - } + struct nf_ct_gre_keymap *km_orig, *km_repl; + bool ret = false; - km = kmalloc_obj(*km, GFP_ATOMIC); - if (!km) - return -ENOMEM; - memcpy(&km->tuple, t, sizeof(*t)); - *kmp = km; + km_orig = kmalloc_obj(*km_orig, GFP_ATOMIC); + if (!km_orig) + return false; + km_repl = kmalloc_obj(*km_repl, GFP_ATOMIC); + if (!km_repl) + goto km_free; - pr_debug("adding new entry %p: ", km); - nf_ct_dump_tuple(&km->tuple); + memcpy(&km_orig->tuple, orig, sizeof(*orig)); + memcpy(&km_repl->tuple, repl, sizeof(*repl)); spin_lock_bh(&keymap_lock); - list_add_tail(&km->list, &net_gre->keymap_list); + if (nf_ct_is_dying(ct)) + goto unlock_free; + + switch (nf_ct_gre_km_acceptable(ct_pptp_info, orig, repl)) { + case NF_CT_GRE_KM_NEW: + break; + case NF_CT_GRE_KM_DUP: + ret = true; + goto unlock_free; + case NF_CT_GRE_KM_BAD: + pr_debug("trying to override keymap for ct %p\n", ct); + goto unlock_free; + } + + if (ct_pptp_info->keymap[IP_CT_DIR_ORIGINAL] || + ct_pptp_info->keymap[IP_CT_DIR_REPLY]) + goto unlock_free; + + pr_debug("adding new entries %p,%p: ", km_orig, km_repl); + nf_ct_dump_tuple(&km_orig->tuple); + nf_ct_dump_tuple(&km_repl->tuple); + + list_add_tail_rcu(&km_orig->list, &net_gre->keymap_list); + list_add_tail_rcu(&km_repl->list, &net_gre->keymap_list); + ct_pptp_info->keymap[IP_CT_DIR_ORIGINAL] = km_orig; + ct_pptp_info->keymap[IP_CT_DIR_REPLY] = km_repl; spin_unlock_bh(&keymap_lock); - return 0; + return true; + +unlock_free: + spin_unlock_bh(&keymap_lock); +km_free: + kfree(km_orig); + kfree(km_repl); + return ret; } EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add); -- cgit v1.2.3 From d64b0372760e09de6d18a0616d7bc652c8c6891d Mon Sep 17 00:00:00 2001 From: George Guo Date: Sat, 9 May 2026 10:44:15 +0800 Subject: kho: fix KHO_TREE_MAX_DEPTH for non-4KB page sizes KHO_TREE_MAX_DEPTH is calculated as: DIV_ROUND_UP(KHO_ORDER_0_LOG2 - KHO_BITMAP_SIZE_LOG2, KHO_TABLE_SIZE_LOG2) + 1 For systems with 16KB pages (e.g. arm64 with CONFIG_ARM64_16K_PAGES=y or LoongArch), this gives a depth of 4. Since levels are 0 based, with depth = 4 the effective top level is 3 and the top-level shift at bit 39. PAGE_SHIFT = 14 KHO_BITMAP_SIZE_LOG2 = PAGE_SHIFT + 3 = 17 KHO_TABLE_SIZE_LOG2 = log(2; (1 << PAGE_SHIFT) / 8) = 11 shift = ((3 - 1) * KHO_TABLE_SIZE_LOG2) + KHO_BITMAP_SIZE_LOG2 = 39 The order-0 bit sits at bit 50 (KHO_ORDER_0_LOG2 = 64 - PAGE_SHIFT = 50). When inserting or reading a key, the index extracted at the top level is: (1 << 50) >> 39 = 2048 2048 is exactly the table size (PAGE_SIZE / sizeof(phys_addr_t) = 2048 for 16KB pages), so it wraps to 0, aliasing the order bit to index 0 and losing it silently. On the second kernel, kho_radix_decode_key() sees a key without the order bit, calls fls64() on the wrong bit, computes a wrong order and thus a garbage physical address. phys_to_page() of that address faults in kho_preserved_memory_reserve(), causing a kernel panic early in boot. Fix by adding +1 to the DIV_ROUND_UP numerator so the formula accounts for the order bit itself, giving depth 5 for 16KB pages. The top-level shift becomes 50, and (1 << 50) >> 50 = 1, which is nonzero and unambiguous. For 4KB and 64KB page sizes the depth is unchanged. Link: https://patch.msgid.link/20260509024415.33190-1-dongtai.guo@linux.dev Fixes: 3f2ad90060f6 ("kho: adopt radix tree for preserved memory tracking") Tested-by: Kexin Liu Signed-off-by: George Guo Reviewed-by: Pasha Tatashin [rppt: added actual math to the changelog] Signed-off-by: Mike Rapoport (Microsoft) --- include/linux/kho/abi/kexec_handover.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kho/abi/kexec_handover.h b/include/linux/kho/abi/kexec_handover.h index 7e847a2339b0..db9bda6dd310 100644 --- a/include/linux/kho/abi/kexec_handover.h +++ b/include/linux/kho/abi/kexec_handover.h @@ -274,7 +274,7 @@ enum kho_radix_consts { * and 1 bitmap level. */ KHO_TREE_MAX_DEPTH = - DIV_ROUND_UP(KHO_ORDER_0_LOG2 - KHO_BITMAP_SIZE_LOG2, + DIV_ROUND_UP(KHO_ORDER_0_LOG2 - KHO_BITMAP_SIZE_LOG2 + 1, KHO_TABLE_SIZE_LOG2) + 1, }; -- cgit v1.2.3 From 4c9ad387aa2d6785299722e54224d34764edaeb3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 13 May 2026 16:53:54 +0200 Subject: iommu, debugobjects: avoid gcc-16.1 section mismatch warnings gcc-16 has gained some more advanced inter-procedual optimization techniques that enable it to inline the dummy_tlb_add_page() and dummy_tlb_flush() function pointers into a specialized version of __arm_v7s_unmap: WARNING: modpost: vmlinux: section mismatch in reference: __arm_v7s_unmap+0x2cc (section: .text) -> dummy_tlb_add_page (section: .init.text) ERROR: modpost: Section mismatches detected. >From what I can tell, the transformation is correct, as this is only called when __arm_v7s_unmap() is called from arm_v7s_do_selftests(), which is also __init. Since __arm_v7s_unmap() however is not __init, gcc cannot inline the inner function calls directly. In debug_objects_selftest(), the same thing happens. Both the caller and the leaf function are __init, but the IPA pulls it into a non-init one: WARNING: modpost: vmlinux: section mismatch in reference: lookup_object_or_alloc+0x7c (section: .text.lookup_object_or_alloc) -> is_static_object (section: .init.text) Marking the affected functions as not "__init" would reliably avoid this issue but is not a good solution because it removes an otherwise correct annotation. I tried marking the functions as 'noinline', but that ended up not covering all the affected configurations. With some more experimenting, I found that marking these functions as __attribute__((noipa)) is both logical and reliable. In order to keep the syntax readable, add a custom macro for this in include/linux/compiler_attributes.h next to other related macros and use it to annotate both files. Link: https://lore.kernel.org/all/abRB6g-48ZX6Yl2r@willie-the-truck/ Cc: Will Deacon Cc: Thomas Gleixner Cc: Andrew Morton Cc: Miguel Ojeda Cc: linux-kbuild@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Arnd Bergmann Acked-by: Will Deacon Acked-by: Thomas Gleixner Acked-by: Miguel Ojeda Signed-off-by: Joerg Roedel --- drivers/iommu/io-pgtable-arm-v7s.c | 18 ++++++++++++------ include/linux/compiler_attributes.h | 11 +++++++++++ lib/debugobjects.c | 2 +- 3 files changed, 24 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 40e33257d3c2..1dbef8c55007 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -777,21 +777,27 @@ struct io_pgtable_init_fns io_pgtable_arm_v7s_init_fns = { static struct io_pgtable_cfg *cfg_cookie __initdata; -static void __init dummy_tlb_flush_all(void *cookie) +/* + * __noipa prevents gcc from turning indirect iommu_flush_ops calls + * into direct calls from a specialized __arm_v7s_unmap() that triggers + * a build time section mismatch assertion. + */ +static __noipa void __init dummy_tlb_flush_all(void *cookie) { WARN_ON(cookie != cfg_cookie); } -static void __init dummy_tlb_flush(unsigned long iova, size_t size, - size_t granule, void *cookie) +static __noipa void __init dummy_tlb_flush(unsigned long iova, size_t size, + size_t granule, void *cookie) { WARN_ON(cookie != cfg_cookie); WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); } -static void __init dummy_tlb_add_page(struct iommu_iotlb_gather *gather, - unsigned long iova, size_t granule, - void *cookie) +static __noipa void __init dummy_tlb_add_page(struct iommu_iotlb_gather *gather, + unsigned long iova, + size_t granule, + void *cookie) { dummy_tlb_flush(iova, granule, granule, cookie); } diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index c16d4199bf92..836a50f5917a 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -396,6 +396,17 @@ # define __disable_sanitizer_instrumentation #endif +/* + * Optional: not supported by clang + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Attributes.html#index-noipa + */ +#if __has_attribute(noipa) +# define __noipa __attribute__((noipa)) +#else +# define __noipa +#endif + /* * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-weak-function-attribute * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#index-weak-variable-attribute diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 12e2e42e6a31..c93b7ca3e1ab 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -1212,7 +1212,7 @@ struct self_test { static __initconst const struct debug_obj_descr descr_type_test; -static bool __init is_static_object(void *addr) +static __noipa bool __init is_static_object(void *addr) { struct self_test *obj = addr; -- cgit v1.2.3 From 98b34f3e8c3492cfc89ff943c9d92b4d52863d1d Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 25 May 2026 08:25:48 -0400 Subject: net: Introduce skb tc depth field to track packet loops Add a 2-bit per-skb tc depth field to track packet loops across the stack. The previous per-CPU loop counters like MIRRED_NEST_LIMIT assume a single call stack and lose state in two cases: 1) When a packet is queued and reprocessed later (e.g., egress->ingress via backlog), the per-cpu state is gone by the time it is dequeued. 2) With XPS/RPS a packet may arrive on one CPU and be processed on another. A per-skb field solves both by travelling with the packet itself. The field fits in existing padding, using 2 bits that were previously a hole: pahole before(-) and after (+) diff looks like: __u8 slow_gro:1; /* 132: 3 1 */ __u8 csum_not_inet:1; /* 132: 4 1 */ __u8 unreadable:1; /* 132: 5 1 */ + __u8 tc_depth:2; /* 132: 6 1 */ - /* XXX 2 bits hole, try to pack */ /* XXX 1 byte hole, try to pack */ __u16 tc_index; /* 134 2 */ There used to be a ttl field which was removed as part of tc_verd in commit aec745e2c520 ("net-tc: remove unused tc_verd fields"). It was already unused by that time, due to remove earlier in commit c19ae86a510c ("tc: remove unused redirect ttl"). The first user of this field is netem, which increments tc_depth on duplicated packets before re-enqueueing them at the root qdisc. On re-entry, netem skips duplication for any skb with tc_depth already set, bounding recursion to a single level regardless of tree topology. The other user is mirred which increments it on each pass and limits to depth to MIRRED_DEFER_LIMIT (3). The new field was called ttl in earlier versions of this patch but renamed to tc_depth to avoid confusion with IP ttl. Note (looking at you Sashiko! Dont ignore me and continue bringing this up): 1. Since both mirred and netem utilize the same 2-bit tc_depth field it is possible when netem and mirred are used together that netem qdisc to skip the duplication step. This is a known trade-off, as a 2-bit field cannot independently track both features' recursion depths and it is not considered sane to have a setup that addresses both features on at the same time. 2. skb_scrub_packet does not clear tc_depth. This means a packet's loop history is preserved even across namespaces. While this might be restrictive for some topologies, it is also design intent to provide robustness against loops across namespaces. Reviewed-by: Stephen Hemminger Signed-off-by: Jamal Hadi Salim Link: https://patch.msgid.link/20260525122556.973584-2-jhs@mojatatu.com Signed-off-by: Paolo Abeni --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2bcf78a4de7b..3f06254ab1b7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -821,6 +821,7 @@ enum skb_tstamp_type { * @_sk_redir: socket redirection information for skmsg * @_nfct: Associated connection, if any (with nfctinfo bits) * @skb_iif: ifindex of device we arrived on + * @tc_depth: counter for packet duplication * @tc_index: Traffic control index * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices @@ -1030,6 +1031,7 @@ struct sk_buff { __u8 csum_not_inet:1; #endif __u8 unreadable:1; + __u8 tc_depth:2; #if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS) __u16 tc_index; /* traffic control index */ #endif -- cgit v1.2.3 From 20040b2a3cb992f84d3db4c086b909eb9b906b31 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Tue, 26 May 2026 09:45:23 +0200 Subject: dpll: export __dpll_device_change_ntf() for use under dpll_lock Export __dpll_device_change_ntf() so that drivers can send device change notifications from within device callbacks, which are already called under dpll_lock. Using dpll_device_change_ntf() in that context would deadlock. Add lockdep_assert_held() to catch misuse without the lock held. Signed-off-by: Ivan Vecera Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20260526074525.1451008-2-ivecera@redhat.com Signed-off-by: Paolo Abeni --- drivers/dpll/dpll_netlink.c | 13 +++++++++++-- include/linux/dpll.h | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index 0ff1658c2dc1..75e3ae0c16d0 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -829,12 +829,21 @@ int dpll_device_delete_ntf(struct dpll_device *dpll) return dpll_device_event_send(DPLL_CMD_DEVICE_DELETE_NTF, dpll); } -static int -__dpll_device_change_ntf(struct dpll_device *dpll) +/** + * __dpll_device_change_ntf - notify that the dpll device has been changed + * @dpll: registered dpll pointer + * + * Context: caller must hold dpll_lock. Suitable for use inside device + * callbacks which are already invoked under dpll_lock. + * Return: 0 if succeeds, error code otherwise. + */ +int __dpll_device_change_ntf(struct dpll_device *dpll) { + lockdep_assert_held(&dpll_lock); dpll_device_notify(dpll, DPLL_DEVICE_CHANGED); return dpll_device_event_send(DPLL_CMD_DEVICE_CHANGE_NTF, dpll); } +EXPORT_SYMBOL_GPL(__dpll_device_change_ntf); /** * dpll_device_change_ntf - notify that the dpll device has been changed diff --git a/include/linux/dpll.h b/include/linux/dpll.h index f8037f1ab20b..2dbe8567eafc 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -284,6 +284,7 @@ void dpll_pin_on_pin_unregister(struct dpll_pin *parent, struct dpll_pin *pin, int dpll_pin_ref_sync_pair_add(struct dpll_pin *pin, struct dpll_pin *ref_sync_pin); +int __dpll_device_change_ntf(struct dpll_device *dpll); int dpll_device_change_ntf(struct dpll_device *dpll); int __dpll_pin_change_ntf(struct dpll_pin *pin); -- cgit v1.2.3 From 175db11786bde9061db526bf1ac5107d915f5163 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Sat, 16 May 2026 04:34:14 +0900 Subject: Disable -Wattribute-alias for clang-23 and newer Clang recently added support for -Wattribute-alias [1], which results in the same warnings that necessitated commit bee20031772a ("disable -Wattribute-alias warning for SYSCALL_DEFINEx()") for GCC. kernel/time/itimer.c:325:1: error: alias and aliasee have different types 'long (unsigned int)' and 'long (typeof (__builtin_choose_expr((__builtin_types_compatible_p(typeof ((unsigned int)0), typeof (0LL)) || __builtin_types_compatible_p(typeof ((unsigned int)0), typeof (0ULL))), 0LL, 0L)))' (aka 'long (long)') [-Werror,-Wattribute-alias] 325 | SYSCALL_DEFINE1(alarm, unsigned int, seconds) | ^ include/linux/syscalls.h:225:36: note: expanded from macro 'SYSCALL_DEFINE1' 225 | #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) | ^ include/linux/syscalls.h:236:2: note: expanded from macro 'SYSCALL_DEFINEx' 236 | __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) | ^ include/linux/syscalls.h:251:18: note: expanded from macro '__SYSCALL_DEFINEx' 251 | __attribute__((alias(__stringify(__se_sys##name)))); \ | ^ kernel/time/itimer.c:325:1: note: aliasee is declared here include/linux/syscalls.h:225:36: note: expanded from macro 'SYSCALL_DEFINE1' 225 | #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) | ^ include/linux/syscalls.h:236:2: note: expanded from macro 'SYSCALL_DEFINEx' 236 | __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) | ^ include/linux/syscalls.h:255:18: note: expanded from macro '__SYSCALL_DEFINEx' 255 | asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ | ^ :16:1: note: expanded from here 16 | __se_sys_alarm | ^ Disable the warnings in the same way for clang-23 and newer. Disable the warning about unknown warning options to avoid breaking the build for versions of clang-23 that do not have -Wattribute-alias, such as ones deployed by vendors like Android or CI systems or when bisecting LLVM between llvmorg-23-init and release/23.x. Cc: stable@vger.kernel.org Closes: https://github.com/ClangBuiltLinux/linux/issues/2163 Link: https://github.com/llvm/llvm-project/commit/40da6920a0d71d49dfa2392b09153600b0759f5e [1] Link: https://patch.msgid.link/20260515-syscall-disable-attribute-alias-for-clang-v1-1-9a9d95d41df6@kernel.org Signed-off-by: Nathan Chancellor --- arch/riscv/include/asm/syscall_wrapper.h | 4 ++++ include/linux/compat.h | 4 ++++ include/linux/compiler-clang.h | 6 ++++++ include/linux/compiler_types.h | 4 ++++ include/linux/syscalls.h | 4 ++++ 5 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/arch/riscv/include/asm/syscall_wrapper.h b/arch/riscv/include/asm/syscall_wrapper.h index ac80216549ff..226289c3b5c8 100644 --- a/arch/riscv/include/asm/syscall_wrapper.h +++ b/arch/riscv/include/asm/syscall_wrapper.h @@ -32,6 +32,10 @@ asmlinkage long __riscv_sys_ni_syscall(const struct pt_regs *); __diag_push(); \ __diag_ignore(GCC, 8, "-Wattribute-alias", \ "Type aliasing is used to sanitize syscall arguments"); \ + __diag_ignore(clang, 23, "-Wunknown-warning-option", \ + "Avoid breaking versions without -Wattribute-alias"); \ + __diag_ignore(clang, 23, "-Wattribute-alias", \ + "Type aliasing is used to sanitize syscall arguments"); \ static long __se_##prefix##name(ulong, ulong, ulong, ulong, ulong, ulong, \ ulong) \ __attribute__((alias(__stringify(___se_##prefix##name)))); \ diff --git a/include/linux/compat.h b/include/linux/compat.h index 56cebaff0c91..8da0a15c95f4 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -72,6 +72,10 @@ __diag_push(); \ __diag_ignore(GCC, 8, "-Wattribute-alias", \ "Type aliasing is used to sanitize syscall arguments");\ + __diag_ignore(clang, 23, "-Wunknown-warning-option", \ + "Avoid breaking versions without -Wattribute-alias"); \ + __diag_ignore(clang, 23, "-Wattribute-alias", \ + "Type aliasing is used to sanitize syscall arguments"); \ asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ __attribute__((alias(__stringify(__se_compat_sys##name)))); \ ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO); \ diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index e1123dd28486..527e4e136020 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -131,6 +131,12 @@ #define __diag_str(s) __diag_str1(s) #define __diag(s) _Pragma(__diag_str(clang diagnostic s)) +#if CONFIG_CLANG_VERSION >= 230000 +#define __diag_clang_23(s) __diag(s) +#else +#define __diag_clang_23(s) +#endif + #define __diag_clang_13(s) __diag(s) #define __diag_ignore_all(option, comment) \ diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index e8fd77593b68..369966598a2c 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -711,6 +711,10 @@ struct ftrace_likely_data { #define __diag_GCC(version, severity, string) #endif +#ifndef __diag_clang +#define __diag_clang(version, severity, string) +#endif + #define __diag_push() __diag(push) #define __diag_pop() __diag(pop) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index f5639d5ac331..4fb7291f54b6 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -247,6 +247,10 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event) __diag_push(); \ __diag_ignore(GCC, 8, "-Wattribute-alias", \ "Type aliasing is used to sanitize syscall arguments");\ + __diag_ignore(clang, 23, "-Wunknown-warning-option", \ + "Avoid breaking versions without -Wattribute-alias");\ + __diag_ignore(clang, 23, "-Wattribute-alias", \ + "Type aliasing is used to sanitize syscall arguments");\ asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ __attribute__((alias(__stringify(__se_sys##name)))); \ ALLOW_ERROR_INJECTION(sys##name, ERRNO); \ -- cgit v1.2.3