From ef246da8e63c486780dca4d9b4d79589cbebf5e5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 24 Jan 2026 21:14:13 +0200 Subject: dma-buf: Rename .move_notify() callback to a clearer identifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename the .move_notify() callback to .invalidate_mappings() to make its purpose explicit and highlight that it is responsible for invalidating existing mappings. Suggested-by: Christian König Reviewed-by: Christian König Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/20260124-dmabuf-revoke-v5-1-f98fca917e96@nvidia.com Signed-off-by: Christian König --- include/linux/dma-buf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 91f4939db89b..d9ee4499b37d 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -407,7 +407,7 @@ struct dma_buf { * through the device. * * - Dynamic importers should set fences for any access that they can't - * disable immediately from their &dma_buf_attach_ops.move_notify + * disable immediately from their &dma_buf_attach_ops.invalidate_mappings * callback. * * IMPORTANT: @@ -446,7 +446,7 @@ struct dma_buf_attach_ops { bool allow_peer2peer; /** - * @move_notify: [optional] notification that the DMA-buf is moving + * @invalidate_mappings: [optional] notification that the DMA-buf is moving * * If this callback is provided the framework can avoid pinning the * backing store while mappings exists. @@ -463,7 +463,7 @@ struct dma_buf_attach_ops { * New mappings can be created after this callback returns, and will * point to the new location of the DMA-buf. */ - void (*move_notify)(struct dma_buf_attachment *attach); + void (*invalidate_mappings)(struct dma_buf_attachment *attach); }; /** -- cgit v1.2.3 From 95308225e5baeaae1e313816059c59a0036ab6b2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 24 Jan 2026 21:14:14 +0200 Subject: dma-buf: Rename dma_buf_move_notify() to dma_buf_invalidate_mappings() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Along with renaming the .move_notify() callback, rename the corresponding dma-buf core function. This makes the expected behavior clear to exporters calling this function. Signed-off-by: Leon Romanovsky Reviewed-by: Christian König Link: https://lore.kernel.org/r/20260124-dmabuf-revoke-v5-2-f98fca917e96@nvidia.com Signed-off-by: Christian König --- drivers/dma-buf/dma-buf.c | 8 ++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 +- drivers/gpu/drm/xe/xe_bo.c | 2 +- drivers/iommu/iommufd/selftest.c | 2 +- drivers/vfio/pci/vfio_pci_dmabuf.c | 4 ++-- include/linux/dma-buf.h | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index cc9b88214d97..1c257607a623 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -981,7 +981,7 @@ dma_buf_pin_on_map(struct dma_buf_attachment *attach) * 3. Exporters must hold the dma-buf reservation lock when calling these * functions: * - * - dma_buf_move_notify() + * - dma_buf_invalidate_mappings() */ /** @@ -1323,14 +1323,14 @@ void dma_buf_unmap_attachment_unlocked(struct dma_buf_attachment *attach, EXPORT_SYMBOL_NS_GPL(dma_buf_unmap_attachment_unlocked, "DMA_BUF"); /** - * dma_buf_move_notify - notify attachments that DMA-buf is moving + * dma_buf_invalidate_mappings - notify attachments that DMA-buf is moving * * @dmabuf: [in] buffer which is moving * * Informs all attachments that they need to destroy and recreate all their * mappings. */ -void dma_buf_move_notify(struct dma_buf *dmabuf) +void dma_buf_invalidate_mappings(struct dma_buf *dmabuf) { struct dma_buf_attachment *attach; @@ -1340,7 +1340,7 @@ void dma_buf_move_notify(struct dma_buf *dmabuf) if (attach->importer_ops) attach->importer_ops->invalidate_mappings(attach); } -EXPORT_SYMBOL_NS_GPL(dma_buf_move_notify, "DMA_BUF"); +EXPORT_SYMBOL_NS_GPL(dma_buf_invalidate_mappings, "DMA_BUF"); /** * DOC: cpu access diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index e08f58de4b17..f73dc99d1887 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -1270,7 +1270,7 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, if (abo->tbo.base.dma_buf && !drm_gem_is_imported(&abo->tbo.base) && old_mem && old_mem->mem_type != TTM_PL_SYSTEM) - dma_buf_move_notify(abo->tbo.base.dma_buf); + dma_buf_invalidate_mappings(abo->tbo.base.dma_buf); /* move_notify is called before move happens */ trace_amdgpu_bo_move(abo, new_mem ? new_mem->mem_type : -1, diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index b0bd31d14bb9..94712b05edff 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -819,7 +819,7 @@ static int xe_bo_move_notify(struct xe_bo *bo, /* Don't call move_notify() for imported dma-bufs. */ if (ttm_bo->base.dma_buf && !ttm_bo->base.import_attach) - dma_buf_move_notify(ttm_bo->base.dma_buf); + dma_buf_invalidate_mappings(ttm_bo->base.dma_buf); /* * TTM has already nuked the mmap for us (see ttm_bo_unmap_virtual), diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index c4322fd26f93..fd47953db4a3 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -2073,7 +2073,7 @@ static int iommufd_test_dmabuf_revoke(struct iommufd_ucmd *ucmd, int fd, priv = dmabuf->priv; dma_resv_lock(dmabuf->resv, NULL); priv->revoked = revoked; - dma_buf_move_notify(dmabuf); + dma_buf_invalidate_mappings(dmabuf); dma_resv_unlock(dmabuf->resv); err_put: diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index d4d0f7d08c53..362e3d149817 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -320,7 +320,7 @@ void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked) if (priv->revoked != revoked) { dma_resv_lock(priv->dmabuf->resv, NULL); priv->revoked = revoked; - dma_buf_move_notify(priv->dmabuf); + dma_buf_invalidate_mappings(priv->dmabuf); dma_resv_unlock(priv->dmabuf->resv); } fput(priv->dmabuf->file); @@ -341,7 +341,7 @@ void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev) list_del_init(&priv->dmabufs_elm); priv->vdev = NULL; priv->revoked = true; - dma_buf_move_notify(priv->dmabuf); + dma_buf_invalidate_mappings(priv->dmabuf); dma_resv_unlock(priv->dmabuf->resv); vfio_device_put_registration(&vdev->vdev); fput(priv->dmabuf->file); diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index d9ee4499b37d..d0470af8887e 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -588,7 +588,7 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *, enum dma_data_direction); void dma_buf_unmap_attachment(struct dma_buf_attachment *, struct sg_table *, enum dma_data_direction); -void dma_buf_move_notify(struct dma_buf *dma_buf); +void dma_buf_invalidate_mappings(struct dma_buf *dma_buf); int dma_buf_begin_cpu_access(struct dma_buf *dma_buf, enum dma_data_direction dir); int dma_buf_end_cpu_access(struct dma_buf *dma_buf, -- cgit v1.2.3 From 2bcbc706dfa02ae50118173a6f6d8a12e735480c Mon Sep 17 00:00:00 2001 From: Christian König Date: Fri, 19 Dec 2025 11:41:54 +0100 Subject: dma-buf: add dma_fence_was_initialized function v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some driver use fence->ops to test if a fence was initialized or not. The problem is that this utilizes internal behavior of the dma_fence implementation. So better abstract that into a function. v2: use a flag instead of testing fence->ops, rename the function, move to the beginning of the patch set. Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Link: https://lore.kernel.org/r/20260120105655.7134-2-christian.koenig@amd.com --- drivers/dma-buf/dma-fence.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 13 +++++++------ drivers/gpu/drm/qxl/qxl_release.c | 2 +- include/linux/dma-fence.h | 15 +++++++++++++++ 4 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index 21c5c30b4f34..c9a036b0d592 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -1054,7 +1054,7 @@ __dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, fence->lock = lock; fence->context = context; fence->seqno = seqno; - fence->flags = flags; + fence->flags = flags | BIT(DMA_FENCE_FLAG_INITIALIZED_BIT); fence->error = 0; trace_dma_fence_init(fence); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index aaf5477fcd7a..f05683d59f8b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -282,9 +282,10 @@ void amdgpu_job_free_resources(struct amdgpu_job *job) unsigned i; /* Check if any fences were initialized */ - if (job->base.s_fence && job->base.s_fence->finished.ops) + if (job->base.s_fence && + dma_fence_was_initialized(&job->base.s_fence->finished)) f = &job->base.s_fence->finished; - else if (job->hw_fence && job->hw_fence->base.ops) + else if (dma_fence_was_initialized(&job->hw_fence->base)) f = &job->hw_fence->base; else f = NULL; @@ -301,11 +302,11 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job) amdgpu_sync_free(&job->explicit_sync); - if (job->hw_fence->base.ops) + if (dma_fence_was_initialized(&job->hw_fence->base)) dma_fence_put(&job->hw_fence->base); else kfree(job->hw_fence); - if (job->hw_vm_fence->base.ops) + if (dma_fence_was_initialized(&job->hw_vm_fence->base)) dma_fence_put(&job->hw_vm_fence->base); else kfree(job->hw_vm_fence); @@ -339,11 +340,11 @@ void amdgpu_job_free(struct amdgpu_job *job) if (job->gang_submit != &job->base.s_fence->scheduled) dma_fence_put(job->gang_submit); - if (job->hw_fence->base.ops) + if (dma_fence_was_initialized(&job->hw_fence->base)) dma_fence_put(&job->hw_fence->base); else kfree(job->hw_fence); - if (job->hw_vm_fence->base.ops) + if (dma_fence_was_initialized(&job->hw_vm_fence->base)) dma_fence_put(&job->hw_vm_fence->base); else kfree(job->hw_vm_fence); diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c index 7b3c9a6016db..06b0b2aa7953 100644 --- a/drivers/gpu/drm/qxl/qxl_release.c +++ b/drivers/gpu/drm/qxl/qxl_release.c @@ -146,7 +146,7 @@ qxl_release_free(struct qxl_device *qdev, idr_remove(&qdev->release_idr, release->id); spin_unlock(&qdev->release_idr_lock); - if (release->base.ops) { + if (dma_fence_was_initialized(&release->base)) { WARN_ON(list_empty(&release->bos)); qxl_release_free_list(release); diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index d4c92fd35092..9c4d25289239 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -48,6 +48,7 @@ struct seq_file; * atomic ops (bit_*), so taking the spinlock will not be needed most * of the time. * + * DMA_FENCE_FLAG_INITIALIZED_BIT - fence was initialized * DMA_FENCE_FLAG_SIGNALED_BIT - fence is already signaled * DMA_FENCE_FLAG_TIMESTAMP_BIT - timestamp recorded for fence signaling * DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT - enable_signaling might have been called @@ -98,6 +99,7 @@ struct dma_fence { }; enum dma_fence_flag_bits { + DMA_FENCE_FLAG_INITIALIZED_BIT, DMA_FENCE_FLAG_SEQNO64_BIT, DMA_FENCE_FLAG_SIGNALED_BIT, DMA_FENCE_FLAG_TIMESTAMP_BIT, @@ -263,6 +265,19 @@ void dma_fence_release(struct kref *kref); void dma_fence_free(struct dma_fence *fence); void dma_fence_describe(struct dma_fence *fence, struct seq_file *seq); +/** + * dma_fence_was_initialized - test if fence was initialized + * @fence: fence to test + * + * Return: True if fence was ever initialized, false otherwise. Works correctly + * only when memory backing the fence structure is zero initialized on + * allocation. + */ +static inline bool dma_fence_was_initialized(struct dma_fence *fence) +{ + return fence && test_bit(DMA_FENCE_FLAG_INITIALIZED_BIT, &fence->flags); +} + /** * dma_fence_put - decreases refcount of the fence * @fence: fence to reduce refcount of -- cgit v1.2.3 From 4a9671a03f2be13acde0cb15c5208767a9cc56e4 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 6 Feb 2026 08:52:38 +1000 Subject: gpu: Move DRM buddy allocator one level up (part one) Move the DRM buddy allocator one level up so that it can be used by GPU drivers (example, nova-core) that have usecases other than DRM (such as VFIO vGPU support). Modify the API, structures and Kconfigs to use "gpu_buddy" terminology. Adapt the drivers and tests to use the new API. The commit cannot be split due to bisectability, however no functional change is intended. Verified by running K-UNIT tests and build tested various configurations. Signed-off-by: Joel Fernandes Reviewed-by: Dave Airlie [airlied: I've split this into two so git can find copies easier. I've also just nuked drm_random library, that stuff needs to be done elsewhere and only the buddy tests seem to be using it]. Signed-off-by: Dave Airlie --- Documentation/gpu/drm-mm.rst | 6 +- drivers/gpu/Makefile | 2 +- drivers/gpu/buddy.c | 1336 +++++++++++++++++++++++++ drivers/gpu/drm/Kconfig | 4 - drivers/gpu/drm/Kconfig.debug | 1 - drivers/gpu/drm/Makefile | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h | 2 +- drivers/gpu/drm/drm_buddy.c | 1336 ------------------------- drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 2 +- drivers/gpu/drm/i915/i915_scatterlist.c | 2 +- drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 2 +- drivers/gpu/drm/lib/drm_random.c | 44 - drivers/gpu/drm/lib/drm_random.h | 28 - drivers/gpu/drm/tests/Makefile | 1 - drivers/gpu/drm/tests/drm_buddy_test.c | 928 ----------------- drivers/gpu/drm/tests/drm_exec_test.c | 2 - drivers/gpu/drm/tests/drm_mm_test.c | 2 - drivers/gpu/drm/ttm/tests/ttm_mock_manager.h | 2 +- drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h | 2 +- drivers/gpu/tests/Makefile | 4 + drivers/gpu/tests/gpu_buddy_test.c | 928 +++++++++++++++++ drivers/gpu/tests/gpu_random.c | 44 + drivers/gpu/tests/gpu_random.h | 28 + include/drm/drm_buddy.h | 171 ---- include/linux/gpu_buddy.h | 171 ++++ 25 files changed, 2522 insertions(+), 2529 deletions(-) create mode 100644 drivers/gpu/buddy.c delete mode 100644 drivers/gpu/drm/drm_buddy.c delete mode 100644 drivers/gpu/drm/lib/drm_random.c delete mode 100644 drivers/gpu/drm/lib/drm_random.h delete mode 100644 drivers/gpu/drm/tests/drm_buddy_test.c create mode 100644 drivers/gpu/tests/Makefile create mode 100644 drivers/gpu/tests/gpu_buddy_test.c create mode 100644 drivers/gpu/tests/gpu_random.c create mode 100644 drivers/gpu/tests/gpu_random.h delete mode 100644 include/drm/drm_buddy.h create mode 100644 include/linux/gpu_buddy.h (limited to 'include/linux') diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst index f22433470c76..ceee0e663237 100644 --- a/Documentation/gpu/drm-mm.rst +++ b/Documentation/gpu/drm-mm.rst @@ -526,10 +526,10 @@ DRM GPUVM Function References DRM Buddy Allocator =================== -DRM Buddy Function References ------------------------------ +Buddy Allocator Function References (GPU buddy) +----------------------------------------------- -.. kernel-doc:: drivers/gpu/drm/drm_buddy.c +.. kernel-doc:: drivers/gpu/buddy.c :export: DRM Cache Handling and Fast WC memcpy() diff --git a/drivers/gpu/Makefile b/drivers/gpu/Makefile index 36a54d456630..c5292ee2c852 100644 --- a/drivers/gpu/Makefile +++ b/drivers/gpu/Makefile @@ -2,7 +2,7 @@ # drm/tegra depends on host1x, so if both drivers are built-in care must be # taken to initialize them in the correct order. Link order is the only way # to ensure this currently. -obj-y += host1x/ drm/ vga/ +obj-y += host1x/ drm/ vga/ tests/ obj-$(CONFIG_IMX_IPUV3_CORE) += ipu-v3/ obj-$(CONFIG_TRACE_GPU_MEM) += trace/ obj-$(CONFIG_NOVA_CORE) += nova-core/ diff --git a/drivers/gpu/buddy.c b/drivers/gpu/buddy.c new file mode 100644 index 000000000000..4cc63d961d26 --- /dev/null +++ b/drivers/gpu/buddy.c @@ -0,0 +1,1336 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include + +#include +#include +#include +#include + +#include +#include + +enum drm_buddy_free_tree { + DRM_BUDDY_CLEAR_TREE = 0, + DRM_BUDDY_DIRTY_TREE, + DRM_BUDDY_MAX_FREE_TREES, +}; + +static struct kmem_cache *slab_blocks; + +#define for_each_free_tree(tree) \ + for ((tree) = 0; (tree) < DRM_BUDDY_MAX_FREE_TREES; (tree)++) + +static struct drm_buddy_block *drm_block_alloc(struct drm_buddy *mm, + struct drm_buddy_block *parent, + unsigned int order, + u64 offset) +{ + struct drm_buddy_block *block; + + BUG_ON(order > DRM_BUDDY_MAX_ORDER); + + block = kmem_cache_zalloc(slab_blocks, GFP_KERNEL); + if (!block) + return NULL; + + block->header = offset; + block->header |= order; + block->parent = parent; + + RB_CLEAR_NODE(&block->rb); + + BUG_ON(block->header & DRM_BUDDY_HEADER_UNUSED); + return block; +} + +static void drm_block_free(struct drm_buddy *mm, + struct drm_buddy_block *block) +{ + kmem_cache_free(slab_blocks, block); +} + +static enum drm_buddy_free_tree +get_block_tree(struct drm_buddy_block *block) +{ + return drm_buddy_block_is_clear(block) ? + DRM_BUDDY_CLEAR_TREE : DRM_BUDDY_DIRTY_TREE; +} + +static struct drm_buddy_block * +rbtree_get_free_block(const struct rb_node *node) +{ + return node ? rb_entry(node, struct drm_buddy_block, rb) : NULL; +} + +static struct drm_buddy_block * +rbtree_last_free_block(struct rb_root *root) +{ + return rbtree_get_free_block(rb_last(root)); +} + +static bool rbtree_is_empty(struct rb_root *root) +{ + return RB_EMPTY_ROOT(root); +} + +static bool drm_buddy_block_offset_less(const struct drm_buddy_block *block, + const struct drm_buddy_block *node) +{ + return drm_buddy_block_offset(block) < drm_buddy_block_offset(node); +} + +static bool rbtree_block_offset_less(struct rb_node *block, + const struct rb_node *node) +{ + return drm_buddy_block_offset_less(rbtree_get_free_block(block), + rbtree_get_free_block(node)); +} + +static void rbtree_insert(struct drm_buddy *mm, + struct drm_buddy_block *block, + enum drm_buddy_free_tree tree) +{ + rb_add(&block->rb, + &mm->free_trees[tree][drm_buddy_block_order(block)], + rbtree_block_offset_less); +} + +static void rbtree_remove(struct drm_buddy *mm, + struct drm_buddy_block *block) +{ + unsigned int order = drm_buddy_block_order(block); + enum drm_buddy_free_tree tree; + struct rb_root *root; + + tree = get_block_tree(block); + root = &mm->free_trees[tree][order]; + + rb_erase(&block->rb, root); + RB_CLEAR_NODE(&block->rb); +} + +static void clear_reset(struct drm_buddy_block *block) +{ + block->header &= ~DRM_BUDDY_HEADER_CLEAR; +} + +static void mark_cleared(struct drm_buddy_block *block) +{ + block->header |= DRM_BUDDY_HEADER_CLEAR; +} + +static void mark_allocated(struct drm_buddy *mm, + struct drm_buddy_block *block) +{ + block->header &= ~DRM_BUDDY_HEADER_STATE; + block->header |= DRM_BUDDY_ALLOCATED; + + rbtree_remove(mm, block); +} + +static void mark_free(struct drm_buddy *mm, + struct drm_buddy_block *block) +{ + enum drm_buddy_free_tree tree; + + block->header &= ~DRM_BUDDY_HEADER_STATE; + block->header |= DRM_BUDDY_FREE; + + tree = get_block_tree(block); + rbtree_insert(mm, block, tree); +} + +static void mark_split(struct drm_buddy *mm, + struct drm_buddy_block *block) +{ + block->header &= ~DRM_BUDDY_HEADER_STATE; + block->header |= DRM_BUDDY_SPLIT; + + rbtree_remove(mm, block); +} + +static inline bool overlaps(u64 s1, u64 e1, u64 s2, u64 e2) +{ + return s1 <= e2 && e1 >= s2; +} + +static inline bool contains(u64 s1, u64 e1, u64 s2, u64 e2) +{ + return s1 <= s2 && e1 >= e2; +} + +static struct drm_buddy_block * +__get_buddy(struct drm_buddy_block *block) +{ + struct drm_buddy_block *parent; + + parent = block->parent; + if (!parent) + return NULL; + + if (parent->left == block) + return parent->right; + + return parent->left; +} + +static unsigned int __drm_buddy_free(struct drm_buddy *mm, + struct drm_buddy_block *block, + bool force_merge) +{ + struct drm_buddy_block *parent; + unsigned int order; + + while ((parent = block->parent)) { + struct drm_buddy_block *buddy; + + buddy = __get_buddy(block); + + if (!drm_buddy_block_is_free(buddy)) + break; + + if (!force_merge) { + /* + * Check the block and its buddy clear state and exit + * the loop if they both have the dissimilar state. + */ + if (drm_buddy_block_is_clear(block) != + drm_buddy_block_is_clear(buddy)) + break; + + if (drm_buddy_block_is_clear(block)) + mark_cleared(parent); + } + + rbtree_remove(mm, buddy); + if (force_merge && drm_buddy_block_is_clear(buddy)) + mm->clear_avail -= drm_buddy_block_size(mm, buddy); + + drm_block_free(mm, block); + drm_block_free(mm, buddy); + + block = parent; + } + + order = drm_buddy_block_order(block); + mark_free(mm, block); + + return order; +} + +static int __force_merge(struct drm_buddy *mm, + u64 start, + u64 end, + unsigned int min_order) +{ + unsigned int tree, order; + int i; + + if (!min_order) + return -ENOMEM; + + if (min_order > mm->max_order) + return -EINVAL; + + for_each_free_tree(tree) { + for (i = min_order - 1; i >= 0; i--) { + struct rb_node *iter = rb_last(&mm->free_trees[tree][i]); + + while (iter) { + struct drm_buddy_block *block, *buddy; + u64 block_start, block_end; + + block = rbtree_get_free_block(iter); + iter = rb_prev(iter); + + if (!block || !block->parent) + continue; + + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block) - 1; + + if (!contains(start, end, block_start, block_end)) + continue; + + buddy = __get_buddy(block); + if (!drm_buddy_block_is_free(buddy)) + continue; + + WARN_ON(drm_buddy_block_is_clear(block) == + drm_buddy_block_is_clear(buddy)); + + /* + * Advance to the next node when the current node is the buddy, + * as freeing the block will also remove its buddy from the tree. + */ + if (iter == &buddy->rb) + iter = rb_prev(iter); + + rbtree_remove(mm, block); + if (drm_buddy_block_is_clear(block)) + mm->clear_avail -= drm_buddy_block_size(mm, block); + + order = __drm_buddy_free(mm, block, true); + if (order >= min_order) + return 0; + } + } + } + + return -ENOMEM; +} + +/** + * drm_buddy_init - init memory manager + * + * @mm: DRM buddy manager to initialize + * @size: size in bytes to manage + * @chunk_size: minimum page size in bytes for our allocations + * + * Initializes the memory manager and its resources. + * + * Returns: + * 0 on success, error code on failure. + */ +int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size) +{ + unsigned int i, j, root_count = 0; + u64 offset = 0; + + if (size < chunk_size) + return -EINVAL; + + if (chunk_size < SZ_4K) + return -EINVAL; + + if (!is_power_of_2(chunk_size)) + return -EINVAL; + + size = round_down(size, chunk_size); + + mm->size = size; + mm->avail = size; + mm->clear_avail = 0; + mm->chunk_size = chunk_size; + mm->max_order = ilog2(size) - ilog2(chunk_size); + + BUG_ON(mm->max_order > DRM_BUDDY_MAX_ORDER); + + mm->free_trees = kmalloc_array(DRM_BUDDY_MAX_FREE_TREES, + sizeof(*mm->free_trees), + GFP_KERNEL); + if (!mm->free_trees) + return -ENOMEM; + + for_each_free_tree(i) { + mm->free_trees[i] = kmalloc_array(mm->max_order + 1, + sizeof(struct rb_root), + GFP_KERNEL); + if (!mm->free_trees[i]) + goto out_free_tree; + + for (j = 0; j <= mm->max_order; ++j) + mm->free_trees[i][j] = RB_ROOT; + } + + mm->n_roots = hweight64(size); + + mm->roots = kmalloc_array(mm->n_roots, + sizeof(struct drm_buddy_block *), + GFP_KERNEL); + if (!mm->roots) + goto out_free_tree; + + /* + * Split into power-of-two blocks, in case we are given a size that is + * not itself a power-of-two. + */ + do { + struct drm_buddy_block *root; + unsigned int order; + u64 root_size; + + order = ilog2(size) - ilog2(chunk_size); + root_size = chunk_size << order; + + root = drm_block_alloc(mm, NULL, order, offset); + if (!root) + goto out_free_roots; + + mark_free(mm, root); + + BUG_ON(root_count > mm->max_order); + BUG_ON(drm_buddy_block_size(mm, root) < chunk_size); + + mm->roots[root_count] = root; + + offset += root_size; + size -= root_size; + root_count++; + } while (size); + + return 0; + +out_free_roots: + while (root_count--) + drm_block_free(mm, mm->roots[root_count]); + kfree(mm->roots); +out_free_tree: + while (i--) + kfree(mm->free_trees[i]); + kfree(mm->free_trees); + return -ENOMEM; +} +EXPORT_SYMBOL(drm_buddy_init); + +/** + * drm_buddy_fini - tear down the memory manager + * + * @mm: DRM buddy manager to free + * + * Cleanup memory manager resources and the freetree + */ +void drm_buddy_fini(struct drm_buddy *mm) +{ + u64 root_size, size, start; + unsigned int order; + int i; + + size = mm->size; + + for (i = 0; i < mm->n_roots; ++i) { + order = ilog2(size) - ilog2(mm->chunk_size); + start = drm_buddy_block_offset(mm->roots[i]); + __force_merge(mm, start, start + size, order); + + if (WARN_ON(!drm_buddy_block_is_free(mm->roots[i]))) + kunit_fail_current_test("buddy_fini() root"); + + drm_block_free(mm, mm->roots[i]); + + root_size = mm->chunk_size << order; + size -= root_size; + } + + WARN_ON(mm->avail != mm->size); + + for_each_free_tree(i) + kfree(mm->free_trees[i]); + kfree(mm->free_trees); + kfree(mm->roots); +} +EXPORT_SYMBOL(drm_buddy_fini); + +static int split_block(struct drm_buddy *mm, + struct drm_buddy_block *block) +{ + unsigned int block_order = drm_buddy_block_order(block) - 1; + u64 offset = drm_buddy_block_offset(block); + + BUG_ON(!drm_buddy_block_is_free(block)); + BUG_ON(!drm_buddy_block_order(block)); + + block->left = drm_block_alloc(mm, block, block_order, offset); + if (!block->left) + return -ENOMEM; + + block->right = drm_block_alloc(mm, block, block_order, + offset + (mm->chunk_size << block_order)); + if (!block->right) { + drm_block_free(mm, block->left); + return -ENOMEM; + } + + mark_split(mm, block); + + if (drm_buddy_block_is_clear(block)) { + mark_cleared(block->left); + mark_cleared(block->right); + clear_reset(block); + } + + mark_free(mm, block->left); + mark_free(mm, block->right); + + return 0; +} + +/** + * drm_get_buddy - get buddy address + * + * @block: DRM buddy block + * + * Returns the corresponding buddy block for @block, or NULL + * if this is a root block and can't be merged further. + * Requires some kind of locking to protect against + * any concurrent allocate and free operations. + */ +struct drm_buddy_block * +drm_get_buddy(struct drm_buddy_block *block) +{ + return __get_buddy(block); +} +EXPORT_SYMBOL(drm_get_buddy); + +/** + * drm_buddy_reset_clear - reset blocks clear state + * + * @mm: DRM buddy manager + * @is_clear: blocks clear state + * + * Reset the clear state based on @is_clear value for each block + * in the freetree. + */ +void drm_buddy_reset_clear(struct drm_buddy *mm, bool is_clear) +{ + enum drm_buddy_free_tree src_tree, dst_tree; + u64 root_size, size, start; + unsigned int order; + int i; + + size = mm->size; + for (i = 0; i < mm->n_roots; ++i) { + order = ilog2(size) - ilog2(mm->chunk_size); + start = drm_buddy_block_offset(mm->roots[i]); + __force_merge(mm, start, start + size, order); + + root_size = mm->chunk_size << order; + size -= root_size; + } + + src_tree = is_clear ? DRM_BUDDY_DIRTY_TREE : DRM_BUDDY_CLEAR_TREE; + dst_tree = is_clear ? DRM_BUDDY_CLEAR_TREE : DRM_BUDDY_DIRTY_TREE; + + for (i = 0; i <= mm->max_order; ++i) { + struct rb_root *root = &mm->free_trees[src_tree][i]; + struct drm_buddy_block *block, *tmp; + + rbtree_postorder_for_each_entry_safe(block, tmp, root, rb) { + rbtree_remove(mm, block); + if (is_clear) { + mark_cleared(block); + mm->clear_avail += drm_buddy_block_size(mm, block); + } else { + clear_reset(block); + mm->clear_avail -= drm_buddy_block_size(mm, block); + } + + rbtree_insert(mm, block, dst_tree); + } + } +} +EXPORT_SYMBOL(drm_buddy_reset_clear); + +/** + * drm_buddy_free_block - free a block + * + * @mm: DRM buddy manager + * @block: block to be freed + */ +void drm_buddy_free_block(struct drm_buddy *mm, + struct drm_buddy_block *block) +{ + BUG_ON(!drm_buddy_block_is_allocated(block)); + mm->avail += drm_buddy_block_size(mm, block); + if (drm_buddy_block_is_clear(block)) + mm->clear_avail += drm_buddy_block_size(mm, block); + + __drm_buddy_free(mm, block, false); +} +EXPORT_SYMBOL(drm_buddy_free_block); + +static void __drm_buddy_free_list(struct drm_buddy *mm, + struct list_head *objects, + bool mark_clear, + bool mark_dirty) +{ + struct drm_buddy_block *block, *on; + + WARN_ON(mark_dirty && mark_clear); + + list_for_each_entry_safe(block, on, objects, link) { + if (mark_clear) + mark_cleared(block); + else if (mark_dirty) + clear_reset(block); + drm_buddy_free_block(mm, block); + cond_resched(); + } + INIT_LIST_HEAD(objects); +} + +static void drm_buddy_free_list_internal(struct drm_buddy *mm, + struct list_head *objects) +{ + /* + * Don't touch the clear/dirty bit, since allocation is still internal + * at this point. For example we might have just failed part of the + * allocation. + */ + __drm_buddy_free_list(mm, objects, false, false); +} + +/** + * drm_buddy_free_list - free blocks + * + * @mm: DRM buddy manager + * @objects: input list head to free blocks + * @flags: optional flags like DRM_BUDDY_CLEARED + */ +void drm_buddy_free_list(struct drm_buddy *mm, + struct list_head *objects, + unsigned int flags) +{ + bool mark_clear = flags & DRM_BUDDY_CLEARED; + + __drm_buddy_free_list(mm, objects, mark_clear, !mark_clear); +} +EXPORT_SYMBOL(drm_buddy_free_list); + +static bool block_incompatible(struct drm_buddy_block *block, unsigned int flags) +{ + bool needs_clear = flags & DRM_BUDDY_CLEAR_ALLOCATION; + + return needs_clear != drm_buddy_block_is_clear(block); +} + +static struct drm_buddy_block * +__alloc_range_bias(struct drm_buddy *mm, + u64 start, u64 end, + unsigned int order, + unsigned long flags, + bool fallback) +{ + u64 req_size = mm->chunk_size << order; + struct drm_buddy_block *block; + struct drm_buddy_block *buddy; + LIST_HEAD(dfs); + int err; + int i; + + end = end - 1; + + for (i = 0; i < mm->n_roots; ++i) + list_add_tail(&mm->roots[i]->tmp_link, &dfs); + + do { + u64 block_start; + u64 block_end; + + block = list_first_entry_or_null(&dfs, + struct drm_buddy_block, + tmp_link); + if (!block) + break; + + list_del(&block->tmp_link); + + if (drm_buddy_block_order(block) < order) + continue; + + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block) - 1; + + if (!overlaps(start, end, block_start, block_end)) + continue; + + if (drm_buddy_block_is_allocated(block)) + continue; + + if (block_start < start || block_end > end) { + u64 adjusted_start = max(block_start, start); + u64 adjusted_end = min(block_end, end); + + if (round_down(adjusted_end + 1, req_size) <= + round_up(adjusted_start, req_size)) + continue; + } + + if (!fallback && block_incompatible(block, flags)) + continue; + + if (contains(start, end, block_start, block_end) && + order == drm_buddy_block_order(block)) { + /* + * Find the free block within the range. + */ + if (drm_buddy_block_is_free(block)) + return block; + + continue; + } + + if (!drm_buddy_block_is_split(block)) { + err = split_block(mm, block); + if (unlikely(err)) + goto err_undo; + } + + list_add(&block->right->tmp_link, &dfs); + list_add(&block->left->tmp_link, &dfs); + } while (1); + + return ERR_PTR(-ENOSPC); + +err_undo: + /* + * We really don't want to leave around a bunch of split blocks, since + * bigger is better, so make sure we merge everything back before we + * free the allocated blocks. + */ + buddy = __get_buddy(block); + if (buddy && + (drm_buddy_block_is_free(block) && + drm_buddy_block_is_free(buddy))) + __drm_buddy_free(mm, block, false); + return ERR_PTR(err); +} + +static struct drm_buddy_block * +__drm_buddy_alloc_range_bias(struct drm_buddy *mm, + u64 start, u64 end, + unsigned int order, + unsigned long flags) +{ + struct drm_buddy_block *block; + bool fallback = false; + + block = __alloc_range_bias(mm, start, end, order, + flags, fallback); + if (IS_ERR(block)) + return __alloc_range_bias(mm, start, end, order, + flags, !fallback); + + return block; +} + +static struct drm_buddy_block * +get_maxblock(struct drm_buddy *mm, + unsigned int order, + enum drm_buddy_free_tree tree) +{ + struct drm_buddy_block *max_block = NULL, *block = NULL; + struct rb_root *root; + unsigned int i; + + for (i = order; i <= mm->max_order; ++i) { + root = &mm->free_trees[tree][i]; + block = rbtree_last_free_block(root); + if (!block) + continue; + + if (!max_block) { + max_block = block; + continue; + } + + if (drm_buddy_block_offset(block) > + drm_buddy_block_offset(max_block)) { + max_block = block; + } + } + + return max_block; +} + +static struct drm_buddy_block * +alloc_from_freetree(struct drm_buddy *mm, + unsigned int order, + unsigned long flags) +{ + struct drm_buddy_block *block = NULL; + struct rb_root *root; + enum drm_buddy_free_tree tree; + unsigned int tmp; + int err; + + tree = (flags & DRM_BUDDY_CLEAR_ALLOCATION) ? + DRM_BUDDY_CLEAR_TREE : DRM_BUDDY_DIRTY_TREE; + + if (flags & DRM_BUDDY_TOPDOWN_ALLOCATION) { + block = get_maxblock(mm, order, tree); + if (block) + /* Store the obtained block order */ + tmp = drm_buddy_block_order(block); + } else { + for (tmp = order; tmp <= mm->max_order; ++tmp) { + /* Get RB tree root for this order and tree */ + root = &mm->free_trees[tree][tmp]; + block = rbtree_last_free_block(root); + if (block) + break; + } + } + + if (!block) { + /* Try allocating from the other tree */ + tree = (tree == DRM_BUDDY_CLEAR_TREE) ? + DRM_BUDDY_DIRTY_TREE : DRM_BUDDY_CLEAR_TREE; + + for (tmp = order; tmp <= mm->max_order; ++tmp) { + root = &mm->free_trees[tree][tmp]; + block = rbtree_last_free_block(root); + if (block) + break; + } + + if (!block) + return ERR_PTR(-ENOSPC); + } + + BUG_ON(!drm_buddy_block_is_free(block)); + + while (tmp != order) { + err = split_block(mm, block); + if (unlikely(err)) + goto err_undo; + + block = block->right; + tmp--; + } + return block; + +err_undo: + if (tmp != order) + __drm_buddy_free(mm, block, false); + return ERR_PTR(err); +} + +static int __alloc_range(struct drm_buddy *mm, + struct list_head *dfs, + u64 start, u64 size, + struct list_head *blocks, + u64 *total_allocated_on_err) +{ + struct drm_buddy_block *block; + struct drm_buddy_block *buddy; + u64 total_allocated = 0; + LIST_HEAD(allocated); + u64 end; + int err; + + end = start + size - 1; + + do { + u64 block_start; + u64 block_end; + + block = list_first_entry_or_null(dfs, + struct drm_buddy_block, + tmp_link); + if (!block) + break; + + list_del(&block->tmp_link); + + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block) - 1; + + if (!overlaps(start, end, block_start, block_end)) + continue; + + if (drm_buddy_block_is_allocated(block)) { + err = -ENOSPC; + goto err_free; + } + + if (contains(start, end, block_start, block_end)) { + if (drm_buddy_block_is_free(block)) { + mark_allocated(mm, block); + total_allocated += drm_buddy_block_size(mm, block); + mm->avail -= drm_buddy_block_size(mm, block); + if (drm_buddy_block_is_clear(block)) + mm->clear_avail -= drm_buddy_block_size(mm, block); + list_add_tail(&block->link, &allocated); + continue; + } else if (!mm->clear_avail) { + err = -ENOSPC; + goto err_free; + } + } + + if (!drm_buddy_block_is_split(block)) { + err = split_block(mm, block); + if (unlikely(err)) + goto err_undo; + } + + list_add(&block->right->tmp_link, dfs); + list_add(&block->left->tmp_link, dfs); + } while (1); + + if (total_allocated < size) { + err = -ENOSPC; + goto err_free; + } + + list_splice_tail(&allocated, blocks); + + return 0; + +err_undo: + /* + * We really don't want to leave around a bunch of split blocks, since + * bigger is better, so make sure we merge everything back before we + * free the allocated blocks. + */ + buddy = __get_buddy(block); + if (buddy && + (drm_buddy_block_is_free(block) && + drm_buddy_block_is_free(buddy))) + __drm_buddy_free(mm, block, false); + +err_free: + if (err == -ENOSPC && total_allocated_on_err) { + list_splice_tail(&allocated, blocks); + *total_allocated_on_err = total_allocated; + } else { + drm_buddy_free_list_internal(mm, &allocated); + } + + return err; +} + +static int __drm_buddy_alloc_range(struct drm_buddy *mm, + u64 start, + u64 size, + u64 *total_allocated_on_err, + struct list_head *blocks) +{ + LIST_HEAD(dfs); + int i; + + for (i = 0; i < mm->n_roots; ++i) + list_add_tail(&mm->roots[i]->tmp_link, &dfs); + + return __alloc_range(mm, &dfs, start, size, + blocks, total_allocated_on_err); +} + +static int __alloc_contig_try_harder(struct drm_buddy *mm, + u64 size, + u64 min_block_size, + struct list_head *blocks) +{ + u64 rhs_offset, lhs_offset, lhs_size, filled; + struct drm_buddy_block *block; + unsigned int tree, order; + LIST_HEAD(blocks_lhs); + unsigned long pages; + u64 modify_size; + int err; + + modify_size = rounddown_pow_of_two(size); + pages = modify_size >> ilog2(mm->chunk_size); + order = fls(pages) - 1; + if (order == 0) + return -ENOSPC; + + for_each_free_tree(tree) { + struct rb_root *root; + struct rb_node *iter; + + root = &mm->free_trees[tree][order]; + if (rbtree_is_empty(root)) + continue; + + iter = rb_last(root); + while (iter) { + block = rbtree_get_free_block(iter); + + /* Allocate blocks traversing RHS */ + rhs_offset = drm_buddy_block_offset(block); + err = __drm_buddy_alloc_range(mm, rhs_offset, size, + &filled, blocks); + if (!err || err != -ENOSPC) + return err; + + lhs_size = max((size - filled), min_block_size); + if (!IS_ALIGNED(lhs_size, min_block_size)) + lhs_size = round_up(lhs_size, min_block_size); + + /* Allocate blocks traversing LHS */ + lhs_offset = drm_buddy_block_offset(block) - lhs_size; + err = __drm_buddy_alloc_range(mm, lhs_offset, lhs_size, + NULL, &blocks_lhs); + if (!err) { + list_splice(&blocks_lhs, blocks); + return 0; + } else if (err != -ENOSPC) { + drm_buddy_free_list_internal(mm, blocks); + return err; + } + /* Free blocks for the next iteration */ + drm_buddy_free_list_internal(mm, blocks); + + iter = rb_prev(iter); + } + } + + return -ENOSPC; +} + +/** + * drm_buddy_block_trim - free unused pages + * + * @mm: DRM buddy manager + * @start: start address to begin the trimming. + * @new_size: original size requested + * @blocks: Input and output list of allocated blocks. + * MUST contain single block as input to be trimmed. + * On success will contain the newly allocated blocks + * making up the @new_size. Blocks always appear in + * ascending order + * + * For contiguous allocation, we round up the size to the nearest + * power of two value, drivers consume *actual* size, so remaining + * portions are unused and can be optionally freed with this function + * + * Returns: + * 0 on success, error code on failure. + */ +int drm_buddy_block_trim(struct drm_buddy *mm, + u64 *start, + u64 new_size, + struct list_head *blocks) +{ + struct drm_buddy_block *parent; + struct drm_buddy_block *block; + u64 block_start, block_end; + LIST_HEAD(dfs); + u64 new_start; + int err; + + if (!list_is_singular(blocks)) + return -EINVAL; + + block = list_first_entry(blocks, + struct drm_buddy_block, + link); + + block_start = drm_buddy_block_offset(block); + block_end = block_start + drm_buddy_block_size(mm, block); + + if (WARN_ON(!drm_buddy_block_is_allocated(block))) + return -EINVAL; + + if (new_size > drm_buddy_block_size(mm, block)) + return -EINVAL; + + if (!new_size || !IS_ALIGNED(new_size, mm->chunk_size)) + return -EINVAL; + + if (new_size == drm_buddy_block_size(mm, block)) + return 0; + + new_start = block_start; + if (start) { + new_start = *start; + + if (new_start < block_start) + return -EINVAL; + + if (!IS_ALIGNED(new_start, mm->chunk_size)) + return -EINVAL; + + if (range_overflows(new_start, new_size, block_end)) + return -EINVAL; + } + + list_del(&block->link); + mark_free(mm, block); + mm->avail += drm_buddy_block_size(mm, block); + if (drm_buddy_block_is_clear(block)) + mm->clear_avail += drm_buddy_block_size(mm, block); + + /* Prevent recursively freeing this node */ + parent = block->parent; + block->parent = NULL; + + list_add(&block->tmp_link, &dfs); + err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL); + if (err) { + mark_allocated(mm, block); + mm->avail -= drm_buddy_block_size(mm, block); + if (drm_buddy_block_is_clear(block)) + mm->clear_avail -= drm_buddy_block_size(mm, block); + list_add(&block->link, blocks); + } + + block->parent = parent; + return err; +} +EXPORT_SYMBOL(drm_buddy_block_trim); + +static struct drm_buddy_block * +__drm_buddy_alloc_blocks(struct drm_buddy *mm, + u64 start, u64 end, + unsigned int order, + unsigned long flags) +{ + if (flags & DRM_BUDDY_RANGE_ALLOCATION) + /* Allocate traversing within the range */ + return __drm_buddy_alloc_range_bias(mm, start, end, + order, flags); + else + /* Allocate from freetree */ + return alloc_from_freetree(mm, order, flags); +} + +/** + * drm_buddy_alloc_blocks - allocate power-of-two blocks + * + * @mm: DRM buddy manager to allocate from + * @start: start of the allowed range for this block + * @end: end of the allowed range for this block + * @size: size of the allocation in bytes + * @min_block_size: alignment of the allocation + * @blocks: output list head to add allocated blocks + * @flags: DRM_BUDDY_*_ALLOCATION flags + * + * alloc_range_bias() called on range limitations, which traverses + * the tree and returns the desired block. + * + * alloc_from_freetree() called when *no* range restrictions + * are enforced, which picks the block from the freetree. + * + * Returns: + * 0 on success, error code on failure. + */ +int drm_buddy_alloc_blocks(struct drm_buddy *mm, + u64 start, u64 end, u64 size, + u64 min_block_size, + struct list_head *blocks, + unsigned long flags) +{ + struct drm_buddy_block *block = NULL; + u64 original_size, original_min_size; + unsigned int min_order, order; + LIST_HEAD(allocated); + unsigned long pages; + int err; + + if (size < mm->chunk_size) + return -EINVAL; + + if (min_block_size < mm->chunk_size) + return -EINVAL; + + if (!is_power_of_2(min_block_size)) + return -EINVAL; + + if (!IS_ALIGNED(start | end | size, mm->chunk_size)) + return -EINVAL; + + if (end > mm->size) + return -EINVAL; + + if (range_overflows(start, size, mm->size)) + return -EINVAL; + + /* Actual range allocation */ + if (start + size == end) { + if (!IS_ALIGNED(start | end, min_block_size)) + return -EINVAL; + + return __drm_buddy_alloc_range(mm, start, size, NULL, blocks); + } + + original_size = size; + original_min_size = min_block_size; + + /* Roundup the size to power of 2 */ + if (flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION) { + size = roundup_pow_of_two(size); + min_block_size = size; + /* Align size value to min_block_size */ + } else if (!IS_ALIGNED(size, min_block_size)) { + size = round_up(size, min_block_size); + } + + pages = size >> ilog2(mm->chunk_size); + order = fls(pages) - 1; + min_order = ilog2(min_block_size) - ilog2(mm->chunk_size); + + if (order > mm->max_order || size > mm->size) { + if ((flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION) && + !(flags & DRM_BUDDY_RANGE_ALLOCATION)) + return __alloc_contig_try_harder(mm, original_size, + original_min_size, blocks); + + return -EINVAL; + } + + do { + order = min(order, (unsigned int)fls(pages) - 1); + BUG_ON(order > mm->max_order); + BUG_ON(order < min_order); + + do { + block = __drm_buddy_alloc_blocks(mm, start, + end, + order, + flags); + if (!IS_ERR(block)) + break; + + if (order-- == min_order) { + /* Try allocation through force merge method */ + if (mm->clear_avail && + !__force_merge(mm, start, end, min_order)) { + block = __drm_buddy_alloc_blocks(mm, start, + end, + min_order, + flags); + if (!IS_ERR(block)) { + order = min_order; + break; + } + } + + /* + * Try contiguous block allocation through + * try harder method. + */ + if (flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION && + !(flags & DRM_BUDDY_RANGE_ALLOCATION)) + return __alloc_contig_try_harder(mm, + original_size, + original_min_size, + blocks); + err = -ENOSPC; + goto err_free; + } + } while (1); + + mark_allocated(mm, block); + mm->avail -= drm_buddy_block_size(mm, block); + if (drm_buddy_block_is_clear(block)) + mm->clear_avail -= drm_buddy_block_size(mm, block); + kmemleak_update_trace(block); + list_add_tail(&block->link, &allocated); + + pages -= BIT(order); + + if (!pages) + break; + } while (1); + + /* Trim the allocated block to the required size */ + if (!(flags & DRM_BUDDY_TRIM_DISABLE) && + original_size != size) { + struct list_head *trim_list; + LIST_HEAD(temp); + u64 trim_size; + + trim_list = &allocated; + trim_size = original_size; + + if (!list_is_singular(&allocated)) { + block = list_last_entry(&allocated, typeof(*block), link); + list_move(&block->link, &temp); + trim_list = &temp; + trim_size = drm_buddy_block_size(mm, block) - + (size - original_size); + } + + drm_buddy_block_trim(mm, + NULL, + trim_size, + trim_list); + + if (!list_empty(&temp)) + list_splice_tail(trim_list, &allocated); + } + + list_splice_tail(&allocated, blocks); + return 0; + +err_free: + drm_buddy_free_list_internal(mm, &allocated); + return err; +} +EXPORT_SYMBOL(drm_buddy_alloc_blocks); + +/** + * drm_buddy_block_print - print block information + * + * @mm: DRM buddy manager + * @block: DRM buddy block + * @p: DRM printer to use + */ +void drm_buddy_block_print(struct drm_buddy *mm, + struct drm_buddy_block *block, + struct drm_printer *p) +{ + u64 start = drm_buddy_block_offset(block); + u64 size = drm_buddy_block_size(mm, block); + + drm_printf(p, "%#018llx-%#018llx: %llu\n", start, start + size, size); +} +EXPORT_SYMBOL(drm_buddy_block_print); + +/** + * drm_buddy_print - print allocator state + * + * @mm: DRM buddy manager + * @p: DRM printer to use + */ +void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p) +{ + int order; + + drm_printf(p, "chunk_size: %lluKiB, total: %lluMiB, free: %lluMiB, clear_free: %lluMiB\n", + mm->chunk_size >> 10, mm->size >> 20, mm->avail >> 20, mm->clear_avail >> 20); + + for (order = mm->max_order; order >= 0; order--) { + struct drm_buddy_block *block, *tmp; + struct rb_root *root; + u64 count = 0, free; + unsigned int tree; + + for_each_free_tree(tree) { + root = &mm->free_trees[tree][order]; + + rbtree_postorder_for_each_entry_safe(block, tmp, root, rb) { + BUG_ON(!drm_buddy_block_is_free(block)); + count++; + } + } + + drm_printf(p, "order-%2d ", order); + + free = count * (mm->chunk_size << order); + if (free < SZ_1M) + drm_printf(p, "free: %8llu KiB", free >> 10); + else + drm_printf(p, "free: %8llu MiB", free >> 20); + + drm_printf(p, ", blocks: %llu\n", count); + } +} +EXPORT_SYMBOL(drm_buddy_print); + +static void drm_buddy_module_exit(void) +{ + kmem_cache_destroy(slab_blocks); +} + +static int __init drm_buddy_module_init(void) +{ + slab_blocks = KMEM_CACHE(drm_buddy_block, 0); + if (!slab_blocks) + return -ENOMEM; + + return 0; +} + +module_init(drm_buddy_module_init); +module_exit(drm_buddy_module_exit); + +MODULE_DESCRIPTION("DRM Buddy Allocator"); +MODULE_LICENSE("Dual MIT/GPL"); diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 5888eb147ed1..862ff4000969 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -269,10 +269,6 @@ config DRM_SCHED config DRM_PANEL_BACKLIGHT_QUIRKS tristate -config DRM_LIB_RANDOM - bool - default n - config DRM_PRIVACY_SCREEN bool default n diff --git a/drivers/gpu/drm/Kconfig.debug b/drivers/gpu/drm/Kconfig.debug index 05dc43c0b8c5..3b7886865335 100644 --- a/drivers/gpu/drm/Kconfig.debug +++ b/drivers/gpu/drm/Kconfig.debug @@ -69,7 +69,6 @@ config DRM_KUNIT_TEST select DRM_EXPORT_FOR_TESTS if m select DRM_GEM_SHMEM_HELPER select DRM_KUNIT_TEST_HELPERS - select DRM_LIB_RANDOM select DRM_SYSFB_HELPER select PRIME_NUMBERS default KUNIT_ALL_TESTS diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile index 75840ec4d782..892859cfe95f 100644 --- a/drivers/gpu/drm/Makefile +++ b/drivers/gpu/drm/Makefile @@ -79,7 +79,6 @@ drm-$(CONFIG_DRM_CLIENT) += \ drm_client_event.o \ drm_client_modeset.o \ drm_client_sysrq.o -drm-$(CONFIG_DRM_LIB_RANDOM) += lib/drm_random.o drm-$(CONFIG_COMPAT) += drm_ioc32.o drm-$(CONFIG_DRM_PANEL) += drm_panel.o drm-$(CONFIG_OF) += drm_of.o @@ -115,7 +114,7 @@ drm_gpusvm_helper-$(CONFIG_ZONE_DEVICE) += \ obj-$(CONFIG_DRM_GPUSVM) += drm_gpusvm_helper.o -obj-$(CONFIG_DRM_BUDDY) += drm_buddy.o +obj-$(CONFIG_DRM_BUDDY) += ../buddy.o drm_dma_helper-y := drm_gem_dma_helper.o drm_dma_helper-$(CONFIG_DRM_FBDEV_EMULATION) += drm_fbdev_dma.o diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h index 5f5fd9a911c2..874779618056 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h @@ -24,7 +24,7 @@ #ifndef __AMDGPU_VRAM_MGR_H__ #define __AMDGPU_VRAM_MGR_H__ -#include +#include struct amdgpu_vram_mgr { struct ttm_resource_manager manager; diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c deleted file mode 100644 index fd34d3755f7c..000000000000 --- a/drivers/gpu/drm/drm_buddy.c +++ /dev/null @@ -1,1336 +0,0 @@ -// SPDX-License-Identifier: MIT -/* - * Copyright © 2021 Intel Corporation - */ - -#include - -#include -#include -#include -#include - -#include -#include - -enum drm_buddy_free_tree { - DRM_BUDDY_CLEAR_TREE = 0, - DRM_BUDDY_DIRTY_TREE, - DRM_BUDDY_MAX_FREE_TREES, -}; - -static struct kmem_cache *slab_blocks; - -#define for_each_free_tree(tree) \ - for ((tree) = 0; (tree) < DRM_BUDDY_MAX_FREE_TREES; (tree)++) - -static struct drm_buddy_block *drm_block_alloc(struct drm_buddy *mm, - struct drm_buddy_block *parent, - unsigned int order, - u64 offset) -{ - struct drm_buddy_block *block; - - BUG_ON(order > DRM_BUDDY_MAX_ORDER); - - block = kmem_cache_zalloc(slab_blocks, GFP_KERNEL); - if (!block) - return NULL; - - block->header = offset; - block->header |= order; - block->parent = parent; - - RB_CLEAR_NODE(&block->rb); - - BUG_ON(block->header & DRM_BUDDY_HEADER_UNUSED); - return block; -} - -static void drm_block_free(struct drm_buddy *mm, - struct drm_buddy_block *block) -{ - kmem_cache_free(slab_blocks, block); -} - -static enum drm_buddy_free_tree -get_block_tree(struct drm_buddy_block *block) -{ - return drm_buddy_block_is_clear(block) ? - DRM_BUDDY_CLEAR_TREE : DRM_BUDDY_DIRTY_TREE; -} - -static struct drm_buddy_block * -rbtree_get_free_block(const struct rb_node *node) -{ - return node ? rb_entry(node, struct drm_buddy_block, rb) : NULL; -} - -static struct drm_buddy_block * -rbtree_last_free_block(struct rb_root *root) -{ - return rbtree_get_free_block(rb_last(root)); -} - -static bool rbtree_is_empty(struct rb_root *root) -{ - return RB_EMPTY_ROOT(root); -} - -static bool drm_buddy_block_offset_less(const struct drm_buddy_block *block, - const struct drm_buddy_block *node) -{ - return drm_buddy_block_offset(block) < drm_buddy_block_offset(node); -} - -static bool rbtree_block_offset_less(struct rb_node *block, - const struct rb_node *node) -{ - return drm_buddy_block_offset_less(rbtree_get_free_block(block), - rbtree_get_free_block(node)); -} - -static void rbtree_insert(struct drm_buddy *mm, - struct drm_buddy_block *block, - enum drm_buddy_free_tree tree) -{ - rb_add(&block->rb, - &mm->free_trees[tree][drm_buddy_block_order(block)], - rbtree_block_offset_less); -} - -static void rbtree_remove(struct drm_buddy *mm, - struct drm_buddy_block *block) -{ - unsigned int order = drm_buddy_block_order(block); - enum drm_buddy_free_tree tree; - struct rb_root *root; - - tree = get_block_tree(block); - root = &mm->free_trees[tree][order]; - - rb_erase(&block->rb, root); - RB_CLEAR_NODE(&block->rb); -} - -static void clear_reset(struct drm_buddy_block *block) -{ - block->header &= ~DRM_BUDDY_HEADER_CLEAR; -} - -static void mark_cleared(struct drm_buddy_block *block) -{ - block->header |= DRM_BUDDY_HEADER_CLEAR; -} - -static void mark_allocated(struct drm_buddy *mm, - struct drm_buddy_block *block) -{ - block->header &= ~DRM_BUDDY_HEADER_STATE; - block->header |= DRM_BUDDY_ALLOCATED; - - rbtree_remove(mm, block); -} - -static void mark_free(struct drm_buddy *mm, - struct drm_buddy_block *block) -{ - enum drm_buddy_free_tree tree; - - block->header &= ~DRM_BUDDY_HEADER_STATE; - block->header |= DRM_BUDDY_FREE; - - tree = get_block_tree(block); - rbtree_insert(mm, block, tree); -} - -static void mark_split(struct drm_buddy *mm, - struct drm_buddy_block *block) -{ - block->header &= ~DRM_BUDDY_HEADER_STATE; - block->header |= DRM_BUDDY_SPLIT; - - rbtree_remove(mm, block); -} - -static inline bool overlaps(u64 s1, u64 e1, u64 s2, u64 e2) -{ - return s1 <= e2 && e1 >= s2; -} - -static inline bool contains(u64 s1, u64 e1, u64 s2, u64 e2) -{ - return s1 <= s2 && e1 >= e2; -} - -static struct drm_buddy_block * -__get_buddy(struct drm_buddy_block *block) -{ - struct drm_buddy_block *parent; - - parent = block->parent; - if (!parent) - return NULL; - - if (parent->left == block) - return parent->right; - - return parent->left; -} - -static unsigned int __drm_buddy_free(struct drm_buddy *mm, - struct drm_buddy_block *block, - bool force_merge) -{ - struct drm_buddy_block *parent; - unsigned int order; - - while ((parent = block->parent)) { - struct drm_buddy_block *buddy; - - buddy = __get_buddy(block); - - if (!drm_buddy_block_is_free(buddy)) - break; - - if (!force_merge) { - /* - * Check the block and its buddy clear state and exit - * the loop if they both have the dissimilar state. - */ - if (drm_buddy_block_is_clear(block) != - drm_buddy_block_is_clear(buddy)) - break; - - if (drm_buddy_block_is_clear(block)) - mark_cleared(parent); - } - - rbtree_remove(mm, buddy); - if (force_merge && drm_buddy_block_is_clear(buddy)) - mm->clear_avail -= drm_buddy_block_size(mm, buddy); - - drm_block_free(mm, block); - drm_block_free(mm, buddy); - - block = parent; - } - - order = drm_buddy_block_order(block); - mark_free(mm, block); - - return order; -} - -static int __force_merge(struct drm_buddy *mm, - u64 start, - u64 end, - unsigned int min_order) -{ - unsigned int tree, order; - int i; - - if (!min_order) - return -ENOMEM; - - if (min_order > mm->max_order) - return -EINVAL; - - for_each_free_tree(tree) { - for (i = min_order - 1; i >= 0; i--) { - struct rb_node *iter = rb_last(&mm->free_trees[tree][i]); - - while (iter) { - struct drm_buddy_block *block, *buddy; - u64 block_start, block_end; - - block = rbtree_get_free_block(iter); - iter = rb_prev(iter); - - if (!block || !block->parent) - continue; - - block_start = drm_buddy_block_offset(block); - block_end = block_start + drm_buddy_block_size(mm, block) - 1; - - if (!contains(start, end, block_start, block_end)) - continue; - - buddy = __get_buddy(block); - if (!drm_buddy_block_is_free(buddy)) - continue; - - WARN_ON(drm_buddy_block_is_clear(block) == - drm_buddy_block_is_clear(buddy)); - - /* - * Advance to the next node when the current node is the buddy, - * as freeing the block will also remove its buddy from the tree. - */ - if (iter == &buddy->rb) - iter = rb_prev(iter); - - rbtree_remove(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail -= drm_buddy_block_size(mm, block); - - order = __drm_buddy_free(mm, block, true); - if (order >= min_order) - return 0; - } - } - } - - return -ENOMEM; -} - -/** - * drm_buddy_init - init memory manager - * - * @mm: DRM buddy manager to initialize - * @size: size in bytes to manage - * @chunk_size: minimum page size in bytes for our allocations - * - * Initializes the memory manager and its resources. - * - * Returns: - * 0 on success, error code on failure. - */ -int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size) -{ - unsigned int i, j, root_count = 0; - u64 offset = 0; - - if (size < chunk_size) - return -EINVAL; - - if (chunk_size < SZ_4K) - return -EINVAL; - - if (!is_power_of_2(chunk_size)) - return -EINVAL; - - size = round_down(size, chunk_size); - - mm->size = size; - mm->avail = size; - mm->clear_avail = 0; - mm->chunk_size = chunk_size; - mm->max_order = ilog2(size) - ilog2(chunk_size); - - BUG_ON(mm->max_order > DRM_BUDDY_MAX_ORDER); - - mm->free_trees = kmalloc_array(DRM_BUDDY_MAX_FREE_TREES, - sizeof(*mm->free_trees), - GFP_KERNEL); - if (!mm->free_trees) - return -ENOMEM; - - for_each_free_tree(i) { - mm->free_trees[i] = kmalloc_array(mm->max_order + 1, - sizeof(struct rb_root), - GFP_KERNEL); - if (!mm->free_trees[i]) - goto out_free_tree; - - for (j = 0; j <= mm->max_order; ++j) - mm->free_trees[i][j] = RB_ROOT; - } - - mm->n_roots = hweight64(size); - - mm->roots = kmalloc_array(mm->n_roots, - sizeof(struct drm_buddy_block *), - GFP_KERNEL); - if (!mm->roots) - goto out_free_tree; - - /* - * Split into power-of-two blocks, in case we are given a size that is - * not itself a power-of-two. - */ - do { - struct drm_buddy_block *root; - unsigned int order; - u64 root_size; - - order = ilog2(size) - ilog2(chunk_size); - root_size = chunk_size << order; - - root = drm_block_alloc(mm, NULL, order, offset); - if (!root) - goto out_free_roots; - - mark_free(mm, root); - - BUG_ON(root_count > mm->max_order); - BUG_ON(drm_buddy_block_size(mm, root) < chunk_size); - - mm->roots[root_count] = root; - - offset += root_size; - size -= root_size; - root_count++; - } while (size); - - return 0; - -out_free_roots: - while (root_count--) - drm_block_free(mm, mm->roots[root_count]); - kfree(mm->roots); -out_free_tree: - while (i--) - kfree(mm->free_trees[i]); - kfree(mm->free_trees); - return -ENOMEM; -} -EXPORT_SYMBOL(drm_buddy_init); - -/** - * drm_buddy_fini - tear down the memory manager - * - * @mm: DRM buddy manager to free - * - * Cleanup memory manager resources and the freetree - */ -void drm_buddy_fini(struct drm_buddy *mm) -{ - u64 root_size, size, start; - unsigned int order; - int i; - - size = mm->size; - - for (i = 0; i < mm->n_roots; ++i) { - order = ilog2(size) - ilog2(mm->chunk_size); - start = drm_buddy_block_offset(mm->roots[i]); - __force_merge(mm, start, start + size, order); - - if (WARN_ON(!drm_buddy_block_is_free(mm->roots[i]))) - kunit_fail_current_test("buddy_fini() root"); - - drm_block_free(mm, mm->roots[i]); - - root_size = mm->chunk_size << order; - size -= root_size; - } - - WARN_ON(mm->avail != mm->size); - - for_each_free_tree(i) - kfree(mm->free_trees[i]); - kfree(mm->free_trees); - kfree(mm->roots); -} -EXPORT_SYMBOL(drm_buddy_fini); - -static int split_block(struct drm_buddy *mm, - struct drm_buddy_block *block) -{ - unsigned int block_order = drm_buddy_block_order(block) - 1; - u64 offset = drm_buddy_block_offset(block); - - BUG_ON(!drm_buddy_block_is_free(block)); - BUG_ON(!drm_buddy_block_order(block)); - - block->left = drm_block_alloc(mm, block, block_order, offset); - if (!block->left) - return -ENOMEM; - - block->right = drm_block_alloc(mm, block, block_order, - offset + (mm->chunk_size << block_order)); - if (!block->right) { - drm_block_free(mm, block->left); - return -ENOMEM; - } - - mark_split(mm, block); - - if (drm_buddy_block_is_clear(block)) { - mark_cleared(block->left); - mark_cleared(block->right); - clear_reset(block); - } - - mark_free(mm, block->left); - mark_free(mm, block->right); - - return 0; -} - -/** - * drm_get_buddy - get buddy address - * - * @block: DRM buddy block - * - * Returns the corresponding buddy block for @block, or NULL - * if this is a root block and can't be merged further. - * Requires some kind of locking to protect against - * any concurrent allocate and free operations. - */ -struct drm_buddy_block * -drm_get_buddy(struct drm_buddy_block *block) -{ - return __get_buddy(block); -} -EXPORT_SYMBOL(drm_get_buddy); - -/** - * drm_buddy_reset_clear - reset blocks clear state - * - * @mm: DRM buddy manager - * @is_clear: blocks clear state - * - * Reset the clear state based on @is_clear value for each block - * in the freetree. - */ -void drm_buddy_reset_clear(struct drm_buddy *mm, bool is_clear) -{ - enum drm_buddy_free_tree src_tree, dst_tree; - u64 root_size, size, start; - unsigned int order; - int i; - - size = mm->size; - for (i = 0; i < mm->n_roots; ++i) { - order = ilog2(size) - ilog2(mm->chunk_size); - start = drm_buddy_block_offset(mm->roots[i]); - __force_merge(mm, start, start + size, order); - - root_size = mm->chunk_size << order; - size -= root_size; - } - - src_tree = is_clear ? DRM_BUDDY_DIRTY_TREE : DRM_BUDDY_CLEAR_TREE; - dst_tree = is_clear ? DRM_BUDDY_CLEAR_TREE : DRM_BUDDY_DIRTY_TREE; - - for (i = 0; i <= mm->max_order; ++i) { - struct rb_root *root = &mm->free_trees[src_tree][i]; - struct drm_buddy_block *block, *tmp; - - rbtree_postorder_for_each_entry_safe(block, tmp, root, rb) { - rbtree_remove(mm, block); - if (is_clear) { - mark_cleared(block); - mm->clear_avail += drm_buddy_block_size(mm, block); - } else { - clear_reset(block); - mm->clear_avail -= drm_buddy_block_size(mm, block); - } - - rbtree_insert(mm, block, dst_tree); - } - } -} -EXPORT_SYMBOL(drm_buddy_reset_clear); - -/** - * drm_buddy_free_block - free a block - * - * @mm: DRM buddy manager - * @block: block to be freed - */ -void drm_buddy_free_block(struct drm_buddy *mm, - struct drm_buddy_block *block) -{ - BUG_ON(!drm_buddy_block_is_allocated(block)); - mm->avail += drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail += drm_buddy_block_size(mm, block); - - __drm_buddy_free(mm, block, false); -} -EXPORT_SYMBOL(drm_buddy_free_block); - -static void __drm_buddy_free_list(struct drm_buddy *mm, - struct list_head *objects, - bool mark_clear, - bool mark_dirty) -{ - struct drm_buddy_block *block, *on; - - WARN_ON(mark_dirty && mark_clear); - - list_for_each_entry_safe(block, on, objects, link) { - if (mark_clear) - mark_cleared(block); - else if (mark_dirty) - clear_reset(block); - drm_buddy_free_block(mm, block); - cond_resched(); - } - INIT_LIST_HEAD(objects); -} - -static void drm_buddy_free_list_internal(struct drm_buddy *mm, - struct list_head *objects) -{ - /* - * Don't touch the clear/dirty bit, since allocation is still internal - * at this point. For example we might have just failed part of the - * allocation. - */ - __drm_buddy_free_list(mm, objects, false, false); -} - -/** - * drm_buddy_free_list - free blocks - * - * @mm: DRM buddy manager - * @objects: input list head to free blocks - * @flags: optional flags like DRM_BUDDY_CLEARED - */ -void drm_buddy_free_list(struct drm_buddy *mm, - struct list_head *objects, - unsigned int flags) -{ - bool mark_clear = flags & DRM_BUDDY_CLEARED; - - __drm_buddy_free_list(mm, objects, mark_clear, !mark_clear); -} -EXPORT_SYMBOL(drm_buddy_free_list); - -static bool block_incompatible(struct drm_buddy_block *block, unsigned int flags) -{ - bool needs_clear = flags & DRM_BUDDY_CLEAR_ALLOCATION; - - return needs_clear != drm_buddy_block_is_clear(block); -} - -static struct drm_buddy_block * -__alloc_range_bias(struct drm_buddy *mm, - u64 start, u64 end, - unsigned int order, - unsigned long flags, - bool fallback) -{ - u64 req_size = mm->chunk_size << order; - struct drm_buddy_block *block; - struct drm_buddy_block *buddy; - LIST_HEAD(dfs); - int err; - int i; - - end = end - 1; - - for (i = 0; i < mm->n_roots; ++i) - list_add_tail(&mm->roots[i]->tmp_link, &dfs); - - do { - u64 block_start; - u64 block_end; - - block = list_first_entry_or_null(&dfs, - struct drm_buddy_block, - tmp_link); - if (!block) - break; - - list_del(&block->tmp_link); - - if (drm_buddy_block_order(block) < order) - continue; - - block_start = drm_buddy_block_offset(block); - block_end = block_start + drm_buddy_block_size(mm, block) - 1; - - if (!overlaps(start, end, block_start, block_end)) - continue; - - if (drm_buddy_block_is_allocated(block)) - continue; - - if (block_start < start || block_end > end) { - u64 adjusted_start = max(block_start, start); - u64 adjusted_end = min(block_end, end); - - if (round_down(adjusted_end + 1, req_size) <= - round_up(adjusted_start, req_size)) - continue; - } - - if (!fallback && block_incompatible(block, flags)) - continue; - - if (contains(start, end, block_start, block_end) && - order == drm_buddy_block_order(block)) { - /* - * Find the free block within the range. - */ - if (drm_buddy_block_is_free(block)) - return block; - - continue; - } - - if (!drm_buddy_block_is_split(block)) { - err = split_block(mm, block); - if (unlikely(err)) - goto err_undo; - } - - list_add(&block->right->tmp_link, &dfs); - list_add(&block->left->tmp_link, &dfs); - } while (1); - - return ERR_PTR(-ENOSPC); - -err_undo: - /* - * We really don't want to leave around a bunch of split blocks, since - * bigger is better, so make sure we merge everything back before we - * free the allocated blocks. - */ - buddy = __get_buddy(block); - if (buddy && - (drm_buddy_block_is_free(block) && - drm_buddy_block_is_free(buddy))) - __drm_buddy_free(mm, block, false); - return ERR_PTR(err); -} - -static struct drm_buddy_block * -__drm_buddy_alloc_range_bias(struct drm_buddy *mm, - u64 start, u64 end, - unsigned int order, - unsigned long flags) -{ - struct drm_buddy_block *block; - bool fallback = false; - - block = __alloc_range_bias(mm, start, end, order, - flags, fallback); - if (IS_ERR(block)) - return __alloc_range_bias(mm, start, end, order, - flags, !fallback); - - return block; -} - -static struct drm_buddy_block * -get_maxblock(struct drm_buddy *mm, - unsigned int order, - enum drm_buddy_free_tree tree) -{ - struct drm_buddy_block *max_block = NULL, *block = NULL; - struct rb_root *root; - unsigned int i; - - for (i = order; i <= mm->max_order; ++i) { - root = &mm->free_trees[tree][i]; - block = rbtree_last_free_block(root); - if (!block) - continue; - - if (!max_block) { - max_block = block; - continue; - } - - if (drm_buddy_block_offset(block) > - drm_buddy_block_offset(max_block)) { - max_block = block; - } - } - - return max_block; -} - -static struct drm_buddy_block * -alloc_from_freetree(struct drm_buddy *mm, - unsigned int order, - unsigned long flags) -{ - struct drm_buddy_block *block = NULL; - struct rb_root *root; - enum drm_buddy_free_tree tree; - unsigned int tmp; - int err; - - tree = (flags & DRM_BUDDY_CLEAR_ALLOCATION) ? - DRM_BUDDY_CLEAR_TREE : DRM_BUDDY_DIRTY_TREE; - - if (flags & DRM_BUDDY_TOPDOWN_ALLOCATION) { - block = get_maxblock(mm, order, tree); - if (block) - /* Store the obtained block order */ - tmp = drm_buddy_block_order(block); - } else { - for (tmp = order; tmp <= mm->max_order; ++tmp) { - /* Get RB tree root for this order and tree */ - root = &mm->free_trees[tree][tmp]; - block = rbtree_last_free_block(root); - if (block) - break; - } - } - - if (!block) { - /* Try allocating from the other tree */ - tree = (tree == DRM_BUDDY_CLEAR_TREE) ? - DRM_BUDDY_DIRTY_TREE : DRM_BUDDY_CLEAR_TREE; - - for (tmp = order; tmp <= mm->max_order; ++tmp) { - root = &mm->free_trees[tree][tmp]; - block = rbtree_last_free_block(root); - if (block) - break; - } - - if (!block) - return ERR_PTR(-ENOSPC); - } - - BUG_ON(!drm_buddy_block_is_free(block)); - - while (tmp != order) { - err = split_block(mm, block); - if (unlikely(err)) - goto err_undo; - - block = block->right; - tmp--; - } - return block; - -err_undo: - if (tmp != order) - __drm_buddy_free(mm, block, false); - return ERR_PTR(err); -} - -static int __alloc_range(struct drm_buddy *mm, - struct list_head *dfs, - u64 start, u64 size, - struct list_head *blocks, - u64 *total_allocated_on_err) -{ - struct drm_buddy_block *block; - struct drm_buddy_block *buddy; - u64 total_allocated = 0; - LIST_HEAD(allocated); - u64 end; - int err; - - end = start + size - 1; - - do { - u64 block_start; - u64 block_end; - - block = list_first_entry_or_null(dfs, - struct drm_buddy_block, - tmp_link); - if (!block) - break; - - list_del(&block->tmp_link); - - block_start = drm_buddy_block_offset(block); - block_end = block_start + drm_buddy_block_size(mm, block) - 1; - - if (!overlaps(start, end, block_start, block_end)) - continue; - - if (drm_buddy_block_is_allocated(block)) { - err = -ENOSPC; - goto err_free; - } - - if (contains(start, end, block_start, block_end)) { - if (drm_buddy_block_is_free(block)) { - mark_allocated(mm, block); - total_allocated += drm_buddy_block_size(mm, block); - mm->avail -= drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail -= drm_buddy_block_size(mm, block); - list_add_tail(&block->link, &allocated); - continue; - } else if (!mm->clear_avail) { - err = -ENOSPC; - goto err_free; - } - } - - if (!drm_buddy_block_is_split(block)) { - err = split_block(mm, block); - if (unlikely(err)) - goto err_undo; - } - - list_add(&block->right->tmp_link, dfs); - list_add(&block->left->tmp_link, dfs); - } while (1); - - if (total_allocated < size) { - err = -ENOSPC; - goto err_free; - } - - list_splice_tail(&allocated, blocks); - - return 0; - -err_undo: - /* - * We really don't want to leave around a bunch of split blocks, since - * bigger is better, so make sure we merge everything back before we - * free the allocated blocks. - */ - buddy = __get_buddy(block); - if (buddy && - (drm_buddy_block_is_free(block) && - drm_buddy_block_is_free(buddy))) - __drm_buddy_free(mm, block, false); - -err_free: - if (err == -ENOSPC && total_allocated_on_err) { - list_splice_tail(&allocated, blocks); - *total_allocated_on_err = total_allocated; - } else { - drm_buddy_free_list_internal(mm, &allocated); - } - - return err; -} - -static int __drm_buddy_alloc_range(struct drm_buddy *mm, - u64 start, - u64 size, - u64 *total_allocated_on_err, - struct list_head *blocks) -{ - LIST_HEAD(dfs); - int i; - - for (i = 0; i < mm->n_roots; ++i) - list_add_tail(&mm->roots[i]->tmp_link, &dfs); - - return __alloc_range(mm, &dfs, start, size, - blocks, total_allocated_on_err); -} - -static int __alloc_contig_try_harder(struct drm_buddy *mm, - u64 size, - u64 min_block_size, - struct list_head *blocks) -{ - u64 rhs_offset, lhs_offset, lhs_size, filled; - struct drm_buddy_block *block; - unsigned int tree, order; - LIST_HEAD(blocks_lhs); - unsigned long pages; - u64 modify_size; - int err; - - modify_size = rounddown_pow_of_two(size); - pages = modify_size >> ilog2(mm->chunk_size); - order = fls(pages) - 1; - if (order == 0) - return -ENOSPC; - - for_each_free_tree(tree) { - struct rb_root *root; - struct rb_node *iter; - - root = &mm->free_trees[tree][order]; - if (rbtree_is_empty(root)) - continue; - - iter = rb_last(root); - while (iter) { - block = rbtree_get_free_block(iter); - - /* Allocate blocks traversing RHS */ - rhs_offset = drm_buddy_block_offset(block); - err = __drm_buddy_alloc_range(mm, rhs_offset, size, - &filled, blocks); - if (!err || err != -ENOSPC) - return err; - - lhs_size = max((size - filled), min_block_size); - if (!IS_ALIGNED(lhs_size, min_block_size)) - lhs_size = round_up(lhs_size, min_block_size); - - /* Allocate blocks traversing LHS */ - lhs_offset = drm_buddy_block_offset(block) - lhs_size; - err = __drm_buddy_alloc_range(mm, lhs_offset, lhs_size, - NULL, &blocks_lhs); - if (!err) { - list_splice(&blocks_lhs, blocks); - return 0; - } else if (err != -ENOSPC) { - drm_buddy_free_list_internal(mm, blocks); - return err; - } - /* Free blocks for the next iteration */ - drm_buddy_free_list_internal(mm, blocks); - - iter = rb_prev(iter); - } - } - - return -ENOSPC; -} - -/** - * drm_buddy_block_trim - free unused pages - * - * @mm: DRM buddy manager - * @start: start address to begin the trimming. - * @new_size: original size requested - * @blocks: Input and output list of allocated blocks. - * MUST contain single block as input to be trimmed. - * On success will contain the newly allocated blocks - * making up the @new_size. Blocks always appear in - * ascending order - * - * For contiguous allocation, we round up the size to the nearest - * power of two value, drivers consume *actual* size, so remaining - * portions are unused and can be optionally freed with this function - * - * Returns: - * 0 on success, error code on failure. - */ -int drm_buddy_block_trim(struct drm_buddy *mm, - u64 *start, - u64 new_size, - struct list_head *blocks) -{ - struct drm_buddy_block *parent; - struct drm_buddy_block *block; - u64 block_start, block_end; - LIST_HEAD(dfs); - u64 new_start; - int err; - - if (!list_is_singular(blocks)) - return -EINVAL; - - block = list_first_entry(blocks, - struct drm_buddy_block, - link); - - block_start = drm_buddy_block_offset(block); - block_end = block_start + drm_buddy_block_size(mm, block); - - if (WARN_ON(!drm_buddy_block_is_allocated(block))) - return -EINVAL; - - if (new_size > drm_buddy_block_size(mm, block)) - return -EINVAL; - - if (!new_size || !IS_ALIGNED(new_size, mm->chunk_size)) - return -EINVAL; - - if (new_size == drm_buddy_block_size(mm, block)) - return 0; - - new_start = block_start; - if (start) { - new_start = *start; - - if (new_start < block_start) - return -EINVAL; - - if (!IS_ALIGNED(new_start, mm->chunk_size)) - return -EINVAL; - - if (range_overflows(new_start, new_size, block_end)) - return -EINVAL; - } - - list_del(&block->link); - mark_free(mm, block); - mm->avail += drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail += drm_buddy_block_size(mm, block); - - /* Prevent recursively freeing this node */ - parent = block->parent; - block->parent = NULL; - - list_add(&block->tmp_link, &dfs); - err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL); - if (err) { - mark_allocated(mm, block); - mm->avail -= drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail -= drm_buddy_block_size(mm, block); - list_add(&block->link, blocks); - } - - block->parent = parent; - return err; -} -EXPORT_SYMBOL(drm_buddy_block_trim); - -static struct drm_buddy_block * -__drm_buddy_alloc_blocks(struct drm_buddy *mm, - u64 start, u64 end, - unsigned int order, - unsigned long flags) -{ - if (flags & DRM_BUDDY_RANGE_ALLOCATION) - /* Allocate traversing within the range */ - return __drm_buddy_alloc_range_bias(mm, start, end, - order, flags); - else - /* Allocate from freetree */ - return alloc_from_freetree(mm, order, flags); -} - -/** - * drm_buddy_alloc_blocks - allocate power-of-two blocks - * - * @mm: DRM buddy manager to allocate from - * @start: start of the allowed range for this block - * @end: end of the allowed range for this block - * @size: size of the allocation in bytes - * @min_block_size: alignment of the allocation - * @blocks: output list head to add allocated blocks - * @flags: DRM_BUDDY_*_ALLOCATION flags - * - * alloc_range_bias() called on range limitations, which traverses - * the tree and returns the desired block. - * - * alloc_from_freetree() called when *no* range restrictions - * are enforced, which picks the block from the freetree. - * - * Returns: - * 0 on success, error code on failure. - */ -int drm_buddy_alloc_blocks(struct drm_buddy *mm, - u64 start, u64 end, u64 size, - u64 min_block_size, - struct list_head *blocks, - unsigned long flags) -{ - struct drm_buddy_block *block = NULL; - u64 original_size, original_min_size; - unsigned int min_order, order; - LIST_HEAD(allocated); - unsigned long pages; - int err; - - if (size < mm->chunk_size) - return -EINVAL; - - if (min_block_size < mm->chunk_size) - return -EINVAL; - - if (!is_power_of_2(min_block_size)) - return -EINVAL; - - if (!IS_ALIGNED(start | end | size, mm->chunk_size)) - return -EINVAL; - - if (end > mm->size) - return -EINVAL; - - if (range_overflows(start, size, mm->size)) - return -EINVAL; - - /* Actual range allocation */ - if (start + size == end) { - if (!IS_ALIGNED(start | end, min_block_size)) - return -EINVAL; - - return __drm_buddy_alloc_range(mm, start, size, NULL, blocks); - } - - original_size = size; - original_min_size = min_block_size; - - /* Roundup the size to power of 2 */ - if (flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION) { - size = roundup_pow_of_two(size); - min_block_size = size; - /* Align size value to min_block_size */ - } else if (!IS_ALIGNED(size, min_block_size)) { - size = round_up(size, min_block_size); - } - - pages = size >> ilog2(mm->chunk_size); - order = fls(pages) - 1; - min_order = ilog2(min_block_size) - ilog2(mm->chunk_size); - - if (order > mm->max_order || size > mm->size) { - if ((flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION) && - !(flags & DRM_BUDDY_RANGE_ALLOCATION)) - return __alloc_contig_try_harder(mm, original_size, - original_min_size, blocks); - - return -EINVAL; - } - - do { - order = min(order, (unsigned int)fls(pages) - 1); - BUG_ON(order > mm->max_order); - BUG_ON(order < min_order); - - do { - block = __drm_buddy_alloc_blocks(mm, start, - end, - order, - flags); - if (!IS_ERR(block)) - break; - - if (order-- == min_order) { - /* Try allocation through force merge method */ - if (mm->clear_avail && - !__force_merge(mm, start, end, min_order)) { - block = __drm_buddy_alloc_blocks(mm, start, - end, - min_order, - flags); - if (!IS_ERR(block)) { - order = min_order; - break; - } - } - - /* - * Try contiguous block allocation through - * try harder method. - */ - if (flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION && - !(flags & DRM_BUDDY_RANGE_ALLOCATION)) - return __alloc_contig_try_harder(mm, - original_size, - original_min_size, - blocks); - err = -ENOSPC; - goto err_free; - } - } while (1); - - mark_allocated(mm, block); - mm->avail -= drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail -= drm_buddy_block_size(mm, block); - kmemleak_update_trace(block); - list_add_tail(&block->link, &allocated); - - pages -= BIT(order); - - if (!pages) - break; - } while (1); - - /* Trim the allocated block to the required size */ - if (!(flags & DRM_BUDDY_TRIM_DISABLE) && - original_size != size) { - struct list_head *trim_list; - LIST_HEAD(temp); - u64 trim_size; - - trim_list = &allocated; - trim_size = original_size; - - if (!list_is_singular(&allocated)) { - block = list_last_entry(&allocated, typeof(*block), link); - list_move(&block->link, &temp); - trim_list = &temp; - trim_size = drm_buddy_block_size(mm, block) - - (size - original_size); - } - - drm_buddy_block_trim(mm, - NULL, - trim_size, - trim_list); - - if (!list_empty(&temp)) - list_splice_tail(trim_list, &allocated); - } - - list_splice_tail(&allocated, blocks); - return 0; - -err_free: - drm_buddy_free_list_internal(mm, &allocated); - return err; -} -EXPORT_SYMBOL(drm_buddy_alloc_blocks); - -/** - * drm_buddy_block_print - print block information - * - * @mm: DRM buddy manager - * @block: DRM buddy block - * @p: DRM printer to use - */ -void drm_buddy_block_print(struct drm_buddy *mm, - struct drm_buddy_block *block, - struct drm_printer *p) -{ - u64 start = drm_buddy_block_offset(block); - u64 size = drm_buddy_block_size(mm, block); - - drm_printf(p, "%#018llx-%#018llx: %llu\n", start, start + size, size); -} -EXPORT_SYMBOL(drm_buddy_block_print); - -/** - * drm_buddy_print - print allocator state - * - * @mm: DRM buddy manager - * @p: DRM printer to use - */ -void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p) -{ - int order; - - drm_printf(p, "chunk_size: %lluKiB, total: %lluMiB, free: %lluMiB, clear_free: %lluMiB\n", - mm->chunk_size >> 10, mm->size >> 20, mm->avail >> 20, mm->clear_avail >> 20); - - for (order = mm->max_order; order >= 0; order--) { - struct drm_buddy_block *block, *tmp; - struct rb_root *root; - u64 count = 0, free; - unsigned int tree; - - for_each_free_tree(tree) { - root = &mm->free_trees[tree][order]; - - rbtree_postorder_for_each_entry_safe(block, tmp, root, rb) { - BUG_ON(!drm_buddy_block_is_free(block)); - count++; - } - } - - drm_printf(p, "order-%2d ", order); - - free = count * (mm->chunk_size << order); - if (free < SZ_1M) - drm_printf(p, "free: %8llu KiB", free >> 10); - else - drm_printf(p, "free: %8llu MiB", free >> 20); - - drm_printf(p, ", blocks: %llu\n", count); - } -} -EXPORT_SYMBOL(drm_buddy_print); - -static void drm_buddy_module_exit(void) -{ - kmem_cache_destroy(slab_blocks); -} - -static int __init drm_buddy_module_init(void) -{ - slab_blocks = KMEM_CACHE(drm_buddy_block, 0); - if (!slab_blocks) - return -ENOMEM; - - return 0; -} - -module_init(drm_buddy_module_init); -module_exit(drm_buddy_module_exit); - -MODULE_DESCRIPTION("DRM Buddy Allocator"); -MODULE_LICENSE("Dual MIT/GPL"); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c index f65fe86c02b5..eeda5daa544f 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c @@ -5,7 +5,7 @@ #include -#include +#include #include #include #include diff --git a/drivers/gpu/drm/i915/i915_scatterlist.c b/drivers/gpu/drm/i915/i915_scatterlist.c index 4d830740946d..30246f02bcfe 100644 --- a/drivers/gpu/drm/i915/i915_scatterlist.c +++ b/drivers/gpu/drm/i915/i915_scatterlist.c @@ -7,7 +7,7 @@ #include "i915_scatterlist.h" #include "i915_ttm_buddy_manager.h" -#include +#include #include #include diff --git a/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c b/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c index d5c6e6605086..6b256d95badd 100644 --- a/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c +++ b/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c @@ -5,7 +5,7 @@ #include -#include +#include #include #include #include diff --git a/drivers/gpu/drm/lib/drm_random.c b/drivers/gpu/drm/lib/drm_random.c deleted file mode 100644 index 0e9dba1ef4af..000000000000 --- a/drivers/gpu/drm/lib/drm_random.c +++ /dev/null @@ -1,44 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include - -#include "drm_random.h" - -u32 drm_prandom_u32_max_state(u32 ep_ro, struct rnd_state *state) -{ - return upper_32_bits((u64)prandom_u32_state(state) * ep_ro); -} -EXPORT_SYMBOL(drm_prandom_u32_max_state); - -void drm_random_reorder(unsigned int *order, unsigned int count, - struct rnd_state *state) -{ - unsigned int i, j; - - for (i = 0; i < count; ++i) { - BUILD_BUG_ON(sizeof(unsigned int) > sizeof(u32)); - j = drm_prandom_u32_max_state(count, state); - swap(order[i], order[j]); - } -} -EXPORT_SYMBOL(drm_random_reorder); - -unsigned int *drm_random_order(unsigned int count, struct rnd_state *state) -{ - unsigned int *order, i; - - order = kmalloc_array(count, sizeof(*order), GFP_KERNEL); - if (!order) - return order; - - for (i = 0; i < count; i++) - order[i] = i; - - drm_random_reorder(order, count, state); - return order; -} -EXPORT_SYMBOL(drm_random_order); diff --git a/drivers/gpu/drm/lib/drm_random.h b/drivers/gpu/drm/lib/drm_random.h deleted file mode 100644 index 9f827260a89d..000000000000 --- a/drivers/gpu/drm/lib/drm_random.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __DRM_RANDOM_H__ -#define __DRM_RANDOM_H__ - -/* This is a temporary home for a couple of utility functions that should - * be transposed to lib/ at the earliest convenience. - */ - -#include - -#define DRM_RND_STATE_INITIALIZER(seed__) ({ \ - struct rnd_state state__; \ - prandom_seed_state(&state__, (seed__)); \ - state__; \ -}) - -#define DRM_RND_STATE(name__, seed__) \ - struct rnd_state name__ = DRM_RND_STATE_INITIALIZER(seed__) - -unsigned int *drm_random_order(unsigned int count, - struct rnd_state *state); -void drm_random_reorder(unsigned int *order, - unsigned int count, - struct rnd_state *state); -u32 drm_prandom_u32_max_state(u32 ep_ro, - struct rnd_state *state); - -#endif /* !__DRM_RANDOM_H__ */ diff --git a/drivers/gpu/drm/tests/Makefile b/drivers/gpu/drm/tests/Makefile index 87d5d5f9332a..d2e2e3d8349a 100644 --- a/drivers/gpu/drm/tests/Makefile +++ b/drivers/gpu/drm/tests/Makefile @@ -7,7 +7,6 @@ obj-$(CONFIG_DRM_KUNIT_TEST) += \ drm_atomic_test.o \ drm_atomic_state_test.o \ drm_bridge_test.o \ - drm_buddy_test.o \ drm_cmdline_parser_test.o \ drm_connector_test.o \ drm_damage_helper_test.o \ diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c b/drivers/gpu/drm/tests/drm_buddy_test.c deleted file mode 100644 index e6f8459c6c54..000000000000 --- a/drivers/gpu/drm/tests/drm_buddy_test.c +++ /dev/null @@ -1,928 +0,0 @@ -// SPDX-License-Identifier: MIT -/* - * Copyright © 2019 Intel Corporation - * Copyright © 2022 Maíra Canal - */ - -#include - -#include -#include -#include - -#include - -#include "../lib/drm_random.h" - -static unsigned int random_seed; - -static inline u64 get_size(int order, u64 chunk_size) -{ - return (1 << order) * chunk_size; -} - -static void drm_test_buddy_fragmentation_performance(struct kunit *test) -{ - struct drm_buddy_block *block, *tmp; - int num_blocks, i, ret, count = 0; - LIST_HEAD(allocated_blocks); - unsigned long elapsed_ms; - LIST_HEAD(reverse_list); - LIST_HEAD(test_blocks); - LIST_HEAD(clear_list); - LIST_HEAD(dirty_list); - LIST_HEAD(free_list); - struct drm_buddy mm; - u64 mm_size = SZ_4G; - ktime_t start, end; - - /* - * Allocation under severe fragmentation - * - * Create severe fragmentation by allocating the entire 4 GiB address space - * as tiny 8 KiB blocks but forcing a 64 KiB alignment. The resulting pattern - * leaves many scattered holes. Split the allocations into two groups and - * return them with different flags to block coalescing, then repeatedly - * allocate and free 64 KiB blocks while timing the loop. This stresses how - * quickly the allocator can satisfy larger, aligned requests from a pool of - * highly fragmented space. - */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), - "buddy_init failed\n"); - - num_blocks = mm_size / SZ_64K; - - start = ktime_get(); - /* Allocate with maximum fragmentation - 8K blocks with 64K alignment */ - for (i = 0; i < num_blocks; i++) - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, SZ_8K, SZ_64K, - &allocated_blocks, 0), - "buddy_alloc hit an error size=%u\n", SZ_8K); - - list_for_each_entry_safe(block, tmp, &allocated_blocks, link) { - if (count % 4 == 0 || count % 4 == 3) - list_move_tail(&block->link, &clear_list); - else - list_move_tail(&block->link, &dirty_list); - count++; - } - - /* Free with different flags to ensure no coalescing */ - drm_buddy_free_list(&mm, &clear_list, DRM_BUDDY_CLEARED); - drm_buddy_free_list(&mm, &dirty_list, 0); - - for (i = 0; i < num_blocks; i++) - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, SZ_64K, SZ_64K, - &test_blocks, 0), - "buddy_alloc hit an error size=%u\n", SZ_64K); - drm_buddy_free_list(&mm, &test_blocks, 0); - - end = ktime_get(); - elapsed_ms = ktime_to_ms(ktime_sub(end, start)); - - kunit_info(test, "Fragmented allocation took %lu ms\n", elapsed_ms); - - drm_buddy_fini(&mm); - - /* - * Reverse free order under fragmentation - * - * Construct a fragmented 4 GiB space by allocating every 8 KiB block with - * 64 KiB alignment, creating a dense scatter of small regions. Half of the - * blocks are selectively freed to form sparse gaps, while the remaining - * allocations are preserved, reordered in reverse, and released back with - * the cleared flag. This models a pathological reverse-ordered free pattern - * and measures how quickly the allocator can merge and reclaim space when - * deallocation occurs in the opposite order of allocation, exposing the - * cost difference between a linear freelist scan and an ordered tree lookup. - */ - ret = drm_buddy_init(&mm, mm_size, SZ_4K); - KUNIT_ASSERT_EQ(test, ret, 0); - - start = ktime_get(); - /* Allocate maximum fragmentation */ - for (i = 0; i < num_blocks; i++) - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, SZ_8K, SZ_64K, - &allocated_blocks, 0), - "buddy_alloc hit an error size=%u\n", SZ_8K); - - list_for_each_entry_safe(block, tmp, &allocated_blocks, link) { - if (count % 2 == 0) - list_move_tail(&block->link, &free_list); - count++; - } - drm_buddy_free_list(&mm, &free_list, DRM_BUDDY_CLEARED); - - list_for_each_entry_safe_reverse(block, tmp, &allocated_blocks, link) - list_move(&block->link, &reverse_list); - drm_buddy_free_list(&mm, &reverse_list, DRM_BUDDY_CLEARED); - - end = ktime_get(); - elapsed_ms = ktime_to_ms(ktime_sub(end, start)); - - kunit_info(test, "Reverse-ordered free took %lu ms\n", elapsed_ms); - - drm_buddy_fini(&mm); -} - -static void drm_test_buddy_alloc_range_bias(struct kunit *test) -{ - u32 mm_size, size, ps, bias_size, bias_start, bias_end, bias_rem; - DRM_RND_STATE(prng, random_seed); - unsigned int i, count, *order; - struct drm_buddy_block *block; - unsigned long flags; - struct drm_buddy mm; - LIST_HEAD(allocated); - - bias_size = SZ_1M; - ps = roundup_pow_of_two(prandom_u32_state(&prng) % bias_size); - ps = max(SZ_4K, ps); - mm_size = (SZ_8M-1) & ~(ps-1); /* Multiple roots */ - - kunit_info(test, "mm_size=%u, ps=%u\n", mm_size, ps); - - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps), - "buddy_init failed\n"); - - count = mm_size / bias_size; - order = drm_random_order(count, &prng); - KUNIT_EXPECT_TRUE(test, order); - - /* - * Idea is to split the address space into uniform bias ranges, and then - * in some random order allocate within each bias, using various - * patterns within. This should detect if allocations leak out from a - * given bias, for example. - */ - - for (i = 0; i < count; i++) { - LIST_HEAD(tmp); - u32 size; - - bias_start = order[i] * bias_size; - bias_end = bias_start + bias_size; - bias_rem = bias_size; - - /* internal round_up too big */ - KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, - bias_end, bias_size + ps, bias_size, - &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", - bias_start, bias_end, bias_size, bias_size); - - /* size too big */ - KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, - bias_end, bias_size + ps, ps, - &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc didn't fail with bias(%x-%x), size=%u, ps=%u\n", - bias_start, bias_end, bias_size + ps, ps); - - /* bias range too small for size */ - KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start + ps, - bias_end, bias_size, ps, - &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc didn't fail with bias(%x-%x), size=%u, ps=%u\n", - bias_start + ps, bias_end, bias_size, ps); - - /* bias misaligned */ - KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start + ps, - bias_end - ps, - bias_size >> 1, bias_size >> 1, - &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc h didn't fail with bias(%x-%x), size=%u, ps=%u\n", - bias_start + ps, bias_end - ps, bias_size >> 1, bias_size >> 1); - - /* single big page */ - KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, - bias_end, bias_size, bias_size, - &tmp, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc i failed with bias(%x-%x), size=%u, ps=%u\n", - bias_start, bias_end, bias_size, bias_size); - drm_buddy_free_list(&mm, &tmp, 0); - - /* single page with internal round_up */ - KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, - bias_end, ps, bias_size, - &tmp, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", - bias_start, bias_end, ps, bias_size); - drm_buddy_free_list(&mm, &tmp, 0); - - /* random size within */ - size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); - if (size) - KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, - bias_end, size, ps, - &tmp, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", - bias_start, bias_end, size, ps); - - bias_rem -= size; - /* too big for current avail */ - KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, - bias_end, bias_rem + ps, ps, - &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc didn't fail with bias(%x-%x), size=%u, ps=%u\n", - bias_start, bias_end, bias_rem + ps, ps); - - if (bias_rem) { - /* random fill of the remainder */ - size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); - size = max(size, ps); - - KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, - bias_end, size, ps, - &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", - bias_start, bias_end, size, ps); - /* - * Intentionally allow some space to be left - * unallocated, and ideally not always on the bias - * boundaries. - */ - drm_buddy_free_list(&mm, &tmp, 0); - } else { - list_splice_tail(&tmp, &allocated); - } - } - - kfree(order); - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); - - /* - * Something more free-form. Idea is to pick a random starting bias - * range within the address space and then start filling it up. Also - * randomly grow the bias range in both directions as we go along. This - * should give us bias start/end which is not always uniform like above, - * and in some cases will require the allocator to jump over already - * allocated nodes in the middle of the address space. - */ - - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps), - "buddy_init failed\n"); - - bias_start = round_up(prandom_u32_state(&prng) % (mm_size - ps), ps); - bias_end = round_up(bias_start + prandom_u32_state(&prng) % (mm_size - bias_start), ps); - bias_end = max(bias_end, bias_start + ps); - bias_rem = bias_end - bias_start; - - do { - u32 size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); - - KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, - bias_end, size, ps, - &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", - bias_start, bias_end, size, ps); - bias_rem -= size; - - /* - * Try to randomly grow the bias range in both directions, or - * only one, or perhaps don't grow at all. - */ - do { - u32 old_bias_start = bias_start; - u32 old_bias_end = bias_end; - - if (bias_start) - bias_start -= round_up(prandom_u32_state(&prng) % bias_start, ps); - if (bias_end != mm_size) - bias_end += round_up(prandom_u32_state(&prng) % (mm_size - bias_end), ps); - - bias_rem += old_bias_start - bias_start; - bias_rem += bias_end - old_bias_end; - } while (!bias_rem && (bias_start || bias_end != mm_size)); - } while (bias_rem); - - KUNIT_ASSERT_EQ(test, bias_start, 0); - KUNIT_ASSERT_EQ(test, bias_end, mm_size); - KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, bias_end, - ps, ps, - &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc passed with bias(%x-%x), size=%u\n", - bias_start, bias_end, ps); - - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); - - /* - * Allocate cleared blocks in the bias range when the DRM buddy's clear avail is - * zero. This will validate the bias range allocation in scenarios like system boot - * when no cleared blocks are available and exercise the fallback path too. The resulting - * blocks should always be dirty. - */ - - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps), - "buddy_init failed\n"); - - bias_start = round_up(prandom_u32_state(&prng) % (mm_size - ps), ps); - bias_end = round_up(bias_start + prandom_u32_state(&prng) % (mm_size - bias_start), ps); - bias_end = max(bias_end, bias_start + ps); - bias_rem = bias_end - bias_start; - - flags = DRM_BUDDY_CLEAR_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION; - size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); - - KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, - bias_end, size, ps, - &allocated, - flags), - "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", - bias_start, bias_end, size, ps); - - list_for_each_entry(block, &allocated, link) - KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); - - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); -} - -static void drm_test_buddy_alloc_clear(struct kunit *test) -{ - unsigned long n_pages, total, i = 0; - const unsigned long ps = SZ_4K; - struct drm_buddy_block *block; - const int max_order = 12; - LIST_HEAD(allocated); - struct drm_buddy mm; - unsigned int order; - u32 mm_size, size; - LIST_HEAD(dirty); - LIST_HEAD(clean); - - mm_size = SZ_4K << max_order; - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); - - KUNIT_EXPECT_EQ(test, mm.max_order, max_order); - - /* - * Idea is to allocate and free some random portion of the address space, - * returning those pages as non-dirty and randomly alternate between - * requesting dirty and non-dirty pages (not going over the limit - * we freed as non-dirty), putting that into two separate lists. - * Loop over both lists at the end checking that the dirty list - * is indeed all dirty pages and vice versa. Free it all again, - * keeping the dirty/clear status. - */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - 5 * ps, ps, &allocated, - DRM_BUDDY_TOPDOWN_ALLOCATION), - "buddy_alloc hit an error size=%lu\n", 5 * ps); - drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); - - n_pages = 10; - do { - unsigned long flags; - struct list_head *list; - int slot = i % 2; - - if (slot == 0) { - list = &dirty; - flags = 0; - } else { - list = &clean; - flags = DRM_BUDDY_CLEAR_ALLOCATION; - } - - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - ps, ps, list, - flags), - "buddy_alloc hit an error size=%lu\n", ps); - } while (++i < n_pages); - - list_for_each_entry(block, &clean, link) - KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), true); - - list_for_each_entry(block, &dirty, link) - KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); - - drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED); - - /* - * Trying to go over the clear limit for some allocation. - * The allocation should never fail with reasonable page-size. - */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - 10 * ps, ps, &clean, - DRM_BUDDY_CLEAR_ALLOCATION), - "buddy_alloc hit an error size=%lu\n", 10 * ps); - - drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED); - drm_buddy_free_list(&mm, &dirty, 0); - drm_buddy_fini(&mm); - - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); - - /* - * Create a new mm. Intentionally fragment the address space by creating - * two alternating lists. Free both lists, one as dirty the other as clean. - * Try to allocate double the previous size with matching min_page_size. The - * allocation should never fail as it calls the force_merge. Also check that - * the page is always dirty after force_merge. Free the page as dirty, then - * repeat the whole thing, increment the order until we hit the max_order. - */ - - i = 0; - n_pages = mm_size / ps; - do { - struct list_head *list; - int slot = i % 2; - - if (slot == 0) - list = &dirty; - else - list = &clean; - - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - ps, ps, list, 0), - "buddy_alloc hit an error size=%lu\n", ps); - } while (++i < n_pages); - - drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED); - drm_buddy_free_list(&mm, &dirty, 0); - - order = 1; - do { - size = SZ_4K << order; - - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - size, size, &allocated, - DRM_BUDDY_CLEAR_ALLOCATION), - "buddy_alloc hit an error size=%u\n", size); - total = 0; - list_for_each_entry(block, &allocated, link) { - if (size != mm_size) - KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); - total += drm_buddy_block_size(&mm, block); - } - KUNIT_EXPECT_EQ(test, total, size); - - drm_buddy_free_list(&mm, &allocated, 0); - } while (++order <= max_order); - - drm_buddy_fini(&mm); - - /* - * Create a new mm with a non power-of-two size. Allocate a random size from each - * root, free as cleared and then call fini. This will ensure the multi-root - * force merge during fini. - */ - mm_size = (SZ_4K << max_order) + (SZ_4K << (max_order - 2)); - - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); - KUNIT_EXPECT_EQ(test, mm.max_order, max_order); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, SZ_4K << max_order, - 4 * ps, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc hit an error size=%lu\n", 4 * ps); - drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, SZ_4K << max_order, - 2 * ps, ps, &allocated, - DRM_BUDDY_CLEAR_ALLOCATION), - "buddy_alloc hit an error size=%lu\n", 2 * ps); - drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, SZ_4K << max_order, mm_size, - ps, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), - "buddy_alloc hit an error size=%lu\n", ps); - drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); - drm_buddy_fini(&mm); -} - -static void drm_test_buddy_alloc_contiguous(struct kunit *test) -{ - const unsigned long ps = SZ_4K, mm_size = 16 * 3 * SZ_4K; - unsigned long i, n_pages, total; - struct drm_buddy_block *block; - struct drm_buddy mm; - LIST_HEAD(left); - LIST_HEAD(middle); - LIST_HEAD(right); - LIST_HEAD(allocated); - - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); - - /* - * Idea is to fragment the address space by alternating block - * allocations between three different lists; one for left, middle and - * right. We can then free a list to simulate fragmentation. In - * particular we want to exercise the DRM_BUDDY_CONTIGUOUS_ALLOCATION, - * including the try_harder path. - */ - - i = 0; - n_pages = mm_size / ps; - do { - struct list_head *list; - int slot = i % 3; - - if (slot == 0) - list = &left; - else if (slot == 1) - list = &middle; - else - list = &right; - KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, 0, mm_size, - ps, ps, list, 0), - "buddy_alloc hit an error size=%lu\n", - ps); - } while (++i < n_pages); - - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - 3 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc didn't error size=%lu\n", 3 * ps); - - drm_buddy_free_list(&mm, &middle, 0); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - 3 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc didn't error size=%lu\n", 3 * ps); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - 2 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc didn't error size=%lu\n", 2 * ps); - - drm_buddy_free_list(&mm, &right, 0); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - 3 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc didn't error size=%lu\n", 3 * ps); - /* - * At this point we should have enough contiguous space for 2 blocks, - * however they are never buddies (since we freed middle and right) so - * will require the try_harder logic to find them. - */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - 2 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc hit an error size=%lu\n", 2 * ps); - - drm_buddy_free_list(&mm, &left, 0); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, - 3 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc hit an error size=%lu\n", 3 * ps); - - total = 0; - list_for_each_entry(block, &allocated, link) - total += drm_buddy_block_size(&mm, block); - - KUNIT_ASSERT_EQ(test, total, ps * 2 + ps * 3); - - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); -} - -static void drm_test_buddy_alloc_pathological(struct kunit *test) -{ - u64 mm_size, size, start = 0; - struct drm_buddy_block *block; - const int max_order = 3; - unsigned long flags = 0; - int order, top; - struct drm_buddy mm; - LIST_HEAD(blocks); - LIST_HEAD(holes); - LIST_HEAD(tmp); - - /* - * Create a pot-sized mm, then allocate one of each possible - * order within. This should leave the mm with exactly one - * page left. Free the largest block, then whittle down again. - * Eventually we will have a fully 50% fragmented mm. - */ - - mm_size = SZ_4K << max_order; - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), - "buddy_init failed\n"); - - KUNIT_EXPECT_EQ(test, mm.max_order, max_order); - - for (top = max_order; top; top--) { - /* Make room by freeing the largest allocated block */ - block = list_first_entry_or_null(&blocks, typeof(*block), link); - if (block) { - list_del(&block->link); - drm_buddy_free_block(&mm, block); - } - - for (order = top; order--;) { - size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, - mm_size, size, size, - &tmp, flags), - "buddy_alloc hit -ENOMEM with order=%d, top=%d\n", - order, top); - - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); - KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); - - list_move_tail(&block->link, &blocks); - } - - /* There should be one final page for this sub-allocation */ - size = get_size(0, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc hit -ENOMEM for hole\n"); - - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); - KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); - - list_move_tail(&block->link, &holes); - - size = get_size(top, mm.chunk_size); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc unexpectedly succeeded at top-order %d/%d, it should be full!", - top, max_order); - } - - drm_buddy_free_list(&mm, &holes, 0); - - /* Nothing larger than blocks of chunk_size now available */ - for (order = 1; order <= max_order; order++) { - size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc unexpectedly succeeded at order %d, it should be full!", - order); - } - - list_splice_tail(&holes, &blocks); - drm_buddy_free_list(&mm, &blocks, 0); - drm_buddy_fini(&mm); -} - -static void drm_test_buddy_alloc_pessimistic(struct kunit *test) -{ - u64 mm_size, size, start = 0; - struct drm_buddy_block *block, *bn; - const unsigned int max_order = 16; - unsigned long flags = 0; - struct drm_buddy mm; - unsigned int order; - LIST_HEAD(blocks); - LIST_HEAD(tmp); - - /* - * Create a pot-sized mm, then allocate one of each possible - * order within. This should leave the mm with exactly one - * page left. - */ - - mm_size = SZ_4K << max_order; - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), - "buddy_init failed\n"); - - KUNIT_EXPECT_EQ(test, mm.max_order, max_order); - - for (order = 0; order < max_order; order++) { - size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc hit -ENOMEM with order=%d\n", - order); - - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); - KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); - - list_move_tail(&block->link, &blocks); - } - - /* And now the last remaining block available */ - size = get_size(0, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc hit -ENOMEM on final alloc\n"); - - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); - KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); - - list_move_tail(&block->link, &blocks); - - /* Should be completely full! */ - for (order = max_order; order--;) { - size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc unexpectedly succeeded, it should be full!"); - } - - block = list_last_entry(&blocks, typeof(*block), link); - list_del(&block->link); - drm_buddy_free_block(&mm, block); - - /* As we free in increasing size, we make available larger blocks */ - order = 1; - list_for_each_entry_safe(block, bn, &blocks, link) { - list_del(&block->link); - drm_buddy_free_block(&mm, block); - - size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc hit -ENOMEM with order=%d\n", - order); - - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); - KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); - - list_del(&block->link); - drm_buddy_free_block(&mm, block); - order++; - } - - /* To confirm, now the whole mm should be available */ - size = get_size(max_order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc (realloc) hit -ENOMEM with order=%d\n", - max_order); - - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); - KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); - - list_del(&block->link); - drm_buddy_free_block(&mm, block); - drm_buddy_free_list(&mm, &blocks, 0); - drm_buddy_fini(&mm); -} - -static void drm_test_buddy_alloc_optimistic(struct kunit *test) -{ - u64 mm_size, size, start = 0; - struct drm_buddy_block *block; - unsigned long flags = 0; - const int max_order = 16; - struct drm_buddy mm; - LIST_HEAD(blocks); - LIST_HEAD(tmp); - int order; - - /* - * Create a mm with one block of each order available, and - * try to allocate them all. - */ - - mm_size = SZ_4K * ((1 << (max_order + 1)) - 1); - - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), - "buddy_init failed\n"); - - KUNIT_EXPECT_EQ(test, mm.max_order, max_order); - - for (order = 0; order <= max_order; order++) { - size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc hit -ENOMEM with order=%d\n", - order); - - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); - KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); - - list_move_tail(&block->link, &blocks); - } - - /* Should be completely full! */ - size = get_size(0, mm.chunk_size); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, - size, size, &tmp, flags), - "buddy_alloc unexpectedly succeeded, it should be full!"); - - drm_buddy_free_list(&mm, &blocks, 0); - drm_buddy_fini(&mm); -} - -static void drm_test_buddy_alloc_limit(struct kunit *test) -{ - u64 size = U64_MAX, start = 0; - struct drm_buddy_block *block; - unsigned long flags = 0; - LIST_HEAD(allocated); - struct drm_buddy mm; - - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, size, SZ_4K)); - - KUNIT_EXPECT_EQ_MSG(test, mm.max_order, DRM_BUDDY_MAX_ORDER, - "mm.max_order(%d) != %d\n", mm.max_order, - DRM_BUDDY_MAX_ORDER); - - size = mm.chunk_size << mm.max_order; - KUNIT_EXPECT_FALSE(test, drm_buddy_alloc_blocks(&mm, start, size, size, - mm.chunk_size, &allocated, flags)); - - block = list_first_entry_or_null(&allocated, struct drm_buddy_block, link); - KUNIT_EXPECT_TRUE(test, block); - - KUNIT_EXPECT_EQ_MSG(test, drm_buddy_block_order(block), mm.max_order, - "block order(%d) != %d\n", - drm_buddy_block_order(block), mm.max_order); - - KUNIT_EXPECT_EQ_MSG(test, drm_buddy_block_size(&mm, block), - BIT_ULL(mm.max_order) * mm.chunk_size, - "block size(%llu) != %llu\n", - drm_buddy_block_size(&mm, block), - BIT_ULL(mm.max_order) * mm.chunk_size); - - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); -} - -static void drm_test_buddy_alloc_exceeds_max_order(struct kunit *test) -{ - u64 mm_size = SZ_8G + SZ_2G, size = SZ_8G + SZ_1G, min_block_size = SZ_8G; - struct drm_buddy mm; - LIST_HEAD(blocks); - int err; - - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), - "buddy_init failed\n"); - - /* CONTIGUOUS allocation should succeed via try_harder fallback */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, size, - SZ_4K, &blocks, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), - "buddy_alloc hit an error size=%llu\n", size); - drm_buddy_free_list(&mm, &blocks, 0); - - /* Non-CONTIGUOUS with large min_block_size should return -EINVAL */ - err = drm_buddy_alloc_blocks(&mm, 0, mm_size, size, min_block_size, &blocks, 0); - KUNIT_EXPECT_EQ(test, err, -EINVAL); - - /* Non-CONTIGUOUS + RANGE with large min_block_size should return -EINVAL */ - err = drm_buddy_alloc_blocks(&mm, 0, mm_size, size, min_block_size, &blocks, - DRM_BUDDY_RANGE_ALLOCATION); - KUNIT_EXPECT_EQ(test, err, -EINVAL); - - /* CONTIGUOUS + RANGE should return -EINVAL (no try_harder for RANGE) */ - err = drm_buddy_alloc_blocks(&mm, 0, mm_size, size, SZ_4K, &blocks, - DRM_BUDDY_CONTIGUOUS_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION); - KUNIT_EXPECT_EQ(test, err, -EINVAL); - - drm_buddy_fini(&mm); -} - -static int drm_buddy_suite_init(struct kunit_suite *suite) -{ - while (!random_seed) - random_seed = get_random_u32(); - - kunit_info(suite, "Testing DRM buddy manager, with random_seed=0x%x\n", - random_seed); - - return 0; -} - -static struct kunit_case drm_buddy_tests[] = { - KUNIT_CASE(drm_test_buddy_alloc_limit), - KUNIT_CASE(drm_test_buddy_alloc_optimistic), - KUNIT_CASE(drm_test_buddy_alloc_pessimistic), - KUNIT_CASE(drm_test_buddy_alloc_pathological), - KUNIT_CASE(drm_test_buddy_alloc_contiguous), - KUNIT_CASE(drm_test_buddy_alloc_clear), - KUNIT_CASE(drm_test_buddy_alloc_range_bias), - KUNIT_CASE(drm_test_buddy_fragmentation_performance), - KUNIT_CASE(drm_test_buddy_alloc_exceeds_max_order), - {} -}; - -static struct kunit_suite drm_buddy_test_suite = { - .name = "drm_buddy", - .suite_init = drm_buddy_suite_init, - .test_cases = drm_buddy_tests, -}; - -kunit_test_suite(drm_buddy_test_suite); - -MODULE_AUTHOR("Intel Corporation"); -MODULE_DESCRIPTION("Kunit test for drm_buddy functions"); -MODULE_LICENSE("GPL"); diff --git a/drivers/gpu/drm/tests/drm_exec_test.c b/drivers/gpu/drm/tests/drm_exec_test.c index 3a20c788c51f..2fc47f3b463b 100644 --- a/drivers/gpu/drm/tests/drm_exec_test.c +++ b/drivers/gpu/drm/tests/drm_exec_test.c @@ -16,8 +16,6 @@ #include #include -#include "../lib/drm_random.h" - struct drm_exec_priv { struct device *dev; struct drm_device *drm; diff --git a/drivers/gpu/drm/tests/drm_mm_test.c b/drivers/gpu/drm/tests/drm_mm_test.c index aec9eccdeae9..e24a619059d8 100644 --- a/drivers/gpu/drm/tests/drm_mm_test.c +++ b/drivers/gpu/drm/tests/drm_mm_test.c @@ -16,8 +16,6 @@ #include #include -#include "../lib/drm_random.h" - enum { BEST, BOTTOMUP, diff --git a/drivers/gpu/drm/ttm/tests/ttm_mock_manager.h b/drivers/gpu/drm/ttm/tests/ttm_mock_manager.h index e4c95f86a467..96ea8c9aae34 100644 --- a/drivers/gpu/drm/ttm/tests/ttm_mock_manager.h +++ b/drivers/gpu/drm/ttm/tests/ttm_mock_manager.h @@ -5,7 +5,7 @@ #ifndef TTM_MOCK_MANAGER_H #define TTM_MOCK_MANAGER_H -#include +#include struct ttm_mock_manager { struct ttm_resource_manager man; diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h b/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h index a71e14818ec2..babeec5511d9 100644 --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h @@ -6,7 +6,7 @@ #ifndef _XE_TTM_VRAM_MGR_TYPES_H_ #define _XE_TTM_VRAM_MGR_TYPES_H_ -#include +#include #include /** diff --git a/drivers/gpu/tests/Makefile b/drivers/gpu/tests/Makefile new file mode 100644 index 000000000000..8e7654e87d82 --- /dev/null +++ b/drivers/gpu/tests/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 + +gpu_buddy_tests-y = gpu_buddy_test.o gpu_random.o +obj-$(CONFIG_DRM_KUNIT_TEST) += gpu_buddy_tests.o diff --git a/drivers/gpu/tests/gpu_buddy_test.c b/drivers/gpu/tests/gpu_buddy_test.c new file mode 100644 index 000000000000..b905932da990 --- /dev/null +++ b/drivers/gpu/tests/gpu_buddy_test.c @@ -0,0 +1,928 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2019 Intel Corporation + * Copyright © 2022 Maíra Canal + */ + +#include + +#include +#include +#include + +#include + +#include "gpu_random.h" + +static unsigned int random_seed; + +static inline u64 get_size(int order, u64 chunk_size) +{ + return (1 << order) * chunk_size; +} + +static void drm_test_buddy_fragmentation_performance(struct kunit *test) +{ + struct drm_buddy_block *block, *tmp; + int num_blocks, i, ret, count = 0; + LIST_HEAD(allocated_blocks); + unsigned long elapsed_ms; + LIST_HEAD(reverse_list); + LIST_HEAD(test_blocks); + LIST_HEAD(clear_list); + LIST_HEAD(dirty_list); + LIST_HEAD(free_list); + struct drm_buddy mm; + u64 mm_size = SZ_4G; + ktime_t start, end; + + /* + * Allocation under severe fragmentation + * + * Create severe fragmentation by allocating the entire 4 GiB address space + * as tiny 8 KiB blocks but forcing a 64 KiB alignment. The resulting pattern + * leaves many scattered holes. Split the allocations into two groups and + * return them with different flags to block coalescing, then repeatedly + * allocate and free 64 KiB blocks while timing the loop. This stresses how + * quickly the allocator can satisfy larger, aligned requests from a pool of + * highly fragmented space. + */ + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + "buddy_init failed\n"); + + num_blocks = mm_size / SZ_64K; + + start = ktime_get(); + /* Allocate with maximum fragmentation - 8K blocks with 64K alignment */ + for (i = 0; i < num_blocks; i++) + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, SZ_8K, SZ_64K, + &allocated_blocks, 0), + "buddy_alloc hit an error size=%u\n", SZ_8K); + + list_for_each_entry_safe(block, tmp, &allocated_blocks, link) { + if (count % 4 == 0 || count % 4 == 3) + list_move_tail(&block->link, &clear_list); + else + list_move_tail(&block->link, &dirty_list); + count++; + } + + /* Free with different flags to ensure no coalescing */ + drm_buddy_free_list(&mm, &clear_list, DRM_BUDDY_CLEARED); + drm_buddy_free_list(&mm, &dirty_list, 0); + + for (i = 0; i < num_blocks; i++) + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, SZ_64K, SZ_64K, + &test_blocks, 0), + "buddy_alloc hit an error size=%u\n", SZ_64K); + drm_buddy_free_list(&mm, &test_blocks, 0); + + end = ktime_get(); + elapsed_ms = ktime_to_ms(ktime_sub(end, start)); + + kunit_info(test, "Fragmented allocation took %lu ms\n", elapsed_ms); + + drm_buddy_fini(&mm); + + /* + * Reverse free order under fragmentation + * + * Construct a fragmented 4 GiB space by allocating every 8 KiB block with + * 64 KiB alignment, creating a dense scatter of small regions. Half of the + * blocks are selectively freed to form sparse gaps, while the remaining + * allocations are preserved, reordered in reverse, and released back with + * the cleared flag. This models a pathological reverse-ordered free pattern + * and measures how quickly the allocator can merge and reclaim space when + * deallocation occurs in the opposite order of allocation, exposing the + * cost difference between a linear freelist scan and an ordered tree lookup. + */ + ret = drm_buddy_init(&mm, mm_size, SZ_4K); + KUNIT_ASSERT_EQ(test, ret, 0); + + start = ktime_get(); + /* Allocate maximum fragmentation */ + for (i = 0; i < num_blocks; i++) + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, SZ_8K, SZ_64K, + &allocated_blocks, 0), + "buddy_alloc hit an error size=%u\n", SZ_8K); + + list_for_each_entry_safe(block, tmp, &allocated_blocks, link) { + if (count % 2 == 0) + list_move_tail(&block->link, &free_list); + count++; + } + drm_buddy_free_list(&mm, &free_list, DRM_BUDDY_CLEARED); + + list_for_each_entry_safe_reverse(block, tmp, &allocated_blocks, link) + list_move(&block->link, &reverse_list); + drm_buddy_free_list(&mm, &reverse_list, DRM_BUDDY_CLEARED); + + end = ktime_get(); + elapsed_ms = ktime_to_ms(ktime_sub(end, start)); + + kunit_info(test, "Reverse-ordered free took %lu ms\n", elapsed_ms); + + drm_buddy_fini(&mm); +} + +static void drm_test_buddy_alloc_range_bias(struct kunit *test) +{ + u32 mm_size, size, ps, bias_size, bias_start, bias_end, bias_rem; + DRM_RND_STATE(prng, random_seed); + unsigned int i, count, *order; + struct drm_buddy_block *block; + unsigned long flags; + struct drm_buddy mm; + LIST_HEAD(allocated); + + bias_size = SZ_1M; + ps = roundup_pow_of_two(prandom_u32_state(&prng) % bias_size); + ps = max(SZ_4K, ps); + mm_size = (SZ_8M-1) & ~(ps-1); /* Multiple roots */ + + kunit_info(test, "mm_size=%u, ps=%u\n", mm_size, ps); + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps), + "buddy_init failed\n"); + + count = mm_size / bias_size; + order = drm_random_order(count, &prng); + KUNIT_EXPECT_TRUE(test, order); + + /* + * Idea is to split the address space into uniform bias ranges, and then + * in some random order allocate within each bias, using various + * patterns within. This should detect if allocations leak out from a + * given bias, for example. + */ + + for (i = 0; i < count; i++) { + LIST_HEAD(tmp); + u32 size; + + bias_start = order[i] * bias_size; + bias_end = bias_start + bias_size; + bias_rem = bias_size; + + /* internal round_up too big */ + KUNIT_ASSERT_TRUE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, + bias_end, bias_size + ps, bias_size, + &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, bias_size, bias_size); + + /* size too big */ + KUNIT_ASSERT_TRUE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, + bias_end, bias_size + ps, ps, + &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc didn't fail with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, bias_size + ps, ps); + + /* bias range too small for size */ + KUNIT_ASSERT_TRUE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start + ps, + bias_end, bias_size, ps, + &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc didn't fail with bias(%x-%x), size=%u, ps=%u\n", + bias_start + ps, bias_end, bias_size, ps); + + /* bias misaligned */ + KUNIT_ASSERT_TRUE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start + ps, + bias_end - ps, + bias_size >> 1, bias_size >> 1, + &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc h didn't fail with bias(%x-%x), size=%u, ps=%u\n", + bias_start + ps, bias_end - ps, bias_size >> 1, bias_size >> 1); + + /* single big page */ + KUNIT_ASSERT_FALSE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, + bias_end, bias_size, bias_size, + &tmp, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc i failed with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, bias_size, bias_size); + drm_buddy_free_list(&mm, &tmp, 0); + + /* single page with internal round_up */ + KUNIT_ASSERT_FALSE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, + bias_end, ps, bias_size, + &tmp, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, ps, bias_size); + drm_buddy_free_list(&mm, &tmp, 0); + + /* random size within */ + size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); + if (size) + KUNIT_ASSERT_FALSE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, + bias_end, size, ps, + &tmp, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, size, ps); + + bias_rem -= size; + /* too big for current avail */ + KUNIT_ASSERT_TRUE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, + bias_end, bias_rem + ps, ps, + &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc didn't fail with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, bias_rem + ps, ps); + + if (bias_rem) { + /* random fill of the remainder */ + size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); + size = max(size, ps); + + KUNIT_ASSERT_FALSE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, + bias_end, size, ps, + &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, size, ps); + /* + * Intentionally allow some space to be left + * unallocated, and ideally not always on the bias + * boundaries. + */ + drm_buddy_free_list(&mm, &tmp, 0); + } else { + list_splice_tail(&tmp, &allocated); + } + } + + kfree(order); + drm_buddy_free_list(&mm, &allocated, 0); + drm_buddy_fini(&mm); + + /* + * Something more free-form. Idea is to pick a random starting bias + * range within the address space and then start filling it up. Also + * randomly grow the bias range in both directions as we go along. This + * should give us bias start/end which is not always uniform like above, + * and in some cases will require the allocator to jump over already + * allocated nodes in the middle of the address space. + */ + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps), + "buddy_init failed\n"); + + bias_start = round_up(prandom_u32_state(&prng) % (mm_size - ps), ps); + bias_end = round_up(bias_start + prandom_u32_state(&prng) % (mm_size - bias_start), ps); + bias_end = max(bias_end, bias_start + ps); + bias_rem = bias_end - bias_start; + + do { + u32 size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); + + KUNIT_ASSERT_FALSE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, + bias_end, size, ps, + &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, size, ps); + bias_rem -= size; + + /* + * Try to randomly grow the bias range in both directions, or + * only one, or perhaps don't grow at all. + */ + do { + u32 old_bias_start = bias_start; + u32 old_bias_end = bias_end; + + if (bias_start) + bias_start -= round_up(prandom_u32_state(&prng) % bias_start, ps); + if (bias_end != mm_size) + bias_end += round_up(prandom_u32_state(&prng) % (mm_size - bias_end), ps); + + bias_rem += old_bias_start - bias_start; + bias_rem += bias_end - old_bias_end; + } while (!bias_rem && (bias_start || bias_end != mm_size)); + } while (bias_rem); + + KUNIT_ASSERT_EQ(test, bias_start, 0); + KUNIT_ASSERT_EQ(test, bias_end, mm_size); + KUNIT_ASSERT_TRUE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, bias_end, + ps, ps, + &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc passed with bias(%x-%x), size=%u\n", + bias_start, bias_end, ps); + + drm_buddy_free_list(&mm, &allocated, 0); + drm_buddy_fini(&mm); + + /* + * Allocate cleared blocks in the bias range when the DRM buddy's clear avail is + * zero. This will validate the bias range allocation in scenarios like system boot + * when no cleared blocks are available and exercise the fallback path too. The resulting + * blocks should always be dirty. + */ + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps), + "buddy_init failed\n"); + + bias_start = round_up(prandom_u32_state(&prng) % (mm_size - ps), ps); + bias_end = round_up(bias_start + prandom_u32_state(&prng) % (mm_size - bias_start), ps); + bias_end = max(bias_end, bias_start + ps); + bias_rem = bias_end - bias_start; + + flags = DRM_BUDDY_CLEAR_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION; + size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); + + KUNIT_ASSERT_FALSE_MSG(test, + drm_buddy_alloc_blocks(&mm, bias_start, + bias_end, size, ps, + &allocated, + flags), + "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", + bias_start, bias_end, size, ps); + + list_for_each_entry(block, &allocated, link) + KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); + + drm_buddy_free_list(&mm, &allocated, 0); + drm_buddy_fini(&mm); +} + +static void drm_test_buddy_alloc_clear(struct kunit *test) +{ + unsigned long n_pages, total, i = 0; + const unsigned long ps = SZ_4K; + struct drm_buddy_block *block; + const int max_order = 12; + LIST_HEAD(allocated); + struct drm_buddy mm; + unsigned int order; + u32 mm_size, size; + LIST_HEAD(dirty); + LIST_HEAD(clean); + + mm_size = SZ_4K << max_order; + KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); + + KUNIT_EXPECT_EQ(test, mm.max_order, max_order); + + /* + * Idea is to allocate and free some random portion of the address space, + * returning those pages as non-dirty and randomly alternate between + * requesting dirty and non-dirty pages (not going over the limit + * we freed as non-dirty), putting that into two separate lists. + * Loop over both lists at the end checking that the dirty list + * is indeed all dirty pages and vice versa. Free it all again, + * keeping the dirty/clear status. + */ + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + 5 * ps, ps, &allocated, + DRM_BUDDY_TOPDOWN_ALLOCATION), + "buddy_alloc hit an error size=%lu\n", 5 * ps); + drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); + + n_pages = 10; + do { + unsigned long flags; + struct list_head *list; + int slot = i % 2; + + if (slot == 0) { + list = &dirty; + flags = 0; + } else { + list = &clean; + flags = DRM_BUDDY_CLEAR_ALLOCATION; + } + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + ps, ps, list, + flags), + "buddy_alloc hit an error size=%lu\n", ps); + } while (++i < n_pages); + + list_for_each_entry(block, &clean, link) + KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), true); + + list_for_each_entry(block, &dirty, link) + KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); + + drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED); + + /* + * Trying to go over the clear limit for some allocation. + * The allocation should never fail with reasonable page-size. + */ + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + 10 * ps, ps, &clean, + DRM_BUDDY_CLEAR_ALLOCATION), + "buddy_alloc hit an error size=%lu\n", 10 * ps); + + drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED); + drm_buddy_free_list(&mm, &dirty, 0); + drm_buddy_fini(&mm); + + KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); + + /* + * Create a new mm. Intentionally fragment the address space by creating + * two alternating lists. Free both lists, one as dirty the other as clean. + * Try to allocate double the previous size with matching min_page_size. The + * allocation should never fail as it calls the force_merge. Also check that + * the page is always dirty after force_merge. Free the page as dirty, then + * repeat the whole thing, increment the order until we hit the max_order. + */ + + i = 0; + n_pages = mm_size / ps; + do { + struct list_head *list; + int slot = i % 2; + + if (slot == 0) + list = &dirty; + else + list = &clean; + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + ps, ps, list, 0), + "buddy_alloc hit an error size=%lu\n", ps); + } while (++i < n_pages); + + drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED); + drm_buddy_free_list(&mm, &dirty, 0); + + order = 1; + do { + size = SZ_4K << order; + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + size, size, &allocated, + DRM_BUDDY_CLEAR_ALLOCATION), + "buddy_alloc hit an error size=%u\n", size); + total = 0; + list_for_each_entry(block, &allocated, link) { + if (size != mm_size) + KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); + total += drm_buddy_block_size(&mm, block); + } + KUNIT_EXPECT_EQ(test, total, size); + + drm_buddy_free_list(&mm, &allocated, 0); + } while (++order <= max_order); + + drm_buddy_fini(&mm); + + /* + * Create a new mm with a non power-of-two size. Allocate a random size from each + * root, free as cleared and then call fini. This will ensure the multi-root + * force merge during fini. + */ + mm_size = (SZ_4K << max_order) + (SZ_4K << (max_order - 2)); + + KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); + KUNIT_EXPECT_EQ(test, mm.max_order, max_order); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, SZ_4K << max_order, + 4 * ps, ps, &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc hit an error size=%lu\n", 4 * ps); + drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, SZ_4K << max_order, + 2 * ps, ps, &allocated, + DRM_BUDDY_CLEAR_ALLOCATION), + "buddy_alloc hit an error size=%lu\n", 2 * ps); + drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, SZ_4K << max_order, mm_size, + ps, ps, &allocated, + DRM_BUDDY_RANGE_ALLOCATION), + "buddy_alloc hit an error size=%lu\n", ps); + drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); + drm_buddy_fini(&mm); +} + +static void drm_test_buddy_alloc_contiguous(struct kunit *test) +{ + const unsigned long ps = SZ_4K, mm_size = 16 * 3 * SZ_4K; + unsigned long i, n_pages, total; + struct drm_buddy_block *block; + struct drm_buddy mm; + LIST_HEAD(left); + LIST_HEAD(middle); + LIST_HEAD(right); + LIST_HEAD(allocated); + + KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); + + /* + * Idea is to fragment the address space by alternating block + * allocations between three different lists; one for left, middle and + * right. We can then free a list to simulate fragmentation. In + * particular we want to exercise the DRM_BUDDY_CONTIGUOUS_ALLOCATION, + * including the try_harder path. + */ + + i = 0; + n_pages = mm_size / ps; + do { + struct list_head *list; + int slot = i % 3; + + if (slot == 0) + list = &left; + else if (slot == 1) + list = &middle; + else + list = &right; + KUNIT_ASSERT_FALSE_MSG(test, + drm_buddy_alloc_blocks(&mm, 0, mm_size, + ps, ps, list, 0), + "buddy_alloc hit an error size=%lu\n", + ps); + } while (++i < n_pages); + + KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + 3 * ps, ps, &allocated, + DRM_BUDDY_CONTIGUOUS_ALLOCATION), + "buddy_alloc didn't error size=%lu\n", 3 * ps); + + drm_buddy_free_list(&mm, &middle, 0); + KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + 3 * ps, ps, &allocated, + DRM_BUDDY_CONTIGUOUS_ALLOCATION), + "buddy_alloc didn't error size=%lu\n", 3 * ps); + KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + 2 * ps, ps, &allocated, + DRM_BUDDY_CONTIGUOUS_ALLOCATION), + "buddy_alloc didn't error size=%lu\n", 2 * ps); + + drm_buddy_free_list(&mm, &right, 0); + KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + 3 * ps, ps, &allocated, + DRM_BUDDY_CONTIGUOUS_ALLOCATION), + "buddy_alloc didn't error size=%lu\n", 3 * ps); + /* + * At this point we should have enough contiguous space for 2 blocks, + * however they are never buddies (since we freed middle and right) so + * will require the try_harder logic to find them. + */ + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + 2 * ps, ps, &allocated, + DRM_BUDDY_CONTIGUOUS_ALLOCATION), + "buddy_alloc hit an error size=%lu\n", 2 * ps); + + drm_buddy_free_list(&mm, &left, 0); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + 3 * ps, ps, &allocated, + DRM_BUDDY_CONTIGUOUS_ALLOCATION), + "buddy_alloc hit an error size=%lu\n", 3 * ps); + + total = 0; + list_for_each_entry(block, &allocated, link) + total += drm_buddy_block_size(&mm, block); + + KUNIT_ASSERT_EQ(test, total, ps * 2 + ps * 3); + + drm_buddy_free_list(&mm, &allocated, 0); + drm_buddy_fini(&mm); +} + +static void drm_test_buddy_alloc_pathological(struct kunit *test) +{ + u64 mm_size, size, start = 0; + struct drm_buddy_block *block; + const int max_order = 3; + unsigned long flags = 0; + int order, top; + struct drm_buddy mm; + LIST_HEAD(blocks); + LIST_HEAD(holes); + LIST_HEAD(tmp); + + /* + * Create a pot-sized mm, then allocate one of each possible + * order within. This should leave the mm with exactly one + * page left. Free the largest block, then whittle down again. + * Eventually we will have a fully 50% fragmented mm. + */ + + mm_size = SZ_4K << max_order; + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + "buddy_init failed\n"); + + KUNIT_EXPECT_EQ(test, mm.max_order, max_order); + + for (top = max_order; top; top--) { + /* Make room by freeing the largest allocated block */ + block = list_first_entry_or_null(&blocks, typeof(*block), link); + if (block) { + list_del(&block->link); + drm_buddy_free_block(&mm, block); + } + + for (order = top; order--;) { + size = get_size(order, mm.chunk_size); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, + mm_size, size, size, + &tmp, flags), + "buddy_alloc hit -ENOMEM with order=%d, top=%d\n", + order, top); + + block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); + + list_move_tail(&block->link, &blocks); + } + + /* There should be one final page for this sub-allocation */ + size = get_size(0, mm.chunk_size); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc hit -ENOMEM for hole\n"); + + block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); + + list_move_tail(&block->link, &holes); + + size = get_size(top, mm.chunk_size); + KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc unexpectedly succeeded at top-order %d/%d, it should be full!", + top, max_order); + } + + drm_buddy_free_list(&mm, &holes, 0); + + /* Nothing larger than blocks of chunk_size now available */ + for (order = 1; order <= max_order; order++) { + size = get_size(order, mm.chunk_size); + KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc unexpectedly succeeded at order %d, it should be full!", + order); + } + + list_splice_tail(&holes, &blocks); + drm_buddy_free_list(&mm, &blocks, 0); + drm_buddy_fini(&mm); +} + +static void drm_test_buddy_alloc_pessimistic(struct kunit *test) +{ + u64 mm_size, size, start = 0; + struct drm_buddy_block *block, *bn; + const unsigned int max_order = 16; + unsigned long flags = 0; + struct drm_buddy mm; + unsigned int order; + LIST_HEAD(blocks); + LIST_HEAD(tmp); + + /* + * Create a pot-sized mm, then allocate one of each possible + * order within. This should leave the mm with exactly one + * page left. + */ + + mm_size = SZ_4K << max_order; + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + "buddy_init failed\n"); + + KUNIT_EXPECT_EQ(test, mm.max_order, max_order); + + for (order = 0; order < max_order; order++) { + size = get_size(order, mm.chunk_size); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc hit -ENOMEM with order=%d\n", + order); + + block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); + + list_move_tail(&block->link, &blocks); + } + + /* And now the last remaining block available */ + size = get_size(0, mm.chunk_size); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc hit -ENOMEM on final alloc\n"); + + block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); + + list_move_tail(&block->link, &blocks); + + /* Should be completely full! */ + for (order = max_order; order--;) { + size = get_size(order, mm.chunk_size); + KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc unexpectedly succeeded, it should be full!"); + } + + block = list_last_entry(&blocks, typeof(*block), link); + list_del(&block->link); + drm_buddy_free_block(&mm, block); + + /* As we free in increasing size, we make available larger blocks */ + order = 1; + list_for_each_entry_safe(block, bn, &blocks, link) { + list_del(&block->link); + drm_buddy_free_block(&mm, block); + + size = get_size(order, mm.chunk_size); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc hit -ENOMEM with order=%d\n", + order); + + block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); + + list_del(&block->link); + drm_buddy_free_block(&mm, block); + order++; + } + + /* To confirm, now the whole mm should be available */ + size = get_size(max_order, mm.chunk_size); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc (realloc) hit -ENOMEM with order=%d\n", + max_order); + + block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); + + list_del(&block->link); + drm_buddy_free_block(&mm, block); + drm_buddy_free_list(&mm, &blocks, 0); + drm_buddy_fini(&mm); +} + +static void drm_test_buddy_alloc_optimistic(struct kunit *test) +{ + u64 mm_size, size, start = 0; + struct drm_buddy_block *block; + unsigned long flags = 0; + const int max_order = 16; + struct drm_buddy mm; + LIST_HEAD(blocks); + LIST_HEAD(tmp); + int order; + + /* + * Create a mm with one block of each order available, and + * try to allocate them all. + */ + + mm_size = SZ_4K * ((1 << (max_order + 1)) - 1); + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + "buddy_init failed\n"); + + KUNIT_EXPECT_EQ(test, mm.max_order, max_order); + + for (order = 0; order <= max_order; order++) { + size = get_size(order, mm.chunk_size); + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc hit -ENOMEM with order=%d\n", + order); + + block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); + + list_move_tail(&block->link, &blocks); + } + + /* Should be completely full! */ + size = get_size(0, mm.chunk_size); + KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + size, size, &tmp, flags), + "buddy_alloc unexpectedly succeeded, it should be full!"); + + drm_buddy_free_list(&mm, &blocks, 0); + drm_buddy_fini(&mm); +} + +static void drm_test_buddy_alloc_limit(struct kunit *test) +{ + u64 size = U64_MAX, start = 0; + struct drm_buddy_block *block; + unsigned long flags = 0; + LIST_HEAD(allocated); + struct drm_buddy mm; + + KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, size, SZ_4K)); + + KUNIT_EXPECT_EQ_MSG(test, mm.max_order, DRM_BUDDY_MAX_ORDER, + "mm.max_order(%d) != %d\n", mm.max_order, + DRM_BUDDY_MAX_ORDER); + + size = mm.chunk_size << mm.max_order; + KUNIT_EXPECT_FALSE(test, drm_buddy_alloc_blocks(&mm, start, size, size, + mm.chunk_size, &allocated, flags)); + + block = list_first_entry_or_null(&allocated, struct drm_buddy_block, link); + KUNIT_EXPECT_TRUE(test, block); + + KUNIT_EXPECT_EQ_MSG(test, drm_buddy_block_order(block), mm.max_order, + "block order(%d) != %d\n", + drm_buddy_block_order(block), mm.max_order); + + KUNIT_EXPECT_EQ_MSG(test, drm_buddy_block_size(&mm, block), + BIT_ULL(mm.max_order) * mm.chunk_size, + "block size(%llu) != %llu\n", + drm_buddy_block_size(&mm, block), + BIT_ULL(mm.max_order) * mm.chunk_size); + + drm_buddy_free_list(&mm, &allocated, 0); + drm_buddy_fini(&mm); +} + +static void drm_test_buddy_alloc_exceeds_max_order(struct kunit *test) +{ + u64 mm_size = SZ_8G + SZ_2G, size = SZ_8G + SZ_1G, min_block_size = SZ_8G; + struct drm_buddy mm; + LIST_HEAD(blocks); + int err; + + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + "buddy_init failed\n"); + + /* CONTIGUOUS allocation should succeed via try_harder fallback */ + KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, size, + SZ_4K, &blocks, + DRM_BUDDY_CONTIGUOUS_ALLOCATION), + "buddy_alloc hit an error size=%llu\n", size); + drm_buddy_free_list(&mm, &blocks, 0); + + /* Non-CONTIGUOUS with large min_block_size should return -EINVAL */ + err = drm_buddy_alloc_blocks(&mm, 0, mm_size, size, min_block_size, &blocks, 0); + KUNIT_EXPECT_EQ(test, err, -EINVAL); + + /* Non-CONTIGUOUS + RANGE with large min_block_size should return -EINVAL */ + err = drm_buddy_alloc_blocks(&mm, 0, mm_size, size, min_block_size, &blocks, + DRM_BUDDY_RANGE_ALLOCATION); + KUNIT_EXPECT_EQ(test, err, -EINVAL); + + /* CONTIGUOUS + RANGE should return -EINVAL (no try_harder for RANGE) */ + err = drm_buddy_alloc_blocks(&mm, 0, mm_size, size, SZ_4K, &blocks, + DRM_BUDDY_CONTIGUOUS_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION); + KUNIT_EXPECT_EQ(test, err, -EINVAL); + + drm_buddy_fini(&mm); +} + +static int drm_buddy_suite_init(struct kunit_suite *suite) +{ + while (!random_seed) + random_seed = get_random_u32(); + + kunit_info(suite, "Testing DRM buddy manager, with random_seed=0x%x\n", + random_seed); + + return 0; +} + +static struct kunit_case drm_buddy_tests[] = { + KUNIT_CASE(drm_test_buddy_alloc_limit), + KUNIT_CASE(drm_test_buddy_alloc_optimistic), + KUNIT_CASE(drm_test_buddy_alloc_pessimistic), + KUNIT_CASE(drm_test_buddy_alloc_pathological), + KUNIT_CASE(drm_test_buddy_alloc_contiguous), + KUNIT_CASE(drm_test_buddy_alloc_clear), + KUNIT_CASE(drm_test_buddy_alloc_range_bias), + KUNIT_CASE(drm_test_buddy_fragmentation_performance), + KUNIT_CASE(drm_test_buddy_alloc_exceeds_max_order), + {} +}; + +static struct kunit_suite drm_buddy_test_suite = { + .name = "drm_buddy", + .suite_init = drm_buddy_suite_init, + .test_cases = drm_buddy_tests, +}; + +kunit_test_suite(drm_buddy_test_suite); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("Kunit test for drm_buddy functions"); +MODULE_LICENSE("GPL"); diff --git a/drivers/gpu/tests/gpu_random.c b/drivers/gpu/tests/gpu_random.c new file mode 100644 index 000000000000..ddd1f594b5d5 --- /dev/null +++ b/drivers/gpu/tests/gpu_random.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include + +#include "gpu_random.h" + +u32 drm_prandom_u32_max_state(u32 ep_ro, struct rnd_state *state) +{ + return upper_32_bits((u64)prandom_u32_state(state) * ep_ro); +} +EXPORT_SYMBOL(drm_prandom_u32_max_state); + +void drm_random_reorder(unsigned int *order, unsigned int count, + struct rnd_state *state) +{ + unsigned int i, j; + + for (i = 0; i < count; ++i) { + BUILD_BUG_ON(sizeof(unsigned int) > sizeof(u32)); + j = drm_prandom_u32_max_state(count, state); + swap(order[i], order[j]); + } +} +EXPORT_SYMBOL(drm_random_reorder); + +unsigned int *drm_random_order(unsigned int count, struct rnd_state *state) +{ + unsigned int *order, i; + + order = kmalloc_array(count, sizeof(*order), GFP_KERNEL); + if (!order) + return order; + + for (i = 0; i < count; i++) + order[i] = i; + + drm_random_reorder(order, count, state); + return order; +} +EXPORT_SYMBOL(drm_random_order); diff --git a/drivers/gpu/tests/gpu_random.h b/drivers/gpu/tests/gpu_random.h new file mode 100644 index 000000000000..9f827260a89d --- /dev/null +++ b/drivers/gpu/tests/gpu_random.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __DRM_RANDOM_H__ +#define __DRM_RANDOM_H__ + +/* This is a temporary home for a couple of utility functions that should + * be transposed to lib/ at the earliest convenience. + */ + +#include + +#define DRM_RND_STATE_INITIALIZER(seed__) ({ \ + struct rnd_state state__; \ + prandom_seed_state(&state__, (seed__)); \ + state__; \ +}) + +#define DRM_RND_STATE(name__, seed__) \ + struct rnd_state name__ = DRM_RND_STATE_INITIALIZER(seed__) + +unsigned int *drm_random_order(unsigned int count, + struct rnd_state *state); +void drm_random_reorder(unsigned int *order, + unsigned int count, + struct rnd_state *state); +u32 drm_prandom_u32_max_state(u32 ep_ro, + struct rnd_state *state); + +#endif /* !__DRM_RANDOM_H__ */ diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h deleted file mode 100644 index b909fa8f810a..000000000000 --- a/include/drm/drm_buddy.h +++ /dev/null @@ -1,171 +0,0 @@ -/* SPDX-License-Identifier: MIT */ -/* - * Copyright © 2021 Intel Corporation - */ - -#ifndef __DRM_BUDDY_H__ -#define __DRM_BUDDY_H__ - -#include -#include -#include -#include -#include - -struct drm_printer; - -#define DRM_BUDDY_RANGE_ALLOCATION BIT(0) -#define DRM_BUDDY_TOPDOWN_ALLOCATION BIT(1) -#define DRM_BUDDY_CONTIGUOUS_ALLOCATION BIT(2) -#define DRM_BUDDY_CLEAR_ALLOCATION BIT(3) -#define DRM_BUDDY_CLEARED BIT(4) -#define DRM_BUDDY_TRIM_DISABLE BIT(5) - -struct drm_buddy_block { -#define DRM_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12) -#define DRM_BUDDY_HEADER_STATE GENMASK_ULL(11, 10) -#define DRM_BUDDY_ALLOCATED (1 << 10) -#define DRM_BUDDY_FREE (2 << 10) -#define DRM_BUDDY_SPLIT (3 << 10) -#define DRM_BUDDY_HEADER_CLEAR GENMASK_ULL(9, 9) -/* Free to be used, if needed in the future */ -#define DRM_BUDDY_HEADER_UNUSED GENMASK_ULL(8, 6) -#define DRM_BUDDY_HEADER_ORDER GENMASK_ULL(5, 0) - u64 header; - - struct drm_buddy_block *left; - struct drm_buddy_block *right; - struct drm_buddy_block *parent; - - void *private; /* owned by creator */ - - /* - * While the block is allocated by the user through drm_buddy_alloc*, - * the user has ownership of the link, for example to maintain within - * a list, if so desired. As soon as the block is freed with - * drm_buddy_free* ownership is given back to the mm. - */ - union { - struct rb_node rb; - struct list_head link; - }; - - struct list_head tmp_link; -}; - -/* Order-zero must be at least SZ_4K */ -#define DRM_BUDDY_MAX_ORDER (63 - 12) - -/* - * Binary Buddy System. - * - * Locking should be handled by the user, a simple mutex around - * drm_buddy_alloc* and drm_buddy_free* should suffice. - */ -struct drm_buddy { - /* Maintain a free list for each order. */ - struct rb_root **free_trees; - - /* - * Maintain explicit binary tree(s) to track the allocation of the - * address space. This gives us a simple way of finding a buddy block - * and performing the potentially recursive merge step when freeing a - * block. Nodes are either allocated or free, in which case they will - * also exist on the respective free list. - */ - struct drm_buddy_block **roots; - - /* - * Anything from here is public, and remains static for the lifetime of - * the mm. Everything above is considered do-not-touch. - */ - unsigned int n_roots; - unsigned int max_order; - - /* Must be at least SZ_4K */ - u64 chunk_size; - u64 size; - u64 avail; - u64 clear_avail; -}; - -static inline u64 -drm_buddy_block_offset(const struct drm_buddy_block *block) -{ - return block->header & DRM_BUDDY_HEADER_OFFSET; -} - -static inline unsigned int -drm_buddy_block_order(struct drm_buddy_block *block) -{ - return block->header & DRM_BUDDY_HEADER_ORDER; -} - -static inline unsigned int -drm_buddy_block_state(struct drm_buddy_block *block) -{ - return block->header & DRM_BUDDY_HEADER_STATE; -} - -static inline bool -drm_buddy_block_is_allocated(struct drm_buddy_block *block) -{ - return drm_buddy_block_state(block) == DRM_BUDDY_ALLOCATED; -} - -static inline bool -drm_buddy_block_is_clear(struct drm_buddy_block *block) -{ - return block->header & DRM_BUDDY_HEADER_CLEAR; -} - -static inline bool -drm_buddy_block_is_free(struct drm_buddy_block *block) -{ - return drm_buddy_block_state(block) == DRM_BUDDY_FREE; -} - -static inline bool -drm_buddy_block_is_split(struct drm_buddy_block *block) -{ - return drm_buddy_block_state(block) == DRM_BUDDY_SPLIT; -} - -static inline u64 -drm_buddy_block_size(struct drm_buddy *mm, - struct drm_buddy_block *block) -{ - return mm->chunk_size << drm_buddy_block_order(block); -} - -int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size); - -void drm_buddy_fini(struct drm_buddy *mm); - -struct drm_buddy_block * -drm_get_buddy(struct drm_buddy_block *block); - -int drm_buddy_alloc_blocks(struct drm_buddy *mm, - u64 start, u64 end, u64 size, - u64 min_page_size, - struct list_head *blocks, - unsigned long flags); - -int drm_buddy_block_trim(struct drm_buddy *mm, - u64 *start, - u64 new_size, - struct list_head *blocks); - -void drm_buddy_reset_clear(struct drm_buddy *mm, bool is_clear); - -void drm_buddy_free_block(struct drm_buddy *mm, struct drm_buddy_block *block); - -void drm_buddy_free_list(struct drm_buddy *mm, - struct list_head *objects, - unsigned int flags); - -void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p); -void drm_buddy_block_print(struct drm_buddy *mm, - struct drm_buddy_block *block, - struct drm_printer *p); -#endif diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h new file mode 100644 index 000000000000..b909fa8f810a --- /dev/null +++ b/include/linux/gpu_buddy.h @@ -0,0 +1,171 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef __DRM_BUDDY_H__ +#define __DRM_BUDDY_H__ + +#include +#include +#include +#include +#include + +struct drm_printer; + +#define DRM_BUDDY_RANGE_ALLOCATION BIT(0) +#define DRM_BUDDY_TOPDOWN_ALLOCATION BIT(1) +#define DRM_BUDDY_CONTIGUOUS_ALLOCATION BIT(2) +#define DRM_BUDDY_CLEAR_ALLOCATION BIT(3) +#define DRM_BUDDY_CLEARED BIT(4) +#define DRM_BUDDY_TRIM_DISABLE BIT(5) + +struct drm_buddy_block { +#define DRM_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12) +#define DRM_BUDDY_HEADER_STATE GENMASK_ULL(11, 10) +#define DRM_BUDDY_ALLOCATED (1 << 10) +#define DRM_BUDDY_FREE (2 << 10) +#define DRM_BUDDY_SPLIT (3 << 10) +#define DRM_BUDDY_HEADER_CLEAR GENMASK_ULL(9, 9) +/* Free to be used, if needed in the future */ +#define DRM_BUDDY_HEADER_UNUSED GENMASK_ULL(8, 6) +#define DRM_BUDDY_HEADER_ORDER GENMASK_ULL(5, 0) + u64 header; + + struct drm_buddy_block *left; + struct drm_buddy_block *right; + struct drm_buddy_block *parent; + + void *private; /* owned by creator */ + + /* + * While the block is allocated by the user through drm_buddy_alloc*, + * the user has ownership of the link, for example to maintain within + * a list, if so desired. As soon as the block is freed with + * drm_buddy_free* ownership is given back to the mm. + */ + union { + struct rb_node rb; + struct list_head link; + }; + + struct list_head tmp_link; +}; + +/* Order-zero must be at least SZ_4K */ +#define DRM_BUDDY_MAX_ORDER (63 - 12) + +/* + * Binary Buddy System. + * + * Locking should be handled by the user, a simple mutex around + * drm_buddy_alloc* and drm_buddy_free* should suffice. + */ +struct drm_buddy { + /* Maintain a free list for each order. */ + struct rb_root **free_trees; + + /* + * Maintain explicit binary tree(s) to track the allocation of the + * address space. This gives us a simple way of finding a buddy block + * and performing the potentially recursive merge step when freeing a + * block. Nodes are either allocated or free, in which case they will + * also exist on the respective free list. + */ + struct drm_buddy_block **roots; + + /* + * Anything from here is public, and remains static for the lifetime of + * the mm. Everything above is considered do-not-touch. + */ + unsigned int n_roots; + unsigned int max_order; + + /* Must be at least SZ_4K */ + u64 chunk_size; + u64 size; + u64 avail; + u64 clear_avail; +}; + +static inline u64 +drm_buddy_block_offset(const struct drm_buddy_block *block) +{ + return block->header & DRM_BUDDY_HEADER_OFFSET; +} + +static inline unsigned int +drm_buddy_block_order(struct drm_buddy_block *block) +{ + return block->header & DRM_BUDDY_HEADER_ORDER; +} + +static inline unsigned int +drm_buddy_block_state(struct drm_buddy_block *block) +{ + return block->header & DRM_BUDDY_HEADER_STATE; +} + +static inline bool +drm_buddy_block_is_allocated(struct drm_buddy_block *block) +{ + return drm_buddy_block_state(block) == DRM_BUDDY_ALLOCATED; +} + +static inline bool +drm_buddy_block_is_clear(struct drm_buddy_block *block) +{ + return block->header & DRM_BUDDY_HEADER_CLEAR; +} + +static inline bool +drm_buddy_block_is_free(struct drm_buddy_block *block) +{ + return drm_buddy_block_state(block) == DRM_BUDDY_FREE; +} + +static inline bool +drm_buddy_block_is_split(struct drm_buddy_block *block) +{ + return drm_buddy_block_state(block) == DRM_BUDDY_SPLIT; +} + +static inline u64 +drm_buddy_block_size(struct drm_buddy *mm, + struct drm_buddy_block *block) +{ + return mm->chunk_size << drm_buddy_block_order(block); +} + +int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size); + +void drm_buddy_fini(struct drm_buddy *mm); + +struct drm_buddy_block * +drm_get_buddy(struct drm_buddy_block *block); + +int drm_buddy_alloc_blocks(struct drm_buddy *mm, + u64 start, u64 end, u64 size, + u64 min_page_size, + struct list_head *blocks, + unsigned long flags); + +int drm_buddy_block_trim(struct drm_buddy *mm, + u64 *start, + u64 new_size, + struct list_head *blocks); + +void drm_buddy_reset_clear(struct drm_buddy *mm, bool is_clear); + +void drm_buddy_free_block(struct drm_buddy *mm, struct drm_buddy_block *block); + +void drm_buddy_free_list(struct drm_buddy *mm, + struct list_head *objects, + unsigned int flags); + +void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p); +void drm_buddy_block_print(struct drm_buddy *mm, + struct drm_buddy_block *block, + struct drm_printer *p); +#endif -- cgit v1.2.3 From ba110db8e1bc206c13fd7d985e79b033f53bfdea Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 6 Feb 2026 08:52:38 +1000 Subject: gpu: Move DRM buddy allocator one level up (part two) Move the DRM buddy allocator one level up so that it can be used by GPU drivers (example, nova-core) that have usecases other than DRM (such as VFIO vGPU support). Modify the API, structures and Kconfigs to use "gpu_buddy" terminology. Adapt the drivers and tests to use the new API. The commit cannot be split due to bisectability, however no functional change is intended. Verified by running K-UNIT tests and build tested various configurations. Signed-off-by: Joel Fernandes Reviewed-by: Dave Airlie [airlied: I've split this into two so git can find copies easier. I've also just nuked drm_random library, that stuff needs to be done elsewhere and only the buddy tests seem to be using it]. Signed-off-by: Dave Airlie --- Documentation/gpu/drm-mm.rst | 6 + MAINTAINERS | 8 +- drivers/gpu/Kconfig | 13 + drivers/gpu/Makefile | 1 + drivers/gpu/buddy.c | 556 ++++++++++----------- drivers/gpu/drm/Kconfig | 1 + drivers/gpu/drm/Makefile | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h | 12 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 79 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h | 18 +- drivers/gpu/drm/drm_buddy.c | 77 +++ drivers/gpu/drm/i915/i915_scatterlist.c | 8 +- drivers/gpu/drm/i915/i915_ttm_buddy_manager.c | 55 +- drivers/gpu/drm/i915/i915_ttm_buddy_manager.h | 4 +- .../gpu/drm/i915/selftests/intel_memory_region.c | 20 +- drivers/gpu/drm/ttm/tests/ttm_bo_validate_test.c | 4 +- drivers/gpu/drm/ttm/tests/ttm_mock_manager.c | 18 +- drivers/gpu/drm/ttm/tests/ttm_mock_manager.h | 2 +- drivers/gpu/drm/xe/xe_res_cursor.h | 34 +- drivers/gpu/drm/xe/xe_svm.c | 12 +- drivers/gpu/drm/xe/xe_ttm_vram_mgr.c | 71 +-- drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h | 2 +- drivers/gpu/tests/Makefile | 2 +- drivers/gpu/tests/gpu_buddy_test.c | 412 +++++++-------- drivers/gpu/tests/gpu_random.c | 16 +- drivers/gpu/tests/gpu_random.h | 18 +- drivers/video/Kconfig | 1 + include/drm/drm_buddy.h | 18 + include/linux/gpu_buddy.h | 124 ++--- 30 files changed, 855 insertions(+), 741 deletions(-) create mode 100644 drivers/gpu/Kconfig create mode 100644 drivers/gpu/drm/drm_buddy.c create mode 100644 include/drm/drm_buddy.h (limited to 'include/linux') diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst index ceee0e663237..32fb506db05b 100644 --- a/Documentation/gpu/drm-mm.rst +++ b/Documentation/gpu/drm-mm.rst @@ -532,6 +532,12 @@ Buddy Allocator Function References (GPU buddy) .. kernel-doc:: drivers/gpu/buddy.c :export: +DRM Buddy Specific Logging Function References +---------------------------------------------- + +.. kernel-doc:: drivers/gpu/drm/drm_buddy.c + :export: + DRM Cache Handling and Fast WC memcpy() ======================================= diff --git a/MAINTAINERS b/MAINTAINERS index 086cbf5c36b3..f2bec2c0d7e3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8797,15 +8797,17 @@ T: git https://gitlab.freedesktop.org/drm/misc/kernel.git F: drivers/gpu/drm/ttm/ F: include/drm/ttm/ -DRM BUDDY ALLOCATOR +GPU BUDDY ALLOCATOR M: Matthew Auld M: Arun Pravin R: Christian Koenig L: dri-devel@lists.freedesktop.org S: Maintained T: git https://gitlab.freedesktop.org/drm/misc/kernel.git -F: drivers/gpu/drm/drm_buddy.c -F: drivers/gpu/drm/tests/drm_buddy_test.c +F: drivers/gpu/drm_buddy.c +F: drivers/gpu/buddy.c +F: drivers/gpu/tests/gpu_buddy_test.c +F: include/linux/gpu_buddy.h F: include/drm/drm_buddy.h DRM AUTOMATED TESTING diff --git a/drivers/gpu/Kconfig b/drivers/gpu/Kconfig new file mode 100644 index 000000000000..ebb2ad4b7ea0 --- /dev/null +++ b/drivers/gpu/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 + +config GPU_BUDDY + bool + help + A page based buddy allocator for GPU memory. + +config GPU_BUDDY_KUNIT_TEST + tristate "KUnit tests for GPU buddy allocator" if !KUNIT_ALL_TESTS + depends on GPU_BUDDY && KUNIT + default KUNIT_ALL_TESTS + help + KUnit tests for the GPU buddy allocator. diff --git a/drivers/gpu/Makefile b/drivers/gpu/Makefile index c5292ee2c852..5cd54d06e262 100644 --- a/drivers/gpu/Makefile +++ b/drivers/gpu/Makefile @@ -6,3 +6,4 @@ obj-y += host1x/ drm/ vga/ tests/ obj-$(CONFIG_IMX_IPUV3_CORE) += ipu-v3/ obj-$(CONFIG_TRACE_GPU_MEM) += trace/ obj-$(CONFIG_NOVA_CORE) += nova-core/ +obj-$(CONFIG_GPU_BUDDY) += buddy.o diff --git a/drivers/gpu/buddy.c b/drivers/gpu/buddy.c index 4cc63d961d26..603c59a2013a 100644 --- a/drivers/gpu/buddy.c +++ b/drivers/gpu/buddy.c @@ -11,27 +11,17 @@ #include #include -#include - -enum drm_buddy_free_tree { - DRM_BUDDY_CLEAR_TREE = 0, - DRM_BUDDY_DIRTY_TREE, - DRM_BUDDY_MAX_FREE_TREES, -}; static struct kmem_cache *slab_blocks; -#define for_each_free_tree(tree) \ - for ((tree) = 0; (tree) < DRM_BUDDY_MAX_FREE_TREES; (tree)++) - -static struct drm_buddy_block *drm_block_alloc(struct drm_buddy *mm, - struct drm_buddy_block *parent, +static struct gpu_buddy_block *gpu_block_alloc(struct gpu_buddy *mm, + struct gpu_buddy_block *parent, unsigned int order, u64 offset) { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; - BUG_ON(order > DRM_BUDDY_MAX_ORDER); + BUG_ON(order > GPU_BUDDY_MAX_ORDER); block = kmem_cache_zalloc(slab_blocks, GFP_KERNEL); if (!block) @@ -43,30 +33,30 @@ static struct drm_buddy_block *drm_block_alloc(struct drm_buddy *mm, RB_CLEAR_NODE(&block->rb); - BUG_ON(block->header & DRM_BUDDY_HEADER_UNUSED); + BUG_ON(block->header & GPU_BUDDY_HEADER_UNUSED); return block; } -static void drm_block_free(struct drm_buddy *mm, - struct drm_buddy_block *block) +static void gpu_block_free(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { kmem_cache_free(slab_blocks, block); } -static enum drm_buddy_free_tree -get_block_tree(struct drm_buddy_block *block) +static enum gpu_buddy_free_tree +get_block_tree(struct gpu_buddy_block *block) { - return drm_buddy_block_is_clear(block) ? - DRM_BUDDY_CLEAR_TREE : DRM_BUDDY_DIRTY_TREE; + return gpu_buddy_block_is_clear(block) ? + GPU_BUDDY_CLEAR_TREE : GPU_BUDDY_DIRTY_TREE; } -static struct drm_buddy_block * +static struct gpu_buddy_block * rbtree_get_free_block(const struct rb_node *node) { - return node ? rb_entry(node, struct drm_buddy_block, rb) : NULL; + return node ? rb_entry(node, struct gpu_buddy_block, rb) : NULL; } -static struct drm_buddy_block * +static struct gpu_buddy_block * rbtree_last_free_block(struct rb_root *root) { return rbtree_get_free_block(rb_last(root)); @@ -77,33 +67,33 @@ static bool rbtree_is_empty(struct rb_root *root) return RB_EMPTY_ROOT(root); } -static bool drm_buddy_block_offset_less(const struct drm_buddy_block *block, - const struct drm_buddy_block *node) +static bool gpu_buddy_block_offset_less(const struct gpu_buddy_block *block, + const struct gpu_buddy_block *node) { - return drm_buddy_block_offset(block) < drm_buddy_block_offset(node); + return gpu_buddy_block_offset(block) < gpu_buddy_block_offset(node); } static bool rbtree_block_offset_less(struct rb_node *block, const struct rb_node *node) { - return drm_buddy_block_offset_less(rbtree_get_free_block(block), + return gpu_buddy_block_offset_less(rbtree_get_free_block(block), rbtree_get_free_block(node)); } -static void rbtree_insert(struct drm_buddy *mm, - struct drm_buddy_block *block, - enum drm_buddy_free_tree tree) +static void rbtree_insert(struct gpu_buddy *mm, + struct gpu_buddy_block *block, + enum gpu_buddy_free_tree tree) { rb_add(&block->rb, - &mm->free_trees[tree][drm_buddy_block_order(block)], + &mm->free_trees[tree][gpu_buddy_block_order(block)], rbtree_block_offset_less); } -static void rbtree_remove(struct drm_buddy *mm, - struct drm_buddy_block *block) +static void rbtree_remove(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { - unsigned int order = drm_buddy_block_order(block); - enum drm_buddy_free_tree tree; + unsigned int order = gpu_buddy_block_order(block); + enum gpu_buddy_free_tree tree; struct rb_root *root; tree = get_block_tree(block); @@ -113,42 +103,42 @@ static void rbtree_remove(struct drm_buddy *mm, RB_CLEAR_NODE(&block->rb); } -static void clear_reset(struct drm_buddy_block *block) +static void clear_reset(struct gpu_buddy_block *block) { - block->header &= ~DRM_BUDDY_HEADER_CLEAR; + block->header &= ~GPU_BUDDY_HEADER_CLEAR; } -static void mark_cleared(struct drm_buddy_block *block) +static void mark_cleared(struct gpu_buddy_block *block) { - block->header |= DRM_BUDDY_HEADER_CLEAR; + block->header |= GPU_BUDDY_HEADER_CLEAR; } -static void mark_allocated(struct drm_buddy *mm, - struct drm_buddy_block *block) +static void mark_allocated(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { - block->header &= ~DRM_BUDDY_HEADER_STATE; - block->header |= DRM_BUDDY_ALLOCATED; + block->header &= ~GPU_BUDDY_HEADER_STATE; + block->header |= GPU_BUDDY_ALLOCATED; rbtree_remove(mm, block); } -static void mark_free(struct drm_buddy *mm, - struct drm_buddy_block *block) +static void mark_free(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { - enum drm_buddy_free_tree tree; + enum gpu_buddy_free_tree tree; - block->header &= ~DRM_BUDDY_HEADER_STATE; - block->header |= DRM_BUDDY_FREE; + block->header &= ~GPU_BUDDY_HEADER_STATE; + block->header |= GPU_BUDDY_FREE; tree = get_block_tree(block); rbtree_insert(mm, block, tree); } -static void mark_split(struct drm_buddy *mm, - struct drm_buddy_block *block) +static void mark_split(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { - block->header &= ~DRM_BUDDY_HEADER_STATE; - block->header |= DRM_BUDDY_SPLIT; + block->header &= ~GPU_BUDDY_HEADER_STATE; + block->header |= GPU_BUDDY_SPLIT; rbtree_remove(mm, block); } @@ -163,10 +153,10 @@ static inline bool contains(u64 s1, u64 e1, u64 s2, u64 e2) return s1 <= s2 && e1 >= e2; } -static struct drm_buddy_block * -__get_buddy(struct drm_buddy_block *block) +static struct gpu_buddy_block * +__get_buddy(struct gpu_buddy_block *block) { - struct drm_buddy_block *parent; + struct gpu_buddy_block *parent; parent = block->parent; if (!parent) @@ -178,19 +168,19 @@ __get_buddy(struct drm_buddy_block *block) return parent->left; } -static unsigned int __drm_buddy_free(struct drm_buddy *mm, - struct drm_buddy_block *block, +static unsigned int __gpu_buddy_free(struct gpu_buddy *mm, + struct gpu_buddy_block *block, bool force_merge) { - struct drm_buddy_block *parent; + struct gpu_buddy_block *parent; unsigned int order; while ((parent = block->parent)) { - struct drm_buddy_block *buddy; + struct gpu_buddy_block *buddy; buddy = __get_buddy(block); - if (!drm_buddy_block_is_free(buddy)) + if (!gpu_buddy_block_is_free(buddy)) break; if (!force_merge) { @@ -198,31 +188,31 @@ static unsigned int __drm_buddy_free(struct drm_buddy *mm, * Check the block and its buddy clear state and exit * the loop if they both have the dissimilar state. */ - if (drm_buddy_block_is_clear(block) != - drm_buddy_block_is_clear(buddy)) + if (gpu_buddy_block_is_clear(block) != + gpu_buddy_block_is_clear(buddy)) break; - if (drm_buddy_block_is_clear(block)) + if (gpu_buddy_block_is_clear(block)) mark_cleared(parent); } rbtree_remove(mm, buddy); - if (force_merge && drm_buddy_block_is_clear(buddy)) - mm->clear_avail -= drm_buddy_block_size(mm, buddy); + if (force_merge && gpu_buddy_block_is_clear(buddy)) + mm->clear_avail -= gpu_buddy_block_size(mm, buddy); - drm_block_free(mm, block); - drm_block_free(mm, buddy); + gpu_block_free(mm, block); + gpu_block_free(mm, buddy); block = parent; } - order = drm_buddy_block_order(block); + order = gpu_buddy_block_order(block); mark_free(mm, block); return order; } -static int __force_merge(struct drm_buddy *mm, +static int __force_merge(struct gpu_buddy *mm, u64 start, u64 end, unsigned int min_order) @@ -241,7 +231,7 @@ static int __force_merge(struct drm_buddy *mm, struct rb_node *iter = rb_last(&mm->free_trees[tree][i]); while (iter) { - struct drm_buddy_block *block, *buddy; + struct gpu_buddy_block *block, *buddy; u64 block_start, block_end; block = rbtree_get_free_block(iter); @@ -250,18 +240,18 @@ static int __force_merge(struct drm_buddy *mm, if (!block || !block->parent) continue; - block_start = drm_buddy_block_offset(block); - block_end = block_start + drm_buddy_block_size(mm, block) - 1; + block_start = gpu_buddy_block_offset(block); + block_end = block_start + gpu_buddy_block_size(mm, block) - 1; if (!contains(start, end, block_start, block_end)) continue; buddy = __get_buddy(block); - if (!drm_buddy_block_is_free(buddy)) + if (!gpu_buddy_block_is_free(buddy)) continue; - WARN_ON(drm_buddy_block_is_clear(block) == - drm_buddy_block_is_clear(buddy)); + WARN_ON(gpu_buddy_block_is_clear(block) == + gpu_buddy_block_is_clear(buddy)); /* * Advance to the next node when the current node is the buddy, @@ -271,10 +261,10 @@ static int __force_merge(struct drm_buddy *mm, iter = rb_prev(iter); rbtree_remove(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail -= drm_buddy_block_size(mm, block); + if (gpu_buddy_block_is_clear(block)) + mm->clear_avail -= gpu_buddy_block_size(mm, block); - order = __drm_buddy_free(mm, block, true); + order = __gpu_buddy_free(mm, block, true); if (order >= min_order) return 0; } @@ -285,9 +275,9 @@ static int __force_merge(struct drm_buddy *mm, } /** - * drm_buddy_init - init memory manager + * gpu_buddy_init - init memory manager * - * @mm: DRM buddy manager to initialize + * @mm: GPU buddy manager to initialize * @size: size in bytes to manage * @chunk_size: minimum page size in bytes for our allocations * @@ -296,7 +286,7 @@ static int __force_merge(struct drm_buddy *mm, * Returns: * 0 on success, error code on failure. */ -int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size) +int gpu_buddy_init(struct gpu_buddy *mm, u64 size, u64 chunk_size) { unsigned int i, j, root_count = 0; u64 offset = 0; @@ -318,9 +308,9 @@ int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size) mm->chunk_size = chunk_size; mm->max_order = ilog2(size) - ilog2(chunk_size); - BUG_ON(mm->max_order > DRM_BUDDY_MAX_ORDER); + BUG_ON(mm->max_order > GPU_BUDDY_MAX_ORDER); - mm->free_trees = kmalloc_array(DRM_BUDDY_MAX_FREE_TREES, + mm->free_trees = kmalloc_array(GPU_BUDDY_MAX_FREE_TREES, sizeof(*mm->free_trees), GFP_KERNEL); if (!mm->free_trees) @@ -340,7 +330,7 @@ int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size) mm->n_roots = hweight64(size); mm->roots = kmalloc_array(mm->n_roots, - sizeof(struct drm_buddy_block *), + sizeof(struct gpu_buddy_block *), GFP_KERNEL); if (!mm->roots) goto out_free_tree; @@ -350,21 +340,21 @@ int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size) * not itself a power-of-two. */ do { - struct drm_buddy_block *root; + struct gpu_buddy_block *root; unsigned int order; u64 root_size; order = ilog2(size) - ilog2(chunk_size); root_size = chunk_size << order; - root = drm_block_alloc(mm, NULL, order, offset); + root = gpu_block_alloc(mm, NULL, order, offset); if (!root) goto out_free_roots; mark_free(mm, root); BUG_ON(root_count > mm->max_order); - BUG_ON(drm_buddy_block_size(mm, root) < chunk_size); + BUG_ON(gpu_buddy_block_size(mm, root) < chunk_size); mm->roots[root_count] = root; @@ -377,7 +367,7 @@ int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size) out_free_roots: while (root_count--) - drm_block_free(mm, mm->roots[root_count]); + gpu_block_free(mm, mm->roots[root_count]); kfree(mm->roots); out_free_tree: while (i--) @@ -385,16 +375,16 @@ out_free_tree: kfree(mm->free_trees); return -ENOMEM; } -EXPORT_SYMBOL(drm_buddy_init); +EXPORT_SYMBOL(gpu_buddy_init); /** - * drm_buddy_fini - tear down the memory manager + * gpu_buddy_fini - tear down the memory manager * - * @mm: DRM buddy manager to free + * @mm: GPU buddy manager to free * * Cleanup memory manager resources and the freetree */ -void drm_buddy_fini(struct drm_buddy *mm) +void gpu_buddy_fini(struct gpu_buddy *mm) { u64 root_size, size, start; unsigned int order; @@ -404,13 +394,13 @@ void drm_buddy_fini(struct drm_buddy *mm) for (i = 0; i < mm->n_roots; ++i) { order = ilog2(size) - ilog2(mm->chunk_size); - start = drm_buddy_block_offset(mm->roots[i]); + start = gpu_buddy_block_offset(mm->roots[i]); __force_merge(mm, start, start + size, order); - if (WARN_ON(!drm_buddy_block_is_free(mm->roots[i]))) + if (WARN_ON(!gpu_buddy_block_is_free(mm->roots[i]))) kunit_fail_current_test("buddy_fini() root"); - drm_block_free(mm, mm->roots[i]); + gpu_block_free(mm, mm->roots[i]); root_size = mm->chunk_size << order; size -= root_size; @@ -423,31 +413,31 @@ void drm_buddy_fini(struct drm_buddy *mm) kfree(mm->free_trees); kfree(mm->roots); } -EXPORT_SYMBOL(drm_buddy_fini); +EXPORT_SYMBOL(gpu_buddy_fini); -static int split_block(struct drm_buddy *mm, - struct drm_buddy_block *block) +static int split_block(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { - unsigned int block_order = drm_buddy_block_order(block) - 1; - u64 offset = drm_buddy_block_offset(block); + unsigned int block_order = gpu_buddy_block_order(block) - 1; + u64 offset = gpu_buddy_block_offset(block); - BUG_ON(!drm_buddy_block_is_free(block)); - BUG_ON(!drm_buddy_block_order(block)); + BUG_ON(!gpu_buddy_block_is_free(block)); + BUG_ON(!gpu_buddy_block_order(block)); - block->left = drm_block_alloc(mm, block, block_order, offset); + block->left = gpu_block_alloc(mm, block, block_order, offset); if (!block->left) return -ENOMEM; - block->right = drm_block_alloc(mm, block, block_order, + block->right = gpu_block_alloc(mm, block, block_order, offset + (mm->chunk_size << block_order)); if (!block->right) { - drm_block_free(mm, block->left); + gpu_block_free(mm, block->left); return -ENOMEM; } mark_split(mm, block); - if (drm_buddy_block_is_clear(block)) { + if (gpu_buddy_block_is_clear(block)) { mark_cleared(block->left); mark_cleared(block->right); clear_reset(block); @@ -460,34 +450,34 @@ static int split_block(struct drm_buddy *mm, } /** - * drm_get_buddy - get buddy address + * gpu_get_buddy - get buddy address * - * @block: DRM buddy block + * @block: GPU buddy block * * Returns the corresponding buddy block for @block, or NULL * if this is a root block and can't be merged further. * Requires some kind of locking to protect against * any concurrent allocate and free operations. */ -struct drm_buddy_block * -drm_get_buddy(struct drm_buddy_block *block) +struct gpu_buddy_block * +gpu_get_buddy(struct gpu_buddy_block *block) { return __get_buddy(block); } -EXPORT_SYMBOL(drm_get_buddy); +EXPORT_SYMBOL(gpu_get_buddy); /** - * drm_buddy_reset_clear - reset blocks clear state + * gpu_buddy_reset_clear - reset blocks clear state * - * @mm: DRM buddy manager + * @mm: GPU buddy manager * @is_clear: blocks clear state * * Reset the clear state based on @is_clear value for each block * in the freetree. */ -void drm_buddy_reset_clear(struct drm_buddy *mm, bool is_clear) +void gpu_buddy_reset_clear(struct gpu_buddy *mm, bool is_clear) { - enum drm_buddy_free_tree src_tree, dst_tree; + enum gpu_buddy_free_tree src_tree, dst_tree; u64 root_size, size, start; unsigned int order; int i; @@ -495,60 +485,60 @@ void drm_buddy_reset_clear(struct drm_buddy *mm, bool is_clear) size = mm->size; for (i = 0; i < mm->n_roots; ++i) { order = ilog2(size) - ilog2(mm->chunk_size); - start = drm_buddy_block_offset(mm->roots[i]); + start = gpu_buddy_block_offset(mm->roots[i]); __force_merge(mm, start, start + size, order); root_size = mm->chunk_size << order; size -= root_size; } - src_tree = is_clear ? DRM_BUDDY_DIRTY_TREE : DRM_BUDDY_CLEAR_TREE; - dst_tree = is_clear ? DRM_BUDDY_CLEAR_TREE : DRM_BUDDY_DIRTY_TREE; + src_tree = is_clear ? GPU_BUDDY_DIRTY_TREE : GPU_BUDDY_CLEAR_TREE; + dst_tree = is_clear ? GPU_BUDDY_CLEAR_TREE : GPU_BUDDY_DIRTY_TREE; for (i = 0; i <= mm->max_order; ++i) { struct rb_root *root = &mm->free_trees[src_tree][i]; - struct drm_buddy_block *block, *tmp; + struct gpu_buddy_block *block, *tmp; rbtree_postorder_for_each_entry_safe(block, tmp, root, rb) { rbtree_remove(mm, block); if (is_clear) { mark_cleared(block); - mm->clear_avail += drm_buddy_block_size(mm, block); + mm->clear_avail += gpu_buddy_block_size(mm, block); } else { clear_reset(block); - mm->clear_avail -= drm_buddy_block_size(mm, block); + mm->clear_avail -= gpu_buddy_block_size(mm, block); } rbtree_insert(mm, block, dst_tree); } } } -EXPORT_SYMBOL(drm_buddy_reset_clear); +EXPORT_SYMBOL(gpu_buddy_reset_clear); /** - * drm_buddy_free_block - free a block + * gpu_buddy_free_block - free a block * - * @mm: DRM buddy manager + * @mm: GPU buddy manager * @block: block to be freed */ -void drm_buddy_free_block(struct drm_buddy *mm, - struct drm_buddy_block *block) +void gpu_buddy_free_block(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { - BUG_ON(!drm_buddy_block_is_allocated(block)); - mm->avail += drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail += drm_buddy_block_size(mm, block); + BUG_ON(!gpu_buddy_block_is_allocated(block)); + mm->avail += gpu_buddy_block_size(mm, block); + if (gpu_buddy_block_is_clear(block)) + mm->clear_avail += gpu_buddy_block_size(mm, block); - __drm_buddy_free(mm, block, false); + __gpu_buddy_free(mm, block, false); } -EXPORT_SYMBOL(drm_buddy_free_block); +EXPORT_SYMBOL(gpu_buddy_free_block); -static void __drm_buddy_free_list(struct drm_buddy *mm, +static void __gpu_buddy_free_list(struct gpu_buddy *mm, struct list_head *objects, bool mark_clear, bool mark_dirty) { - struct drm_buddy_block *block, *on; + struct gpu_buddy_block *block, *on; WARN_ON(mark_dirty && mark_clear); @@ -557,13 +547,13 @@ static void __drm_buddy_free_list(struct drm_buddy *mm, mark_cleared(block); else if (mark_dirty) clear_reset(block); - drm_buddy_free_block(mm, block); + gpu_buddy_free_block(mm, block); cond_resched(); } INIT_LIST_HEAD(objects); } -static void drm_buddy_free_list_internal(struct drm_buddy *mm, +static void gpu_buddy_free_list_internal(struct gpu_buddy *mm, struct list_head *objects) { /* @@ -571,43 +561,43 @@ static void drm_buddy_free_list_internal(struct drm_buddy *mm, * at this point. For example we might have just failed part of the * allocation. */ - __drm_buddy_free_list(mm, objects, false, false); + __gpu_buddy_free_list(mm, objects, false, false); } /** - * drm_buddy_free_list - free blocks + * gpu_buddy_free_list - free blocks * - * @mm: DRM buddy manager + * @mm: GPU buddy manager * @objects: input list head to free blocks - * @flags: optional flags like DRM_BUDDY_CLEARED + * @flags: optional flags like GPU_BUDDY_CLEARED */ -void drm_buddy_free_list(struct drm_buddy *mm, +void gpu_buddy_free_list(struct gpu_buddy *mm, struct list_head *objects, unsigned int flags) { - bool mark_clear = flags & DRM_BUDDY_CLEARED; + bool mark_clear = flags & GPU_BUDDY_CLEARED; - __drm_buddy_free_list(mm, objects, mark_clear, !mark_clear); + __gpu_buddy_free_list(mm, objects, mark_clear, !mark_clear); } -EXPORT_SYMBOL(drm_buddy_free_list); +EXPORT_SYMBOL(gpu_buddy_free_list); -static bool block_incompatible(struct drm_buddy_block *block, unsigned int flags) +static bool block_incompatible(struct gpu_buddy_block *block, unsigned int flags) { - bool needs_clear = flags & DRM_BUDDY_CLEAR_ALLOCATION; + bool needs_clear = flags & GPU_BUDDY_CLEAR_ALLOCATION; - return needs_clear != drm_buddy_block_is_clear(block); + return needs_clear != gpu_buddy_block_is_clear(block); } -static struct drm_buddy_block * -__alloc_range_bias(struct drm_buddy *mm, +static struct gpu_buddy_block * +__alloc_range_bias(struct gpu_buddy *mm, u64 start, u64 end, unsigned int order, unsigned long flags, bool fallback) { u64 req_size = mm->chunk_size << order; - struct drm_buddy_block *block; - struct drm_buddy_block *buddy; + struct gpu_buddy_block *block; + struct gpu_buddy_block *buddy; LIST_HEAD(dfs); int err; int i; @@ -622,23 +612,23 @@ __alloc_range_bias(struct drm_buddy *mm, u64 block_end; block = list_first_entry_or_null(&dfs, - struct drm_buddy_block, + struct gpu_buddy_block, tmp_link); if (!block) break; list_del(&block->tmp_link); - if (drm_buddy_block_order(block) < order) + if (gpu_buddy_block_order(block) < order) continue; - block_start = drm_buddy_block_offset(block); - block_end = block_start + drm_buddy_block_size(mm, block) - 1; + block_start = gpu_buddy_block_offset(block); + block_end = block_start + gpu_buddy_block_size(mm, block) - 1; if (!overlaps(start, end, block_start, block_end)) continue; - if (drm_buddy_block_is_allocated(block)) + if (gpu_buddy_block_is_allocated(block)) continue; if (block_start < start || block_end > end) { @@ -654,17 +644,17 @@ __alloc_range_bias(struct drm_buddy *mm, continue; if (contains(start, end, block_start, block_end) && - order == drm_buddy_block_order(block)) { + order == gpu_buddy_block_order(block)) { /* * Find the free block within the range. */ - if (drm_buddy_block_is_free(block)) + if (gpu_buddy_block_is_free(block)) return block; continue; } - if (!drm_buddy_block_is_split(block)) { + if (!gpu_buddy_block_is_split(block)) { err = split_block(mm, block); if (unlikely(err)) goto err_undo; @@ -684,19 +674,19 @@ err_undo: */ buddy = __get_buddy(block); if (buddy && - (drm_buddy_block_is_free(block) && - drm_buddy_block_is_free(buddy))) - __drm_buddy_free(mm, block, false); + (gpu_buddy_block_is_free(block) && + gpu_buddy_block_is_free(buddy))) + __gpu_buddy_free(mm, block, false); return ERR_PTR(err); } -static struct drm_buddy_block * -__drm_buddy_alloc_range_bias(struct drm_buddy *mm, +static struct gpu_buddy_block * +__gpu_buddy_alloc_range_bias(struct gpu_buddy *mm, u64 start, u64 end, unsigned int order, unsigned long flags) { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; bool fallback = false; block = __alloc_range_bias(mm, start, end, order, @@ -708,12 +698,12 @@ __drm_buddy_alloc_range_bias(struct drm_buddy *mm, return block; } -static struct drm_buddy_block * -get_maxblock(struct drm_buddy *mm, +static struct gpu_buddy_block * +get_maxblock(struct gpu_buddy *mm, unsigned int order, - enum drm_buddy_free_tree tree) + enum gpu_buddy_free_tree tree) { - struct drm_buddy_block *max_block = NULL, *block = NULL; + struct gpu_buddy_block *max_block = NULL, *block = NULL; struct rb_root *root; unsigned int i; @@ -728,8 +718,8 @@ get_maxblock(struct drm_buddy *mm, continue; } - if (drm_buddy_block_offset(block) > - drm_buddy_block_offset(max_block)) { + if (gpu_buddy_block_offset(block) > + gpu_buddy_block_offset(max_block)) { max_block = block; } } @@ -737,25 +727,25 @@ get_maxblock(struct drm_buddy *mm, return max_block; } -static struct drm_buddy_block * -alloc_from_freetree(struct drm_buddy *mm, +static struct gpu_buddy_block * +alloc_from_freetree(struct gpu_buddy *mm, unsigned int order, unsigned long flags) { - struct drm_buddy_block *block = NULL; + struct gpu_buddy_block *block = NULL; struct rb_root *root; - enum drm_buddy_free_tree tree; + enum gpu_buddy_free_tree tree; unsigned int tmp; int err; - tree = (flags & DRM_BUDDY_CLEAR_ALLOCATION) ? - DRM_BUDDY_CLEAR_TREE : DRM_BUDDY_DIRTY_TREE; + tree = (flags & GPU_BUDDY_CLEAR_ALLOCATION) ? + GPU_BUDDY_CLEAR_TREE : GPU_BUDDY_DIRTY_TREE; - if (flags & DRM_BUDDY_TOPDOWN_ALLOCATION) { + if (flags & GPU_BUDDY_TOPDOWN_ALLOCATION) { block = get_maxblock(mm, order, tree); if (block) /* Store the obtained block order */ - tmp = drm_buddy_block_order(block); + tmp = gpu_buddy_block_order(block); } else { for (tmp = order; tmp <= mm->max_order; ++tmp) { /* Get RB tree root for this order and tree */ @@ -768,8 +758,8 @@ alloc_from_freetree(struct drm_buddy *mm, if (!block) { /* Try allocating from the other tree */ - tree = (tree == DRM_BUDDY_CLEAR_TREE) ? - DRM_BUDDY_DIRTY_TREE : DRM_BUDDY_CLEAR_TREE; + tree = (tree == GPU_BUDDY_CLEAR_TREE) ? + GPU_BUDDY_DIRTY_TREE : GPU_BUDDY_CLEAR_TREE; for (tmp = order; tmp <= mm->max_order; ++tmp) { root = &mm->free_trees[tree][tmp]; @@ -782,7 +772,7 @@ alloc_from_freetree(struct drm_buddy *mm, return ERR_PTR(-ENOSPC); } - BUG_ON(!drm_buddy_block_is_free(block)); + BUG_ON(!gpu_buddy_block_is_free(block)); while (tmp != order) { err = split_block(mm, block); @@ -796,18 +786,18 @@ alloc_from_freetree(struct drm_buddy *mm, err_undo: if (tmp != order) - __drm_buddy_free(mm, block, false); + __gpu_buddy_free(mm, block, false); return ERR_PTR(err); } -static int __alloc_range(struct drm_buddy *mm, +static int __alloc_range(struct gpu_buddy *mm, struct list_head *dfs, u64 start, u64 size, struct list_head *blocks, u64 *total_allocated_on_err) { - struct drm_buddy_block *block; - struct drm_buddy_block *buddy; + struct gpu_buddy_block *block; + struct gpu_buddy_block *buddy; u64 total_allocated = 0; LIST_HEAD(allocated); u64 end; @@ -820,31 +810,31 @@ static int __alloc_range(struct drm_buddy *mm, u64 block_end; block = list_first_entry_or_null(dfs, - struct drm_buddy_block, + struct gpu_buddy_block, tmp_link); if (!block) break; list_del(&block->tmp_link); - block_start = drm_buddy_block_offset(block); - block_end = block_start + drm_buddy_block_size(mm, block) - 1; + block_start = gpu_buddy_block_offset(block); + block_end = block_start + gpu_buddy_block_size(mm, block) - 1; if (!overlaps(start, end, block_start, block_end)) continue; - if (drm_buddy_block_is_allocated(block)) { + if (gpu_buddy_block_is_allocated(block)) { err = -ENOSPC; goto err_free; } if (contains(start, end, block_start, block_end)) { - if (drm_buddy_block_is_free(block)) { + if (gpu_buddy_block_is_free(block)) { mark_allocated(mm, block); - total_allocated += drm_buddy_block_size(mm, block); - mm->avail -= drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail -= drm_buddy_block_size(mm, block); + total_allocated += gpu_buddy_block_size(mm, block); + mm->avail -= gpu_buddy_block_size(mm, block); + if (gpu_buddy_block_is_clear(block)) + mm->clear_avail -= gpu_buddy_block_size(mm, block); list_add_tail(&block->link, &allocated); continue; } else if (!mm->clear_avail) { @@ -853,7 +843,7 @@ static int __alloc_range(struct drm_buddy *mm, } } - if (!drm_buddy_block_is_split(block)) { + if (!gpu_buddy_block_is_split(block)) { err = split_block(mm, block); if (unlikely(err)) goto err_undo; @@ -880,22 +870,22 @@ err_undo: */ buddy = __get_buddy(block); if (buddy && - (drm_buddy_block_is_free(block) && - drm_buddy_block_is_free(buddy))) - __drm_buddy_free(mm, block, false); + (gpu_buddy_block_is_free(block) && + gpu_buddy_block_is_free(buddy))) + __gpu_buddy_free(mm, block, false); err_free: if (err == -ENOSPC && total_allocated_on_err) { list_splice_tail(&allocated, blocks); *total_allocated_on_err = total_allocated; } else { - drm_buddy_free_list_internal(mm, &allocated); + gpu_buddy_free_list_internal(mm, &allocated); } return err; } -static int __drm_buddy_alloc_range(struct drm_buddy *mm, +static int __gpu_buddy_alloc_range(struct gpu_buddy *mm, u64 start, u64 size, u64 *total_allocated_on_err, @@ -911,13 +901,13 @@ static int __drm_buddy_alloc_range(struct drm_buddy *mm, blocks, total_allocated_on_err); } -static int __alloc_contig_try_harder(struct drm_buddy *mm, +static int __alloc_contig_try_harder(struct gpu_buddy *mm, u64 size, u64 min_block_size, struct list_head *blocks) { u64 rhs_offset, lhs_offset, lhs_size, filled; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; unsigned int tree, order; LIST_HEAD(blocks_lhs); unsigned long pages; @@ -943,8 +933,8 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, block = rbtree_get_free_block(iter); /* Allocate blocks traversing RHS */ - rhs_offset = drm_buddy_block_offset(block); - err = __drm_buddy_alloc_range(mm, rhs_offset, size, + rhs_offset = gpu_buddy_block_offset(block); + err = __gpu_buddy_alloc_range(mm, rhs_offset, size, &filled, blocks); if (!err || err != -ENOSPC) return err; @@ -954,18 +944,18 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, lhs_size = round_up(lhs_size, min_block_size); /* Allocate blocks traversing LHS */ - lhs_offset = drm_buddy_block_offset(block) - lhs_size; - err = __drm_buddy_alloc_range(mm, lhs_offset, lhs_size, + lhs_offset = gpu_buddy_block_offset(block) - lhs_size; + err = __gpu_buddy_alloc_range(mm, lhs_offset, lhs_size, NULL, &blocks_lhs); if (!err) { list_splice(&blocks_lhs, blocks); return 0; } else if (err != -ENOSPC) { - drm_buddy_free_list_internal(mm, blocks); + gpu_buddy_free_list_internal(mm, blocks); return err; } /* Free blocks for the next iteration */ - drm_buddy_free_list_internal(mm, blocks); + gpu_buddy_free_list_internal(mm, blocks); iter = rb_prev(iter); } @@ -975,9 +965,9 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, } /** - * drm_buddy_block_trim - free unused pages + * gpu_buddy_block_trim - free unused pages * - * @mm: DRM buddy manager + * @mm: GPU buddy manager * @start: start address to begin the trimming. * @new_size: original size requested * @blocks: Input and output list of allocated blocks. @@ -993,13 +983,13 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm, * Returns: * 0 on success, error code on failure. */ -int drm_buddy_block_trim(struct drm_buddy *mm, +int gpu_buddy_block_trim(struct gpu_buddy *mm, u64 *start, u64 new_size, struct list_head *blocks) { - struct drm_buddy_block *parent; - struct drm_buddy_block *block; + struct gpu_buddy_block *parent; + struct gpu_buddy_block *block; u64 block_start, block_end; LIST_HEAD(dfs); u64 new_start; @@ -1009,22 +999,22 @@ int drm_buddy_block_trim(struct drm_buddy *mm, return -EINVAL; block = list_first_entry(blocks, - struct drm_buddy_block, + struct gpu_buddy_block, link); - block_start = drm_buddy_block_offset(block); - block_end = block_start + drm_buddy_block_size(mm, block); + block_start = gpu_buddy_block_offset(block); + block_end = block_start + gpu_buddy_block_size(mm, block); - if (WARN_ON(!drm_buddy_block_is_allocated(block))) + if (WARN_ON(!gpu_buddy_block_is_allocated(block))) return -EINVAL; - if (new_size > drm_buddy_block_size(mm, block)) + if (new_size > gpu_buddy_block_size(mm, block)) return -EINVAL; if (!new_size || !IS_ALIGNED(new_size, mm->chunk_size)) return -EINVAL; - if (new_size == drm_buddy_block_size(mm, block)) + if (new_size == gpu_buddy_block_size(mm, block)) return 0; new_start = block_start; @@ -1043,9 +1033,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm, list_del(&block->link); mark_free(mm, block); - mm->avail += drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail += drm_buddy_block_size(mm, block); + mm->avail += gpu_buddy_block_size(mm, block); + if (gpu_buddy_block_is_clear(block)) + mm->clear_avail += gpu_buddy_block_size(mm, block); /* Prevent recursively freeing this node */ parent = block->parent; @@ -1055,26 +1045,26 @@ int drm_buddy_block_trim(struct drm_buddy *mm, err = __alloc_range(mm, &dfs, new_start, new_size, blocks, NULL); if (err) { mark_allocated(mm, block); - mm->avail -= drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail -= drm_buddy_block_size(mm, block); + mm->avail -= gpu_buddy_block_size(mm, block); + if (gpu_buddy_block_is_clear(block)) + mm->clear_avail -= gpu_buddy_block_size(mm, block); list_add(&block->link, blocks); } block->parent = parent; return err; } -EXPORT_SYMBOL(drm_buddy_block_trim); +EXPORT_SYMBOL(gpu_buddy_block_trim); -static struct drm_buddy_block * -__drm_buddy_alloc_blocks(struct drm_buddy *mm, +static struct gpu_buddy_block * +__gpu_buddy_alloc_blocks(struct gpu_buddy *mm, u64 start, u64 end, unsigned int order, unsigned long flags) { - if (flags & DRM_BUDDY_RANGE_ALLOCATION) + if (flags & GPU_BUDDY_RANGE_ALLOCATION) /* Allocate traversing within the range */ - return __drm_buddy_alloc_range_bias(mm, start, end, + return __gpu_buddy_alloc_range_bias(mm, start, end, order, flags); else /* Allocate from freetree */ @@ -1082,15 +1072,15 @@ __drm_buddy_alloc_blocks(struct drm_buddy *mm, } /** - * drm_buddy_alloc_blocks - allocate power-of-two blocks + * gpu_buddy_alloc_blocks - allocate power-of-two blocks * - * @mm: DRM buddy manager to allocate from + * @mm: GPU buddy manager to allocate from * @start: start of the allowed range for this block * @end: end of the allowed range for this block * @size: size of the allocation in bytes * @min_block_size: alignment of the allocation * @blocks: output list head to add allocated blocks - * @flags: DRM_BUDDY_*_ALLOCATION flags + * @flags: GPU_BUDDY_*_ALLOCATION flags * * alloc_range_bias() called on range limitations, which traverses * the tree and returns the desired block. @@ -1101,13 +1091,13 @@ __drm_buddy_alloc_blocks(struct drm_buddy *mm, * Returns: * 0 on success, error code on failure. */ -int drm_buddy_alloc_blocks(struct drm_buddy *mm, +int gpu_buddy_alloc_blocks(struct gpu_buddy *mm, u64 start, u64 end, u64 size, u64 min_block_size, struct list_head *blocks, unsigned long flags) { - struct drm_buddy_block *block = NULL; + struct gpu_buddy_block *block = NULL; u64 original_size, original_min_size; unsigned int min_order, order; LIST_HEAD(allocated); @@ -1137,14 +1127,14 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, if (!IS_ALIGNED(start | end, min_block_size)) return -EINVAL; - return __drm_buddy_alloc_range(mm, start, size, NULL, blocks); + return __gpu_buddy_alloc_range(mm, start, size, NULL, blocks); } original_size = size; original_min_size = min_block_size; /* Roundup the size to power of 2 */ - if (flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION) { + if (flags & GPU_BUDDY_CONTIGUOUS_ALLOCATION) { size = roundup_pow_of_two(size); min_block_size = size; /* Align size value to min_block_size */ @@ -1157,8 +1147,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, min_order = ilog2(min_block_size) - ilog2(mm->chunk_size); if (order > mm->max_order || size > mm->size) { - if ((flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION) && - !(flags & DRM_BUDDY_RANGE_ALLOCATION)) + if ((flags & GPU_BUDDY_CONTIGUOUS_ALLOCATION) && + !(flags & GPU_BUDDY_RANGE_ALLOCATION)) return __alloc_contig_try_harder(mm, original_size, original_min_size, blocks); @@ -1171,7 +1161,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, BUG_ON(order < min_order); do { - block = __drm_buddy_alloc_blocks(mm, start, + block = __gpu_buddy_alloc_blocks(mm, start, end, order, flags); @@ -1182,7 +1172,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, /* Try allocation through force merge method */ if (mm->clear_avail && !__force_merge(mm, start, end, min_order)) { - block = __drm_buddy_alloc_blocks(mm, start, + block = __gpu_buddy_alloc_blocks(mm, start, end, min_order, flags); @@ -1196,8 +1186,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, * Try contiguous block allocation through * try harder method. */ - if (flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION && - !(flags & DRM_BUDDY_RANGE_ALLOCATION)) + if (flags & GPU_BUDDY_CONTIGUOUS_ALLOCATION && + !(flags & GPU_BUDDY_RANGE_ALLOCATION)) return __alloc_contig_try_harder(mm, original_size, original_min_size, @@ -1208,9 +1198,9 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } while (1); mark_allocated(mm, block); - mm->avail -= drm_buddy_block_size(mm, block); - if (drm_buddy_block_is_clear(block)) - mm->clear_avail -= drm_buddy_block_size(mm, block); + mm->avail -= gpu_buddy_block_size(mm, block); + if (gpu_buddy_block_is_clear(block)) + mm->clear_avail -= gpu_buddy_block_size(mm, block); kmemleak_update_trace(block); list_add_tail(&block->link, &allocated); @@ -1221,7 +1211,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, } while (1); /* Trim the allocated block to the required size */ - if (!(flags & DRM_BUDDY_TRIM_DISABLE) && + if (!(flags & GPU_BUDDY_TRIM_DISABLE) && original_size != size) { struct list_head *trim_list; LIST_HEAD(temp); @@ -1234,11 +1224,11 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, block = list_last_entry(&allocated, typeof(*block), link); list_move(&block->link, &temp); trim_list = &temp; - trim_size = drm_buddy_block_size(mm, block) - + trim_size = gpu_buddy_block_size(mm, block) - (size - original_size); } - drm_buddy_block_trim(mm, + gpu_buddy_block_trim(mm, NULL, trim_size, trim_list); @@ -1251,44 +1241,42 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm, return 0; err_free: - drm_buddy_free_list_internal(mm, &allocated); + gpu_buddy_free_list_internal(mm, &allocated); return err; } -EXPORT_SYMBOL(drm_buddy_alloc_blocks); +EXPORT_SYMBOL(gpu_buddy_alloc_blocks); /** - * drm_buddy_block_print - print block information + * gpu_buddy_block_print - print block information * - * @mm: DRM buddy manager - * @block: DRM buddy block - * @p: DRM printer to use + * @mm: GPU buddy manager + * @block: GPU buddy block */ -void drm_buddy_block_print(struct drm_buddy *mm, - struct drm_buddy_block *block, - struct drm_printer *p) +void gpu_buddy_block_print(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { - u64 start = drm_buddy_block_offset(block); - u64 size = drm_buddy_block_size(mm, block); + u64 start = gpu_buddy_block_offset(block); + u64 size = gpu_buddy_block_size(mm, block); - drm_printf(p, "%#018llx-%#018llx: %llu\n", start, start + size, size); + pr_info("%#018llx-%#018llx: %llu\n", start, start + size, size); } -EXPORT_SYMBOL(drm_buddy_block_print); +EXPORT_SYMBOL(gpu_buddy_block_print); /** - * drm_buddy_print - print allocator state + * gpu_buddy_print - print allocator state * - * @mm: DRM buddy manager - * @p: DRM printer to use + * @mm: GPU buddy manager + * @p: GPU printer to use */ -void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p) +void gpu_buddy_print(struct gpu_buddy *mm) { int order; - drm_printf(p, "chunk_size: %lluKiB, total: %lluMiB, free: %lluMiB, clear_free: %lluMiB\n", - mm->chunk_size >> 10, mm->size >> 20, mm->avail >> 20, mm->clear_avail >> 20); + pr_info("chunk_size: %lluKiB, total: %lluMiB, free: %lluMiB, clear_free: %lluMiB\n", + mm->chunk_size >> 10, mm->size >> 20, mm->avail >> 20, mm->clear_avail >> 20); for (order = mm->max_order; order >= 0; order--) { - struct drm_buddy_block *block, *tmp; + struct gpu_buddy_block *block, *tmp; struct rb_root *root; u64 count = 0, free; unsigned int tree; @@ -1297,40 +1285,38 @@ void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p) root = &mm->free_trees[tree][order]; rbtree_postorder_for_each_entry_safe(block, tmp, root, rb) { - BUG_ON(!drm_buddy_block_is_free(block)); + BUG_ON(!gpu_buddy_block_is_free(block)); count++; } } - drm_printf(p, "order-%2d ", order); - free = count * (mm->chunk_size << order); if (free < SZ_1M) - drm_printf(p, "free: %8llu KiB", free >> 10); + pr_info("order-%2d free: %8llu KiB, blocks: %llu\n", + order, free >> 10, count); else - drm_printf(p, "free: %8llu MiB", free >> 20); - - drm_printf(p, ", blocks: %llu\n", count); + pr_info("order-%2d free: %8llu MiB, blocks: %llu\n", + order, free >> 20, count); } } -EXPORT_SYMBOL(drm_buddy_print); +EXPORT_SYMBOL(gpu_buddy_print); -static void drm_buddy_module_exit(void) +static void gpu_buddy_module_exit(void) { kmem_cache_destroy(slab_blocks); } -static int __init drm_buddy_module_init(void) +static int __init gpu_buddy_module_init(void) { - slab_blocks = KMEM_CACHE(drm_buddy_block, 0); + slab_blocks = KMEM_CACHE(gpu_buddy_block, 0); if (!slab_blocks) return -ENOMEM; return 0; } -module_init(drm_buddy_module_init); -module_exit(drm_buddy_module_exit); +module_init(gpu_buddy_module_init); +module_exit(gpu_buddy_module_exit); -MODULE_DESCRIPTION("DRM Buddy Allocator"); +MODULE_DESCRIPTION("GPU Buddy Allocator"); MODULE_LICENSE("Dual MIT/GPL"); diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 862ff4000969..758f2eb3d588 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -220,6 +220,7 @@ config DRM_GPUSVM config DRM_BUDDY tristate depends on DRM + select GPU_BUDDY help A page based buddy allocator diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile index 892859cfe95f..d0e37f8c2a46 100644 --- a/drivers/gpu/drm/Makefile +++ b/drivers/gpu/drm/Makefile @@ -114,7 +114,7 @@ drm_gpusvm_helper-$(CONFIG_ZONE_DEVICE) += \ obj-$(CONFIG_DRM_GPUSVM) += drm_gpusvm_helper.o -obj-$(CONFIG_DRM_BUDDY) += ../buddy.o +obj-$(CONFIG_DRM_BUDDY) += drm_buddy.o drm_dma_helper-y := drm_gem_dma_helper.o drm_dma_helper-$(CONFIG_DRM_FBDEV_EMULATION) += drm_fbdev_dma.o diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index f582113d78b7..149f8f942eae 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -5663,7 +5663,7 @@ int amdgpu_ras_add_critical_region(struct amdgpu_device *adev, struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_vram_mgr_resource *vres; struct ras_critical_region *region; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; int ret = 0; if (!bo || !bo->tbo.resource) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h index be2e56ce1355..8908d9e08a30 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h @@ -55,7 +55,7 @@ static inline void amdgpu_res_first(struct ttm_resource *res, uint64_t start, uint64_t size, struct amdgpu_res_cursor *cur) { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; struct list_head *head, *next; struct drm_mm_node *node; @@ -71,7 +71,7 @@ static inline void amdgpu_res_first(struct ttm_resource *res, head = &to_amdgpu_vram_mgr_resource(res)->blocks; block = list_first_entry_or_null(head, - struct drm_buddy_block, + struct gpu_buddy_block, link); if (!block) goto fallback; @@ -81,7 +81,7 @@ static inline void amdgpu_res_first(struct ttm_resource *res, next = block->link.next; if (next != head) - block = list_entry(next, struct drm_buddy_block, link); + block = list_entry(next, struct gpu_buddy_block, link); } cur->start = amdgpu_vram_mgr_block_start(block) + start; @@ -125,7 +125,7 @@ fallback: */ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size) { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; struct drm_mm_node *node; struct list_head *next; @@ -146,7 +146,7 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size) block = cur->node; next = block->link.next; - block = list_entry(next, struct drm_buddy_block, link); + block = list_entry(next, struct gpu_buddy_block, link); cur->node = block; cur->start = amdgpu_vram_mgr_block_start(block); @@ -175,7 +175,7 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size) */ static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur) { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; switch (cur->mem_type) { case TTM_PL_VRAM: diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 9d934c07fa6b..cd94f6efb7cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "amdgpu.h" #include "amdgpu_vm.h" @@ -52,15 +53,15 @@ to_amdgpu_device(struct amdgpu_vram_mgr *mgr) return container_of(mgr, struct amdgpu_device, mman.vram_mgr); } -static inline struct drm_buddy_block * +static inline struct gpu_buddy_block * amdgpu_vram_mgr_first_block(struct list_head *list) { - return list_first_entry_or_null(list, struct drm_buddy_block, link); + return list_first_entry_or_null(list, struct gpu_buddy_block, link); } static inline bool amdgpu_is_vram_mgr_blocks_contiguous(struct list_head *head) { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; u64 start, size; block = amdgpu_vram_mgr_first_block(head); @@ -71,7 +72,7 @@ static inline bool amdgpu_is_vram_mgr_blocks_contiguous(struct list_head *head) start = amdgpu_vram_mgr_block_start(block); size = amdgpu_vram_mgr_block_size(block); - block = list_entry(block->link.next, struct drm_buddy_block, link); + block = list_entry(block->link.next, struct gpu_buddy_block, link); if (start + size != amdgpu_vram_mgr_block_start(block)) return false; } @@ -81,7 +82,7 @@ static inline bool amdgpu_is_vram_mgr_blocks_contiguous(struct list_head *head) static inline u64 amdgpu_vram_mgr_blocks_size(struct list_head *head) { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; u64 size = 0; list_for_each_entry(block, head, link) @@ -254,7 +255,7 @@ const struct attribute_group amdgpu_vram_mgr_attr_group = { * Calculate how many bytes of the DRM BUDDY block are inside visible VRAM */ static u64 amdgpu_vram_mgr_vis_size(struct amdgpu_device *adev, - struct drm_buddy_block *block) + struct gpu_buddy_block *block) { u64 start = amdgpu_vram_mgr_block_start(block); u64 end = start + amdgpu_vram_mgr_block_size(block); @@ -279,7 +280,7 @@ u64 amdgpu_vram_mgr_bo_visible_size(struct amdgpu_bo *bo) struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); struct ttm_resource *res = bo->tbo.resource; struct amdgpu_vram_mgr_resource *vres = to_amdgpu_vram_mgr_resource(res); - struct drm_buddy_block *block; + struct gpu_buddy_block *block; u64 usage = 0; if (amdgpu_gmc_vram_full_visible(&adev->gmc)) @@ -299,15 +300,15 @@ static void amdgpu_vram_mgr_do_reserve(struct ttm_resource_manager *man) { struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); struct amdgpu_device *adev = to_amdgpu_device(mgr); - struct drm_buddy *mm = &mgr->mm; + struct gpu_buddy *mm = &mgr->mm; struct amdgpu_vram_reservation *rsv, *temp; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; uint64_t vis_usage; list_for_each_entry_safe(rsv, temp, &mgr->reservations_pending, blocks) { - if (drm_buddy_alloc_blocks(mm, rsv->start, rsv->start + rsv->size, + if (gpu_buddy_alloc_blocks(mm, rsv->start, rsv->start + rsv->size, rsv->size, mm->chunk_size, &rsv->allocated, - DRM_BUDDY_RANGE_ALLOCATION)) + GPU_BUDDY_RANGE_ALLOCATION)) continue; block = amdgpu_vram_mgr_first_block(&rsv->allocated); @@ -403,7 +404,7 @@ int amdgpu_vram_mgr_query_address_block_info(struct amdgpu_vram_mgr *mgr, uint64_t address, struct amdgpu_vram_block_info *info) { struct amdgpu_vram_mgr_resource *vres; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; u64 start, size; int ret = -ENOENT; @@ -450,8 +451,8 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, struct amdgpu_vram_mgr_resource *vres; u64 size, remaining_size, lpfn, fpfn; unsigned int adjust_dcc_size = 0; - struct drm_buddy *mm = &mgr->mm; - struct drm_buddy_block *block; + struct gpu_buddy *mm = &mgr->mm; + struct gpu_buddy_block *block; unsigned long pages_per_block; int r; @@ -493,17 +494,17 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, INIT_LIST_HEAD(&vres->blocks); if (place->flags & TTM_PL_FLAG_TOPDOWN) - vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION; + vres->flags |= GPU_BUDDY_TOPDOWN_ALLOCATION; if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) - vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION; + vres->flags |= GPU_BUDDY_CONTIGUOUS_ALLOCATION; if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED) - vres->flags |= DRM_BUDDY_CLEAR_ALLOCATION; + vres->flags |= GPU_BUDDY_CLEAR_ALLOCATION; if (fpfn || lpfn != mgr->mm.size) /* Allocate blocks in desired range */ - vres->flags |= DRM_BUDDY_RANGE_ALLOCATION; + vres->flags |= GPU_BUDDY_RANGE_ALLOCATION; if (bo->flags & AMDGPU_GEM_CREATE_GFX12_DCC && adev->gmc.gmc_funcs->get_dcc_alignment) @@ -516,7 +517,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, dcc_size = roundup_pow_of_two(vres->base.size + adjust_dcc_size); remaining_size = (u64)dcc_size; - vres->flags |= DRM_BUDDY_TRIM_DISABLE; + vres->flags |= GPU_BUDDY_TRIM_DISABLE; } mutex_lock(&mgr->lock); @@ -536,7 +537,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, BUG_ON(min_block_size < mm->chunk_size); - r = drm_buddy_alloc_blocks(mm, fpfn, + r = gpu_buddy_alloc_blocks(mm, fpfn, lpfn, size, min_block_size, @@ -545,7 +546,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, if (unlikely(r == -ENOSPC) && pages_per_block == ~0ul && !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) { - vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION; + vres->flags &= ~GPU_BUDDY_CONTIGUOUS_ALLOCATION; pages_per_block = max_t(u32, 2UL << (20UL - PAGE_SHIFT), tbo->page_alignment); @@ -566,7 +567,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, list_add_tail(&vres->vres_node, &mgr->allocated_vres_list); if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && adjust_dcc_size) { - struct drm_buddy_block *dcc_block; + struct gpu_buddy_block *dcc_block; unsigned long dcc_start; u64 trim_start; @@ -576,7 +577,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, roundup((unsigned long)amdgpu_vram_mgr_block_start(dcc_block), adjust_dcc_size); trim_start = (u64)dcc_start; - drm_buddy_block_trim(mm, &trim_start, + gpu_buddy_block_trim(mm, &trim_start, (u64)vres->base.size, &vres->blocks); } @@ -614,7 +615,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man, return 0; error_free_blocks: - drm_buddy_free_list(mm, &vres->blocks, 0); + gpu_buddy_free_list(mm, &vres->blocks, 0); mutex_unlock(&mgr->lock); error_fini: ttm_resource_fini(man, &vres->base); @@ -637,8 +638,8 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, struct amdgpu_vram_mgr_resource *vres = to_amdgpu_vram_mgr_resource(res); struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); struct amdgpu_device *adev = to_amdgpu_device(mgr); - struct drm_buddy *mm = &mgr->mm; - struct drm_buddy_block *block; + struct gpu_buddy *mm = &mgr->mm; + struct gpu_buddy_block *block; uint64_t vis_usage = 0; mutex_lock(&mgr->lock); @@ -649,7 +650,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man, list_for_each_entry(block, &vres->blocks, link) vis_usage += amdgpu_vram_mgr_vis_size(adev, block); - drm_buddy_free_list(mm, &vres->blocks, vres->flags); + gpu_buddy_free_list(mm, &vres->blocks, vres->flags); amdgpu_vram_mgr_do_reserve(man); mutex_unlock(&mgr->lock); @@ -688,7 +689,7 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, if (!*sgt) return -ENOMEM; - /* Determine the number of DRM_BUDDY blocks to export */ + /* Determine the number of GPU_BUDDY blocks to export */ amdgpu_res_first(res, offset, length, &cursor); while (cursor.remaining) { num_entries++; @@ -704,10 +705,10 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev, sg->length = 0; /* - * Walk down DRM_BUDDY blocks to populate scatterlist nodes - * @note: Use iterator api to get first the DRM_BUDDY block + * Walk down GPU_BUDDY blocks to populate scatterlist nodes + * @note: Use iterator api to get first the GPU_BUDDY block * and the number of bytes from it. Access the following - * DRM_BUDDY block(s) if more buffer needs to exported + * GPU_BUDDY block(s) if more buffer needs to exported */ amdgpu_res_first(res, offset, length, &cursor); for_each_sgtable_sg((*sgt), sg, i) { @@ -792,10 +793,10 @@ uint64_t amdgpu_vram_mgr_vis_usage(struct amdgpu_vram_mgr *mgr) void amdgpu_vram_mgr_clear_reset_blocks(struct amdgpu_device *adev) { struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr; - struct drm_buddy *mm = &mgr->mm; + struct gpu_buddy *mm = &mgr->mm; mutex_lock(&mgr->lock); - drm_buddy_reset_clear(mm, false); + gpu_buddy_reset_clear(mm, false); mutex_unlock(&mgr->lock); } @@ -815,7 +816,7 @@ static bool amdgpu_vram_mgr_intersects(struct ttm_resource_manager *man, size_t size) { struct amdgpu_vram_mgr_resource *mgr = to_amdgpu_vram_mgr_resource(res); - struct drm_buddy_block *block; + struct gpu_buddy_block *block; /* Check each drm buddy block individually */ list_for_each_entry(block, &mgr->blocks, link) { @@ -848,7 +849,7 @@ static bool amdgpu_vram_mgr_compatible(struct ttm_resource_manager *man, size_t size) { struct amdgpu_vram_mgr_resource *mgr = to_amdgpu_vram_mgr_resource(res); - struct drm_buddy_block *block; + struct gpu_buddy_block *block; /* Check each drm buddy block individually */ list_for_each_entry(block, &mgr->blocks, link) { @@ -877,7 +878,7 @@ static void amdgpu_vram_mgr_debug(struct ttm_resource_manager *man, struct drm_printer *printer) { struct amdgpu_vram_mgr *mgr = to_vram_mgr(man); - struct drm_buddy *mm = &mgr->mm; + struct gpu_buddy *mm = &mgr->mm; struct amdgpu_vram_reservation *rsv; drm_printf(printer, " vis usage:%llu\n", @@ -930,7 +931,7 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev) mgr->default_page_size = PAGE_SIZE; man->func = &amdgpu_vram_mgr_func; - err = drm_buddy_init(&mgr->mm, man->size, PAGE_SIZE); + err = gpu_buddy_init(&mgr->mm, man->size, PAGE_SIZE); if (err) return err; @@ -965,11 +966,11 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev) kfree(rsv); list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) { - drm_buddy_free_list(&mgr->mm, &rsv->allocated, 0); + gpu_buddy_free_list(&mgr->mm, &rsv->allocated, 0); kfree(rsv); } if (!adev->gmc.is_app_apu) - drm_buddy_fini(&mgr->mm); + gpu_buddy_fini(&mgr->mm); mutex_unlock(&mgr->lock); ttm_resource_manager_cleanup(man); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h index 874779618056..429a21a2e9b2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h @@ -28,7 +28,7 @@ struct amdgpu_vram_mgr { struct ttm_resource_manager manager; - struct drm_buddy mm; + struct gpu_buddy mm; /* protects access to buffer objects */ struct mutex lock; struct list_head reservations_pending; @@ -57,19 +57,19 @@ struct amdgpu_vram_mgr_resource { struct amdgpu_vres_task task; }; -static inline u64 amdgpu_vram_mgr_block_start(struct drm_buddy_block *block) +static inline u64 amdgpu_vram_mgr_block_start(struct gpu_buddy_block *block) { - return drm_buddy_block_offset(block); + return gpu_buddy_block_offset(block); } -static inline u64 amdgpu_vram_mgr_block_size(struct drm_buddy_block *block) +static inline u64 amdgpu_vram_mgr_block_size(struct gpu_buddy_block *block) { - return (u64)PAGE_SIZE << drm_buddy_block_order(block); + return (u64)PAGE_SIZE << gpu_buddy_block_order(block); } -static inline bool amdgpu_vram_mgr_is_cleared(struct drm_buddy_block *block) +static inline bool amdgpu_vram_mgr_is_cleared(struct gpu_buddy_block *block) { - return drm_buddy_block_is_clear(block); + return gpu_buddy_block_is_clear(block); } static inline struct amdgpu_vram_mgr_resource * @@ -82,8 +82,8 @@ static inline void amdgpu_vram_mgr_set_cleared(struct ttm_resource *res) { struct amdgpu_vram_mgr_resource *ares = to_amdgpu_vram_mgr_resource(res); - WARN_ON(ares->flags & DRM_BUDDY_CLEARED); - ares->flags |= DRM_BUDDY_CLEARED; + WARN_ON(ares->flags & GPU_BUDDY_CLEARED); + ares->flags |= GPU_BUDDY_CLEARED; } int amdgpu_vram_mgr_query_address_block_info(struct amdgpu_vram_mgr *mgr, diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c new file mode 100644 index 000000000000..841f3de5f307 --- /dev/null +++ b/drivers/gpu/drm/drm_buddy.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include + +#include +#include +#include +#include + +#include +#include +#include + +/** + * drm_buddy_block_print - print block information + * + * @mm: DRM buddy manager + * @block: DRM buddy block + * @p: DRM printer to use + */ +void drm_buddy_block_print(struct gpu_buddy *mm, + struct gpu_buddy_block *block, + struct drm_printer *p) +{ + u64 start = gpu_buddy_block_offset(block); + u64 size = gpu_buddy_block_size(mm, block); + + drm_printf(p, "%#018llx-%#018llx: %llu\n", start, start + size, size); +} +EXPORT_SYMBOL(drm_buddy_block_print); + +/** + * drm_buddy_print - print allocator state + * + * @mm: DRM buddy manager + * @p: DRM printer to use + */ +void drm_buddy_print(struct gpu_buddy *mm, struct drm_printer *p) +{ + int order; + + drm_printf(p, "chunk_size: %lluKiB, total: %lluMiB, free: %lluMiB, clear_free: %lluMiB\n", + mm->chunk_size >> 10, mm->size >> 20, mm->avail >> 20, mm->clear_avail >> 20); + + for (order = mm->max_order; order >= 0; order--) { + struct gpu_buddy_block *block, *tmp; + struct rb_root *root; + u64 count = 0, free; + unsigned int tree; + + for_each_free_tree(tree) { + root = &mm->free_trees[tree][order]; + + rbtree_postorder_for_each_entry_safe(block, tmp, root, rb) { + BUG_ON(!gpu_buddy_block_is_free(block)); + count++; + } + } + + drm_printf(p, "order-%2d ", order); + + free = count * (mm->chunk_size << order); + if (free < SZ_1M) + drm_printf(p, "free: %8llu KiB", free >> 10); + else + drm_printf(p, "free: %8llu MiB", free >> 20); + + drm_printf(p, ", blocks: %llu\n", count); + } +} +EXPORT_SYMBOL(drm_buddy_print); + +MODULE_DESCRIPTION("DRM-specific GPU Buddy Allocator Print Helpers"); +MODULE_LICENSE("Dual MIT/GPL"); diff --git a/drivers/gpu/drm/i915/i915_scatterlist.c b/drivers/gpu/drm/i915/i915_scatterlist.c index 30246f02bcfe..6a34dae13769 100644 --- a/drivers/gpu/drm/i915/i915_scatterlist.c +++ b/drivers/gpu/drm/i915/i915_scatterlist.c @@ -167,9 +167,9 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, struct i915_ttm_buddy_resource *bman_res = to_ttm_buddy_resource(res); const u64 size = res->size; const u32 max_segment = round_down(UINT_MAX, page_alignment); - struct drm_buddy *mm = bman_res->mm; + struct gpu_buddy *mm = bman_res->mm; struct list_head *blocks = &bman_res->blocks; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; struct i915_refct_sgt *rsgt; struct scatterlist *sg; struct sg_table *st; @@ -202,8 +202,8 @@ struct i915_refct_sgt *i915_rsgt_from_buddy_resource(struct ttm_resource *res, list_for_each_entry(block, blocks, link) { u64 block_size, offset; - block_size = min_t(u64, size, drm_buddy_block_size(mm, block)); - offset = drm_buddy_block_offset(block); + block_size = min_t(u64, size, gpu_buddy_block_size(mm, block)); + offset = gpu_buddy_block_offset(block); while (block_size) { u64 len; diff --git a/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c b/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c index 6b256d95badd..c5ca90088705 100644 --- a/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c +++ b/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -16,7 +17,7 @@ struct i915_ttm_buddy_manager { struct ttm_resource_manager manager; - struct drm_buddy mm; + struct gpu_buddy mm; struct list_head reserved; struct mutex lock; unsigned long visible_size; @@ -38,7 +39,7 @@ static int i915_ttm_buddy_man_alloc(struct ttm_resource_manager *man, { struct i915_ttm_buddy_manager *bman = to_buddy_manager(man); struct i915_ttm_buddy_resource *bman_res; - struct drm_buddy *mm = &bman->mm; + struct gpu_buddy *mm = &bman->mm; unsigned long n_pages, lpfn; u64 min_page_size; u64 size; @@ -57,13 +58,13 @@ static int i915_ttm_buddy_man_alloc(struct ttm_resource_manager *man, bman_res->mm = mm; if (place->flags & TTM_PL_FLAG_TOPDOWN) - bman_res->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION; + bman_res->flags |= GPU_BUDDY_TOPDOWN_ALLOCATION; if (place->flags & TTM_PL_FLAG_CONTIGUOUS) - bman_res->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION; + bman_res->flags |= GPU_BUDDY_CONTIGUOUS_ALLOCATION; if (place->fpfn || lpfn != man->size) - bman_res->flags |= DRM_BUDDY_RANGE_ALLOCATION; + bman_res->flags |= GPU_BUDDY_RANGE_ALLOCATION; GEM_BUG_ON(!bman_res->base.size); size = bman_res->base.size; @@ -89,7 +90,7 @@ static int i915_ttm_buddy_man_alloc(struct ttm_resource_manager *man, goto err_free_res; } - err = drm_buddy_alloc_blocks(mm, (u64)place->fpfn << PAGE_SHIFT, + err = gpu_buddy_alloc_blocks(mm, (u64)place->fpfn << PAGE_SHIFT, (u64)lpfn << PAGE_SHIFT, (u64)n_pages << PAGE_SHIFT, min_page_size, @@ -101,15 +102,15 @@ static int i915_ttm_buddy_man_alloc(struct ttm_resource_manager *man, if (lpfn <= bman->visible_size) { bman_res->used_visible_size = PFN_UP(bman_res->base.size); } else { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; list_for_each_entry(block, &bman_res->blocks, link) { unsigned long start = - drm_buddy_block_offset(block) >> PAGE_SHIFT; + gpu_buddy_block_offset(block) >> PAGE_SHIFT; if (start < bman->visible_size) { unsigned long end = start + - (drm_buddy_block_size(mm, block) >> PAGE_SHIFT); + (gpu_buddy_block_size(mm, block) >> PAGE_SHIFT); bman_res->used_visible_size += min(end, bman->visible_size) - start; @@ -126,7 +127,7 @@ static int i915_ttm_buddy_man_alloc(struct ttm_resource_manager *man, return 0; err_free_blocks: - drm_buddy_free_list(mm, &bman_res->blocks, 0); + gpu_buddy_free_list(mm, &bman_res->blocks, 0); mutex_unlock(&bman->lock); err_free_res: ttm_resource_fini(man, &bman_res->base); @@ -141,7 +142,7 @@ static void i915_ttm_buddy_man_free(struct ttm_resource_manager *man, struct i915_ttm_buddy_manager *bman = to_buddy_manager(man); mutex_lock(&bman->lock); - drm_buddy_free_list(&bman->mm, &bman_res->blocks, 0); + gpu_buddy_free_list(&bman->mm, &bman_res->blocks, 0); bman->visible_avail += bman_res->used_visible_size; mutex_unlock(&bman->lock); @@ -156,8 +157,8 @@ static bool i915_ttm_buddy_man_intersects(struct ttm_resource_manager *man, { struct i915_ttm_buddy_resource *bman_res = to_ttm_buddy_resource(res); struct i915_ttm_buddy_manager *bman = to_buddy_manager(man); - struct drm_buddy *mm = &bman->mm; - struct drm_buddy_block *block; + struct gpu_buddy *mm = &bman->mm; + struct gpu_buddy_block *block; if (!place->fpfn && !place->lpfn) return true; @@ -176,9 +177,9 @@ static bool i915_ttm_buddy_man_intersects(struct ttm_resource_manager *man, /* Check each drm buddy block individually */ list_for_each_entry(block, &bman_res->blocks, link) { unsigned long fpfn = - drm_buddy_block_offset(block) >> PAGE_SHIFT; + gpu_buddy_block_offset(block) >> PAGE_SHIFT; unsigned long lpfn = fpfn + - (drm_buddy_block_size(mm, block) >> PAGE_SHIFT); + (gpu_buddy_block_size(mm, block) >> PAGE_SHIFT); if (place->fpfn < lpfn && place->lpfn > fpfn) return true; @@ -194,8 +195,8 @@ static bool i915_ttm_buddy_man_compatible(struct ttm_resource_manager *man, { struct i915_ttm_buddy_resource *bman_res = to_ttm_buddy_resource(res); struct i915_ttm_buddy_manager *bman = to_buddy_manager(man); - struct drm_buddy *mm = &bman->mm; - struct drm_buddy_block *block; + struct gpu_buddy *mm = &bman->mm; + struct gpu_buddy_block *block; if (!place->fpfn && !place->lpfn) return true; @@ -209,9 +210,9 @@ static bool i915_ttm_buddy_man_compatible(struct ttm_resource_manager *man, /* Check each drm buddy block individually */ list_for_each_entry(block, &bman_res->blocks, link) { unsigned long fpfn = - drm_buddy_block_offset(block) >> PAGE_SHIFT; + gpu_buddy_block_offset(block) >> PAGE_SHIFT; unsigned long lpfn = fpfn + - (drm_buddy_block_size(mm, block) >> PAGE_SHIFT); + (gpu_buddy_block_size(mm, block) >> PAGE_SHIFT); if (fpfn < place->fpfn || lpfn > place->lpfn) return false; @@ -224,7 +225,7 @@ static void i915_ttm_buddy_man_debug(struct ttm_resource_manager *man, struct drm_printer *printer) { struct i915_ttm_buddy_manager *bman = to_buddy_manager(man); - struct drm_buddy_block *block; + struct gpu_buddy_block *block; mutex_lock(&bman->lock); drm_printf(printer, "default_page_size: %lluKiB\n", @@ -293,7 +294,7 @@ int i915_ttm_buddy_man_init(struct ttm_device *bdev, if (!bman) return -ENOMEM; - err = drm_buddy_init(&bman->mm, size, chunk_size); + err = gpu_buddy_init(&bman->mm, size, chunk_size); if (err) goto err_free_bman; @@ -333,7 +334,7 @@ int i915_ttm_buddy_man_fini(struct ttm_device *bdev, unsigned int type) { struct ttm_resource_manager *man = ttm_manager_type(bdev, type); struct i915_ttm_buddy_manager *bman = to_buddy_manager(man); - struct drm_buddy *mm = &bman->mm; + struct gpu_buddy *mm = &bman->mm; int ret; ttm_resource_manager_set_used(man, false); @@ -345,8 +346,8 @@ int i915_ttm_buddy_man_fini(struct ttm_device *bdev, unsigned int type) ttm_set_driver_manager(bdev, type, NULL); mutex_lock(&bman->lock); - drm_buddy_free_list(mm, &bman->reserved, 0); - drm_buddy_fini(mm); + gpu_buddy_free_list(mm, &bman->reserved, 0); + gpu_buddy_fini(mm); bman->visible_avail += bman->visible_reserved; WARN_ON_ONCE(bman->visible_avail != bman->visible_size); mutex_unlock(&bman->lock); @@ -371,15 +372,15 @@ int i915_ttm_buddy_man_reserve(struct ttm_resource_manager *man, u64 start, u64 size) { struct i915_ttm_buddy_manager *bman = to_buddy_manager(man); - struct drm_buddy *mm = &bman->mm; + struct gpu_buddy *mm = &bman->mm; unsigned long fpfn = start >> PAGE_SHIFT; unsigned long flags = 0; int ret; - flags |= DRM_BUDDY_RANGE_ALLOCATION; + flags |= GPU_BUDDY_RANGE_ALLOCATION; mutex_lock(&bman->lock); - ret = drm_buddy_alloc_blocks(mm, start, + ret = gpu_buddy_alloc_blocks(mm, start, start + size, size, mm->chunk_size, &bman->reserved, diff --git a/drivers/gpu/drm/i915/i915_ttm_buddy_manager.h b/drivers/gpu/drm/i915/i915_ttm_buddy_manager.h index d64620712830..1cff018c1689 100644 --- a/drivers/gpu/drm/i915/i915_ttm_buddy_manager.h +++ b/drivers/gpu/drm/i915/i915_ttm_buddy_manager.h @@ -13,7 +13,7 @@ struct ttm_device; struct ttm_resource_manager; -struct drm_buddy; +struct gpu_buddy; /** * struct i915_ttm_buddy_resource @@ -33,7 +33,7 @@ struct i915_ttm_buddy_resource { struct list_head blocks; unsigned long flags; unsigned long used_visible_size; - struct drm_buddy *mm; + struct gpu_buddy *mm; }; /** diff --git a/drivers/gpu/drm/i915/selftests/intel_memory_region.c b/drivers/gpu/drm/i915/selftests/intel_memory_region.c index 7b856b5090f9..8307390943a2 100644 --- a/drivers/gpu/drm/i915/selftests/intel_memory_region.c +++ b/drivers/gpu/drm/i915/selftests/intel_memory_region.c @@ -6,7 +6,7 @@ #include #include -#include +#include #include "../i915_selftest.h" @@ -371,7 +371,7 @@ static int igt_mock_splintered_region(void *arg) struct drm_i915_private *i915 = mem->i915; struct i915_ttm_buddy_resource *res; struct drm_i915_gem_object *obj; - struct drm_buddy *mm; + struct gpu_buddy *mm; unsigned int expected_order; LIST_HEAD(objects); u64 size; @@ -447,8 +447,8 @@ static int igt_mock_max_segment(void *arg) struct drm_i915_private *i915 = mem->i915; struct i915_ttm_buddy_resource *res; struct drm_i915_gem_object *obj; - struct drm_buddy_block *block; - struct drm_buddy *mm; + struct gpu_buddy_block *block; + struct gpu_buddy *mm; struct list_head *blocks; struct scatterlist *sg; I915_RND_STATE(prng); @@ -487,8 +487,8 @@ static int igt_mock_max_segment(void *arg) mm = res->mm; size = 0; list_for_each_entry(block, blocks, link) { - if (drm_buddy_block_size(mm, block) > size) - size = drm_buddy_block_size(mm, block); + if (gpu_buddy_block_size(mm, block) > size) + size = gpu_buddy_block_size(mm, block); } if (size < max_segment) { pr_err("%s: Failed to create a huge contiguous block [> %u], largest block %lld\n", @@ -527,14 +527,14 @@ static u64 igt_object_mappable_total(struct drm_i915_gem_object *obj) struct intel_memory_region *mr = obj->mm.region; struct i915_ttm_buddy_resource *bman_res = to_ttm_buddy_resource(obj->mm.res); - struct drm_buddy *mm = bman_res->mm; - struct drm_buddy_block *block; + struct gpu_buddy *mm = bman_res->mm; + struct gpu_buddy_block *block; u64 total; total = 0; list_for_each_entry(block, &bman_res->blocks, link) { - u64 start = drm_buddy_block_offset(block); - u64 end = start + drm_buddy_block_size(mm, block); + u64 start = gpu_buddy_block_offset(block); + u64 end = start + gpu_buddy_block_size(mm, block); if (start < resource_size(&mr->io)) total += min_t(u64, end, resource_size(&mr->io)) - start; diff --git a/drivers/gpu/drm/ttm/tests/ttm_bo_validate_test.c b/drivers/gpu/drm/ttm/tests/ttm_bo_validate_test.c index 6d95447a989d..e32f3c8d7b84 100644 --- a/drivers/gpu/drm/ttm/tests/ttm_bo_validate_test.c +++ b/drivers/gpu/drm/ttm/tests/ttm_bo_validate_test.c @@ -251,7 +251,7 @@ static void ttm_bo_validate_basic(struct kunit *test) NULL, &dummy_ttm_bo_destroy); KUNIT_EXPECT_EQ(test, err, 0); - snd_place = ttm_place_kunit_init(test, snd_mem, DRM_BUDDY_TOPDOWN_ALLOCATION); + snd_place = ttm_place_kunit_init(test, snd_mem, GPU_BUDDY_TOPDOWN_ALLOCATION); snd_placement = ttm_placement_kunit_init(test, snd_place, 1); err = ttm_bo_validate(bo, snd_placement, &ctx_val); @@ -263,7 +263,7 @@ static void ttm_bo_validate_basic(struct kunit *test) KUNIT_EXPECT_TRUE(test, ttm_tt_is_populated(bo->ttm)); KUNIT_EXPECT_EQ(test, bo->resource->mem_type, snd_mem); KUNIT_EXPECT_EQ(test, bo->resource->placement, - DRM_BUDDY_TOPDOWN_ALLOCATION); + GPU_BUDDY_TOPDOWN_ALLOCATION); ttm_bo_fini(bo); ttm_mock_manager_fini(priv->ttm_dev, snd_mem); diff --git a/drivers/gpu/drm/ttm/tests/ttm_mock_manager.c b/drivers/gpu/drm/ttm/tests/ttm_mock_manager.c index dd395229e388..294d56d9067e 100644 --- a/drivers/gpu/drm/ttm/tests/ttm_mock_manager.c +++ b/drivers/gpu/drm/ttm/tests/ttm_mock_manager.c @@ -31,7 +31,7 @@ static int ttm_mock_manager_alloc(struct ttm_resource_manager *man, { struct ttm_mock_manager *manager = to_mock_mgr(man); struct ttm_mock_resource *mock_res; - struct drm_buddy *mm = &manager->mm; + struct gpu_buddy *mm = &manager->mm; u64 lpfn, fpfn, alloc_size; int err; @@ -47,14 +47,14 @@ static int ttm_mock_manager_alloc(struct ttm_resource_manager *man, INIT_LIST_HEAD(&mock_res->blocks); if (place->flags & TTM_PL_FLAG_TOPDOWN) - mock_res->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION; + mock_res->flags |= GPU_BUDDY_TOPDOWN_ALLOCATION; if (place->flags & TTM_PL_FLAG_CONTIGUOUS) - mock_res->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION; + mock_res->flags |= GPU_BUDDY_CONTIGUOUS_ALLOCATION; alloc_size = (uint64_t)mock_res->base.size; mutex_lock(&manager->lock); - err = drm_buddy_alloc_blocks(mm, fpfn, lpfn, alloc_size, + err = gpu_buddy_alloc_blocks(mm, fpfn, lpfn, alloc_size, manager->default_page_size, &mock_res->blocks, mock_res->flags); @@ -67,7 +67,7 @@ static int ttm_mock_manager_alloc(struct ttm_resource_manager *man, return 0; error_free_blocks: - drm_buddy_free_list(mm, &mock_res->blocks, 0); + gpu_buddy_free_list(mm, &mock_res->blocks, 0); ttm_resource_fini(man, &mock_res->base); mutex_unlock(&manager->lock); @@ -79,10 +79,10 @@ static void ttm_mock_manager_free(struct ttm_resource_manager *man, { struct ttm_mock_manager *manager = to_mock_mgr(man); struct ttm_mock_resource *mock_res = to_mock_mgr_resource(res); - struct drm_buddy *mm = &manager->mm; + struct gpu_buddy *mm = &manager->mm; mutex_lock(&manager->lock); - drm_buddy_free_list(mm, &mock_res->blocks, 0); + gpu_buddy_free_list(mm, &mock_res->blocks, 0); mutex_unlock(&manager->lock); ttm_resource_fini(man, res); @@ -106,7 +106,7 @@ int ttm_mock_manager_init(struct ttm_device *bdev, u32 mem_type, u32 size) mutex_init(&manager->lock); - err = drm_buddy_init(&manager->mm, size, PAGE_SIZE); + err = gpu_buddy_init(&manager->mm, size, PAGE_SIZE); if (err) { kfree(manager); @@ -142,7 +142,7 @@ void ttm_mock_manager_fini(struct ttm_device *bdev, u32 mem_type) ttm_resource_manager_set_used(man, false); mutex_lock(&mock_man->lock); - drm_buddy_fini(&mock_man->mm); + gpu_buddy_fini(&mock_man->mm); mutex_unlock(&mock_man->lock); ttm_set_driver_manager(bdev, mem_type, NULL); diff --git a/drivers/gpu/drm/ttm/tests/ttm_mock_manager.h b/drivers/gpu/drm/ttm/tests/ttm_mock_manager.h index 96ea8c9aae34..08710756fd8e 100644 --- a/drivers/gpu/drm/ttm/tests/ttm_mock_manager.h +++ b/drivers/gpu/drm/ttm/tests/ttm_mock_manager.h @@ -9,7 +9,7 @@ struct ttm_mock_manager { struct ttm_resource_manager man; - struct drm_buddy mm; + struct gpu_buddy mm; u64 default_page_size; /* protects allocations of mock buffer objects */ struct mutex lock; diff --git a/drivers/gpu/drm/xe/xe_res_cursor.h b/drivers/gpu/drm/xe/xe_res_cursor.h index 4e00008b7081..5f4ab08c0686 100644 --- a/drivers/gpu/drm/xe/xe_res_cursor.h +++ b/drivers/gpu/drm/xe/xe_res_cursor.h @@ -58,7 +58,7 @@ struct xe_res_cursor { /** @dma_addr: Current element in a struct drm_pagemap_addr array */ const struct drm_pagemap_addr *dma_addr; /** @mm: Buddy allocator for VRAM cursor */ - struct drm_buddy *mm; + struct gpu_buddy *mm; /** * @dma_start: DMA start address for the current segment. * This may be different to @dma_addr.addr since elements in @@ -69,7 +69,7 @@ struct xe_res_cursor { u64 dma_seg_size; }; -static struct drm_buddy *xe_res_get_buddy(struct ttm_resource *res) +static struct gpu_buddy *xe_res_get_buddy(struct ttm_resource *res) { struct ttm_resource_manager *mgr; @@ -104,30 +104,30 @@ static inline void xe_res_first(struct ttm_resource *res, case XE_PL_STOLEN: case XE_PL_VRAM0: case XE_PL_VRAM1: { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; struct list_head *head, *next; - struct drm_buddy *mm = xe_res_get_buddy(res); + struct gpu_buddy *mm = xe_res_get_buddy(res); head = &to_xe_ttm_vram_mgr_resource(res)->blocks; block = list_first_entry_or_null(head, - struct drm_buddy_block, + struct gpu_buddy_block, link); if (!block) goto fallback; - while (start >= drm_buddy_block_size(mm, block)) { - start -= drm_buddy_block_size(mm, block); + while (start >= gpu_buddy_block_size(mm, block)) { + start -= gpu_buddy_block_size(mm, block); next = block->link.next; if (next != head) - block = list_entry(next, struct drm_buddy_block, + block = list_entry(next, struct gpu_buddy_block, link); } cur->mm = mm; - cur->start = drm_buddy_block_offset(block) + start; - cur->size = min(drm_buddy_block_size(mm, block) - start, + cur->start = gpu_buddy_block_offset(block) + start; + cur->size = min(gpu_buddy_block_size(mm, block) - start, size); cur->remaining = size; cur->node = block; @@ -259,7 +259,7 @@ static inline void xe_res_first_dma(const struct drm_pagemap_addr *dma_addr, */ static inline void xe_res_next(struct xe_res_cursor *cur, u64 size) { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; struct list_head *next; u64 start; @@ -295,18 +295,18 @@ static inline void xe_res_next(struct xe_res_cursor *cur, u64 size) block = cur->node; next = block->link.next; - block = list_entry(next, struct drm_buddy_block, link); + block = list_entry(next, struct gpu_buddy_block, link); - while (start >= drm_buddy_block_size(cur->mm, block)) { - start -= drm_buddy_block_size(cur->mm, block); + while (start >= gpu_buddy_block_size(cur->mm, block)) { + start -= gpu_buddy_block_size(cur->mm, block); next = block->link.next; - block = list_entry(next, struct drm_buddy_block, link); + block = list_entry(next, struct gpu_buddy_block, link); } - cur->start = drm_buddy_block_offset(block) + start; - cur->size = min(drm_buddy_block_size(cur->mm, block) - start, + cur->start = gpu_buddy_block_offset(block) + start; + cur->size = min(gpu_buddy_block_size(cur->mm, block) - start, cur->remaining); cur->node = block; break; diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c index 213f0334518a..cda3bf7e2418 100644 --- a/drivers/gpu/drm/xe/xe_svm.c +++ b/drivers/gpu/drm/xe/xe_svm.c @@ -747,7 +747,7 @@ static u64 block_offset_to_pfn(struct drm_pagemap *dpagemap, u64 offset) return PHYS_PFN(offset + xpagemap->hpa_base); } -static struct drm_buddy *vram_to_buddy(struct xe_vram_region *vram) +static struct gpu_buddy *vram_to_buddy(struct xe_vram_region *vram) { return &vram->ttm.mm; } @@ -758,17 +758,17 @@ static int xe_svm_populate_devmem_pfn(struct drm_pagemap_devmem *devmem_allocati struct xe_bo *bo = to_xe_bo(devmem_allocation); struct ttm_resource *res = bo->ttm.resource; struct list_head *blocks = &to_xe_ttm_vram_mgr_resource(res)->blocks; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; int j = 0; list_for_each_entry(block, blocks, link) { struct xe_vram_region *vr = block->private; - struct drm_buddy *buddy = vram_to_buddy(vr); + struct gpu_buddy *buddy = vram_to_buddy(vr); u64 block_pfn = block_offset_to_pfn(devmem_allocation->dpagemap, - drm_buddy_block_offset(block)); + gpu_buddy_block_offset(block)); int i; - for (i = 0; i < drm_buddy_block_size(buddy, block) >> PAGE_SHIFT; ++i) + for (i = 0; i < gpu_buddy_block_size(buddy, block) >> PAGE_SHIFT; ++i) pfn[j++] = block_pfn + i; } @@ -1033,7 +1033,7 @@ static int xe_drm_pagemap_populate_mm(struct drm_pagemap *dpagemap, struct dma_fence *pre_migrate_fence = NULL; struct xe_device *xe = vr->xe; struct device *dev = xe->drm.dev; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; struct xe_validation_ctx vctx; struct list_head *blocks; struct drm_exec exec; diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c index 6553a19f7cf2..d119217d566a 100644 --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -16,16 +17,16 @@ #include "xe_ttm_vram_mgr.h" #include "xe_vram_types.h" -static inline struct drm_buddy_block * +static inline struct gpu_buddy_block * xe_ttm_vram_mgr_first_block(struct list_head *list) { - return list_first_entry_or_null(list, struct drm_buddy_block, link); + return list_first_entry_or_null(list, struct gpu_buddy_block, link); } -static inline bool xe_is_vram_mgr_blocks_contiguous(struct drm_buddy *mm, +static inline bool xe_is_vram_mgr_blocks_contiguous(struct gpu_buddy *mm, struct list_head *head) { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; u64 start, size; block = xe_ttm_vram_mgr_first_block(head); @@ -33,12 +34,12 @@ static inline bool xe_is_vram_mgr_blocks_contiguous(struct drm_buddy *mm, return false; while (head != block->link.next) { - start = drm_buddy_block_offset(block); - size = drm_buddy_block_size(mm, block); + start = gpu_buddy_block_offset(block); + size = gpu_buddy_block_size(mm, block); - block = list_entry(block->link.next, struct drm_buddy_block, + block = list_entry(block->link.next, struct gpu_buddy_block, link); - if (start + size != drm_buddy_block_offset(block)) + if (start + size != gpu_buddy_block_offset(block)) return false; } @@ -52,7 +53,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, { struct xe_ttm_vram_mgr *mgr = to_xe_ttm_vram_mgr(man); struct xe_ttm_vram_mgr_resource *vres; - struct drm_buddy *mm = &mgr->mm; + struct gpu_buddy *mm = &mgr->mm; u64 size, min_page_size; unsigned long lpfn; int err; @@ -79,10 +80,10 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, INIT_LIST_HEAD(&vres->blocks); if (place->flags & TTM_PL_FLAG_TOPDOWN) - vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION; + vres->flags |= GPU_BUDDY_TOPDOWN_ALLOCATION; if (place->fpfn || lpfn != man->size >> PAGE_SHIFT) - vres->flags |= DRM_BUDDY_RANGE_ALLOCATION; + vres->flags |= GPU_BUDDY_RANGE_ALLOCATION; if (WARN_ON(!vres->base.size)) { err = -EINVAL; @@ -118,27 +119,27 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, lpfn = max_t(unsigned long, place->fpfn + (size >> PAGE_SHIFT), lpfn); } - err = drm_buddy_alloc_blocks(mm, (u64)place->fpfn << PAGE_SHIFT, + err = gpu_buddy_alloc_blocks(mm, (u64)place->fpfn << PAGE_SHIFT, (u64)lpfn << PAGE_SHIFT, size, min_page_size, &vres->blocks, vres->flags); if (err) goto error_unlock; if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { - if (!drm_buddy_block_trim(mm, NULL, vres->base.size, &vres->blocks)) + if (!gpu_buddy_block_trim(mm, NULL, vres->base.size, &vres->blocks)) size = vres->base.size; } if (lpfn <= mgr->visible_size >> PAGE_SHIFT) { vres->used_visible_size = size; } else { - struct drm_buddy_block *block; + struct gpu_buddy_block *block; list_for_each_entry(block, &vres->blocks, link) { - u64 start = drm_buddy_block_offset(block); + u64 start = gpu_buddy_block_offset(block); if (start < mgr->visible_size) { - u64 end = start + drm_buddy_block_size(mm, block); + u64 end = start + gpu_buddy_block_size(mm, block); vres->used_visible_size += min(end, mgr->visible_size) - start; @@ -158,11 +159,11 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man, * the object. */ if (vres->base.placement & TTM_PL_FLAG_CONTIGUOUS) { - struct drm_buddy_block *block = list_first_entry(&vres->blocks, + struct gpu_buddy_block *block = list_first_entry(&vres->blocks, typeof(*block), link); - vres->base.start = drm_buddy_block_offset(block) >> PAGE_SHIFT; + vres->base.start = gpu_buddy_block_offset(block) >> PAGE_SHIFT; } else { vres->base.start = XE_BO_INVALID_OFFSET; } @@ -184,10 +185,10 @@ static void xe_ttm_vram_mgr_del(struct ttm_resource_manager *man, struct xe_ttm_vram_mgr_resource *vres = to_xe_ttm_vram_mgr_resource(res); struct xe_ttm_vram_mgr *mgr = to_xe_ttm_vram_mgr(man); - struct drm_buddy *mm = &mgr->mm; + struct gpu_buddy *mm = &mgr->mm; mutex_lock(&mgr->lock); - drm_buddy_free_list(mm, &vres->blocks, 0); + gpu_buddy_free_list(mm, &vres->blocks, 0); mgr->visible_avail += vres->used_visible_size; mutex_unlock(&mgr->lock); @@ -200,7 +201,7 @@ static void xe_ttm_vram_mgr_debug(struct ttm_resource_manager *man, struct drm_printer *printer) { struct xe_ttm_vram_mgr *mgr = to_xe_ttm_vram_mgr(man); - struct drm_buddy *mm = &mgr->mm; + struct gpu_buddy *mm = &mgr->mm; mutex_lock(&mgr->lock); drm_printf(printer, "default_page_size: %lluKiB\n", @@ -223,8 +224,8 @@ static bool xe_ttm_vram_mgr_intersects(struct ttm_resource_manager *man, struct xe_ttm_vram_mgr *mgr = to_xe_ttm_vram_mgr(man); struct xe_ttm_vram_mgr_resource *vres = to_xe_ttm_vram_mgr_resource(res); - struct drm_buddy *mm = &mgr->mm; - struct drm_buddy_block *block; + struct gpu_buddy *mm = &mgr->mm; + struct gpu_buddy_block *block; if (!place->fpfn && !place->lpfn) return true; @@ -234,9 +235,9 @@ static bool xe_ttm_vram_mgr_intersects(struct ttm_resource_manager *man, list_for_each_entry(block, &vres->blocks, link) { unsigned long fpfn = - drm_buddy_block_offset(block) >> PAGE_SHIFT; + gpu_buddy_block_offset(block) >> PAGE_SHIFT; unsigned long lpfn = fpfn + - (drm_buddy_block_size(mm, block) >> PAGE_SHIFT); + (gpu_buddy_block_size(mm, block) >> PAGE_SHIFT); if (place->fpfn < lpfn && place->lpfn > fpfn) return true; @@ -253,8 +254,8 @@ static bool xe_ttm_vram_mgr_compatible(struct ttm_resource_manager *man, struct xe_ttm_vram_mgr *mgr = to_xe_ttm_vram_mgr(man); struct xe_ttm_vram_mgr_resource *vres = to_xe_ttm_vram_mgr_resource(res); - struct drm_buddy *mm = &mgr->mm; - struct drm_buddy_block *block; + struct gpu_buddy *mm = &mgr->mm; + struct gpu_buddy_block *block; if (!place->fpfn && !place->lpfn) return true; @@ -264,9 +265,9 @@ static bool xe_ttm_vram_mgr_compatible(struct ttm_resource_manager *man, list_for_each_entry(block, &vres->blocks, link) { unsigned long fpfn = - drm_buddy_block_offset(block) >> PAGE_SHIFT; + gpu_buddy_block_offset(block) >> PAGE_SHIFT; unsigned long lpfn = fpfn + - (drm_buddy_block_size(mm, block) >> PAGE_SHIFT); + (gpu_buddy_block_size(mm, block) >> PAGE_SHIFT); if (fpfn < place->fpfn || lpfn > place->lpfn) return false; @@ -296,7 +297,7 @@ static void xe_ttm_vram_mgr_fini(struct drm_device *dev, void *arg) WARN_ON_ONCE(mgr->visible_avail != mgr->visible_size); - drm_buddy_fini(&mgr->mm); + gpu_buddy_fini(&mgr->mm); ttm_resource_manager_cleanup(&mgr->manager); @@ -327,7 +328,7 @@ int __xe_ttm_vram_mgr_init(struct xe_device *xe, struct xe_ttm_vram_mgr *mgr, mgr->visible_avail = io_size; ttm_resource_manager_init(man, &xe->ttm, size); - err = drm_buddy_init(&mgr->mm, man->size, default_page_size); + err = gpu_buddy_init(&mgr->mm, man->size, default_page_size); if (err) return err; @@ -375,7 +376,7 @@ int xe_ttm_vram_mgr_alloc_sgt(struct xe_device *xe, if (!*sgt) return -ENOMEM; - /* Determine the number of DRM_BUDDY blocks to export */ + /* Determine the number of GPU_BUDDY blocks to export */ xe_res_first(res, offset, length, &cursor); while (cursor.remaining) { num_entries++; @@ -392,10 +393,10 @@ int xe_ttm_vram_mgr_alloc_sgt(struct xe_device *xe, sg->length = 0; /* - * Walk down DRM_BUDDY blocks to populate scatterlist nodes - * @note: Use iterator api to get first the DRM_BUDDY block + * Walk down GPU_BUDDY blocks to populate scatterlist nodes + * @note: Use iterator api to get first the GPU_BUDDY block * and the number of bytes from it. Access the following - * DRM_BUDDY block(s) if more buffer needs to exported + * GPU_BUDDY block(s) if more buffer needs to exported */ xe_res_first(res, offset, length, &cursor); for_each_sgtable_sg((*sgt), sg, i) { diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h b/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h index babeec5511d9..9106da056b49 100644 --- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h +++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr_types.h @@ -18,7 +18,7 @@ struct xe_ttm_vram_mgr { /** @manager: Base TTM resource manager */ struct ttm_resource_manager manager; /** @mm: DRM buddy allocator which manages the VRAM */ - struct drm_buddy mm; + struct gpu_buddy mm; /** @visible_size: Proped size of the CPU visible portion */ u64 visible_size; /** @visible_avail: CPU visible portion still unallocated */ diff --git a/drivers/gpu/tests/Makefile b/drivers/gpu/tests/Makefile index 8e7654e87d82..4183e6e2de45 100644 --- a/drivers/gpu/tests/Makefile +++ b/drivers/gpu/tests/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 gpu_buddy_tests-y = gpu_buddy_test.o gpu_random.o -obj-$(CONFIG_DRM_KUNIT_TEST) += gpu_buddy_tests.o +obj-$(CONFIG_GPU_BUDDY_KUNIT_TEST) += gpu_buddy_tests.o diff --git a/drivers/gpu/tests/gpu_buddy_test.c b/drivers/gpu/tests/gpu_buddy_test.c index b905932da990..450e71deed90 100644 --- a/drivers/gpu/tests/gpu_buddy_test.c +++ b/drivers/gpu/tests/gpu_buddy_test.c @@ -21,9 +21,9 @@ static inline u64 get_size(int order, u64 chunk_size) return (1 << order) * chunk_size; } -static void drm_test_buddy_fragmentation_performance(struct kunit *test) +static void gpu_test_buddy_fragmentation_performance(struct kunit *test) { - struct drm_buddy_block *block, *tmp; + struct gpu_buddy_block *block, *tmp; int num_blocks, i, ret, count = 0; LIST_HEAD(allocated_blocks); unsigned long elapsed_ms; @@ -32,7 +32,7 @@ static void drm_test_buddy_fragmentation_performance(struct kunit *test) LIST_HEAD(clear_list); LIST_HEAD(dirty_list); LIST_HEAD(free_list); - struct drm_buddy mm; + struct gpu_buddy mm; u64 mm_size = SZ_4G; ktime_t start, end; @@ -47,7 +47,7 @@ static void drm_test_buddy_fragmentation_performance(struct kunit *test) * quickly the allocator can satisfy larger, aligned requests from a pool of * highly fragmented space. */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, mm_size, SZ_4K), "buddy_init failed\n"); num_blocks = mm_size / SZ_64K; @@ -55,7 +55,7 @@ static void drm_test_buddy_fragmentation_performance(struct kunit *test) start = ktime_get(); /* Allocate with maximum fragmentation - 8K blocks with 64K alignment */ for (i = 0; i < num_blocks; i++) - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, SZ_8K, SZ_64K, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, SZ_8K, SZ_64K, &allocated_blocks, 0), "buddy_alloc hit an error size=%u\n", SZ_8K); @@ -68,21 +68,21 @@ static void drm_test_buddy_fragmentation_performance(struct kunit *test) } /* Free with different flags to ensure no coalescing */ - drm_buddy_free_list(&mm, &clear_list, DRM_BUDDY_CLEARED); - drm_buddy_free_list(&mm, &dirty_list, 0); + gpu_buddy_free_list(&mm, &clear_list, GPU_BUDDY_CLEARED); + gpu_buddy_free_list(&mm, &dirty_list, 0); for (i = 0; i < num_blocks; i++) - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, SZ_64K, SZ_64K, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, SZ_64K, SZ_64K, &test_blocks, 0), "buddy_alloc hit an error size=%u\n", SZ_64K); - drm_buddy_free_list(&mm, &test_blocks, 0); + gpu_buddy_free_list(&mm, &test_blocks, 0); end = ktime_get(); elapsed_ms = ktime_to_ms(ktime_sub(end, start)); kunit_info(test, "Fragmented allocation took %lu ms\n", elapsed_ms); - drm_buddy_fini(&mm); + gpu_buddy_fini(&mm); /* * Reverse free order under fragmentation @@ -96,13 +96,13 @@ static void drm_test_buddy_fragmentation_performance(struct kunit *test) * deallocation occurs in the opposite order of allocation, exposing the * cost difference between a linear freelist scan and an ordered tree lookup. */ - ret = drm_buddy_init(&mm, mm_size, SZ_4K); + ret = gpu_buddy_init(&mm, mm_size, SZ_4K); KUNIT_ASSERT_EQ(test, ret, 0); start = ktime_get(); /* Allocate maximum fragmentation */ for (i = 0; i < num_blocks; i++) - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, SZ_8K, SZ_64K, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, SZ_8K, SZ_64K, &allocated_blocks, 0), "buddy_alloc hit an error size=%u\n", SZ_8K); @@ -111,28 +111,28 @@ static void drm_test_buddy_fragmentation_performance(struct kunit *test) list_move_tail(&block->link, &free_list); count++; } - drm_buddy_free_list(&mm, &free_list, DRM_BUDDY_CLEARED); + gpu_buddy_free_list(&mm, &free_list, GPU_BUDDY_CLEARED); list_for_each_entry_safe_reverse(block, tmp, &allocated_blocks, link) list_move(&block->link, &reverse_list); - drm_buddy_free_list(&mm, &reverse_list, DRM_BUDDY_CLEARED); + gpu_buddy_free_list(&mm, &reverse_list, GPU_BUDDY_CLEARED); end = ktime_get(); elapsed_ms = ktime_to_ms(ktime_sub(end, start)); kunit_info(test, "Reverse-ordered free took %lu ms\n", elapsed_ms); - drm_buddy_fini(&mm); + gpu_buddy_fini(&mm); } -static void drm_test_buddy_alloc_range_bias(struct kunit *test) +static void gpu_test_buddy_alloc_range_bias(struct kunit *test) { u32 mm_size, size, ps, bias_size, bias_start, bias_end, bias_rem; - DRM_RND_STATE(prng, random_seed); + GPU_RND_STATE(prng, random_seed); unsigned int i, count, *order; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; unsigned long flags; - struct drm_buddy mm; + struct gpu_buddy mm; LIST_HEAD(allocated); bias_size = SZ_1M; @@ -142,11 +142,11 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) kunit_info(test, "mm_size=%u, ps=%u\n", mm_size, ps); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps), + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, mm_size, ps), "buddy_init failed\n"); count = mm_size / bias_size; - order = drm_random_order(count, &prng); + order = gpu_random_order(count, &prng); KUNIT_EXPECT_TRUE(test, order); /* @@ -166,79 +166,79 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) /* internal round_up too big */ KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, bias_size + ps, bias_size, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", bias_start, bias_end, bias_size, bias_size); /* size too big */ KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, bias_size + ps, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc didn't fail with bias(%x-%x), size=%u, ps=%u\n", bias_start, bias_end, bias_size + ps, ps); /* bias range too small for size */ KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start + ps, + gpu_buddy_alloc_blocks(&mm, bias_start + ps, bias_end, bias_size, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc didn't fail with bias(%x-%x), size=%u, ps=%u\n", bias_start + ps, bias_end, bias_size, ps); /* bias misaligned */ KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start + ps, + gpu_buddy_alloc_blocks(&mm, bias_start + ps, bias_end - ps, bias_size >> 1, bias_size >> 1, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc h didn't fail with bias(%x-%x), size=%u, ps=%u\n", bias_start + ps, bias_end - ps, bias_size >> 1, bias_size >> 1); /* single big page */ KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, bias_size, bias_size, &tmp, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc i failed with bias(%x-%x), size=%u, ps=%u\n", bias_start, bias_end, bias_size, bias_size); - drm_buddy_free_list(&mm, &tmp, 0); + gpu_buddy_free_list(&mm, &tmp, 0); /* single page with internal round_up */ KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, ps, bias_size, &tmp, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", bias_start, bias_end, ps, bias_size); - drm_buddy_free_list(&mm, &tmp, 0); + gpu_buddy_free_list(&mm, &tmp, 0); /* random size within */ size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); if (size) KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, size, ps, &tmp, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", bias_start, bias_end, size, ps); bias_rem -= size; /* too big for current avail */ KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, bias_rem + ps, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc didn't fail with bias(%x-%x), size=%u, ps=%u\n", bias_start, bias_end, bias_rem + ps, ps); @@ -248,10 +248,10 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) size = max(size, ps); KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, size, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", bias_start, bias_end, size, ps); /* @@ -259,15 +259,15 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) * unallocated, and ideally not always on the bias * boundaries. */ - drm_buddy_free_list(&mm, &tmp, 0); + gpu_buddy_free_list(&mm, &tmp, 0); } else { list_splice_tail(&tmp, &allocated); } } kfree(order); - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); + gpu_buddy_free_list(&mm, &allocated, 0); + gpu_buddy_fini(&mm); /* * Something more free-form. Idea is to pick a random starting bias @@ -278,7 +278,7 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) * allocated nodes in the middle of the address space. */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps), + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, mm_size, ps), "buddy_init failed\n"); bias_start = round_up(prandom_u32_state(&prng) % (mm_size - ps), ps); @@ -290,10 +290,10 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) u32 size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, size, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n", bias_start, bias_end, size, ps); bias_rem -= size; @@ -319,24 +319,24 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) KUNIT_ASSERT_EQ(test, bias_start, 0); KUNIT_ASSERT_EQ(test, bias_end, mm_size); KUNIT_ASSERT_TRUE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, bias_end, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, ps, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc passed with bias(%x-%x), size=%u\n", bias_start, bias_end, ps); - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); + gpu_buddy_free_list(&mm, &allocated, 0); + gpu_buddy_fini(&mm); /* - * Allocate cleared blocks in the bias range when the DRM buddy's clear avail is + * Allocate cleared blocks in the bias range when the GPU buddy's clear avail is * zero. This will validate the bias range allocation in scenarios like system boot * when no cleared blocks are available and exercise the fallback path too. The resulting * blocks should always be dirty. */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, ps), + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, mm_size, ps), "buddy_init failed\n"); bias_start = round_up(prandom_u32_state(&prng) % (mm_size - ps), ps); @@ -344,11 +344,11 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) bias_end = max(bias_end, bias_start + ps); bias_rem = bias_end - bias_start; - flags = DRM_BUDDY_CLEAR_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION; + flags = GPU_BUDDY_CLEAR_ALLOCATION | GPU_BUDDY_RANGE_ALLOCATION; size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps); KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, bias_start, + gpu_buddy_alloc_blocks(&mm, bias_start, bias_end, size, ps, &allocated, flags), @@ -356,27 +356,27 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test) bias_start, bias_end, size, ps); list_for_each_entry(block, &allocated, link) - KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); + KUNIT_EXPECT_EQ(test, gpu_buddy_block_is_clear(block), false); - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); + gpu_buddy_free_list(&mm, &allocated, 0); + gpu_buddy_fini(&mm); } -static void drm_test_buddy_alloc_clear(struct kunit *test) +static void gpu_test_buddy_alloc_clear(struct kunit *test) { unsigned long n_pages, total, i = 0; const unsigned long ps = SZ_4K; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; const int max_order = 12; LIST_HEAD(allocated); - struct drm_buddy mm; + struct gpu_buddy mm; unsigned int order; u32 mm_size, size; LIST_HEAD(dirty); LIST_HEAD(clean); mm_size = SZ_4K << max_order; - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); + KUNIT_EXPECT_FALSE(test, gpu_buddy_init(&mm, mm_size, ps)); KUNIT_EXPECT_EQ(test, mm.max_order, max_order); @@ -389,11 +389,11 @@ static void drm_test_buddy_alloc_clear(struct kunit *test) * is indeed all dirty pages and vice versa. Free it all again, * keeping the dirty/clear status. */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, 5 * ps, ps, &allocated, - DRM_BUDDY_TOPDOWN_ALLOCATION), + GPU_BUDDY_TOPDOWN_ALLOCATION), "buddy_alloc hit an error size=%lu\n", 5 * ps); - drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); + gpu_buddy_free_list(&mm, &allocated, GPU_BUDDY_CLEARED); n_pages = 10; do { @@ -406,37 +406,37 @@ static void drm_test_buddy_alloc_clear(struct kunit *test) flags = 0; } else { list = &clean; - flags = DRM_BUDDY_CLEAR_ALLOCATION; + flags = GPU_BUDDY_CLEAR_ALLOCATION; } - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, ps, ps, list, flags), "buddy_alloc hit an error size=%lu\n", ps); } while (++i < n_pages); list_for_each_entry(block, &clean, link) - KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), true); + KUNIT_EXPECT_EQ(test, gpu_buddy_block_is_clear(block), true); list_for_each_entry(block, &dirty, link) - KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); + KUNIT_EXPECT_EQ(test, gpu_buddy_block_is_clear(block), false); - drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED); + gpu_buddy_free_list(&mm, &clean, GPU_BUDDY_CLEARED); /* * Trying to go over the clear limit for some allocation. * The allocation should never fail with reasonable page-size. */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, 10 * ps, ps, &clean, - DRM_BUDDY_CLEAR_ALLOCATION), + GPU_BUDDY_CLEAR_ALLOCATION), "buddy_alloc hit an error size=%lu\n", 10 * ps); - drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED); - drm_buddy_free_list(&mm, &dirty, 0); - drm_buddy_fini(&mm); + gpu_buddy_free_list(&mm, &clean, GPU_BUDDY_CLEARED); + gpu_buddy_free_list(&mm, &dirty, 0); + gpu_buddy_fini(&mm); - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); + KUNIT_EXPECT_FALSE(test, gpu_buddy_init(&mm, mm_size, ps)); /* * Create a new mm. Intentionally fragment the address space by creating @@ -458,34 +458,34 @@ static void drm_test_buddy_alloc_clear(struct kunit *test) else list = &clean; - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, ps, ps, list, 0), "buddy_alloc hit an error size=%lu\n", ps); } while (++i < n_pages); - drm_buddy_free_list(&mm, &clean, DRM_BUDDY_CLEARED); - drm_buddy_free_list(&mm, &dirty, 0); + gpu_buddy_free_list(&mm, &clean, GPU_BUDDY_CLEARED); + gpu_buddy_free_list(&mm, &dirty, 0); order = 1; do { size = SZ_4K << order; - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, size, size, &allocated, - DRM_BUDDY_CLEAR_ALLOCATION), + GPU_BUDDY_CLEAR_ALLOCATION), "buddy_alloc hit an error size=%u\n", size); total = 0; list_for_each_entry(block, &allocated, link) { if (size != mm_size) - KUNIT_EXPECT_EQ(test, drm_buddy_block_is_clear(block), false); - total += drm_buddy_block_size(&mm, block); + KUNIT_EXPECT_EQ(test, gpu_buddy_block_is_clear(block), false); + total += gpu_buddy_block_size(&mm, block); } KUNIT_EXPECT_EQ(test, total, size); - drm_buddy_free_list(&mm, &allocated, 0); + gpu_buddy_free_list(&mm, &allocated, 0); } while (++order <= max_order); - drm_buddy_fini(&mm); + gpu_buddy_fini(&mm); /* * Create a new mm with a non power-of-two size. Allocate a random size from each @@ -494,44 +494,44 @@ static void drm_test_buddy_alloc_clear(struct kunit *test) */ mm_size = (SZ_4K << max_order) + (SZ_4K << (max_order - 2)); - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); + KUNIT_EXPECT_FALSE(test, gpu_buddy_init(&mm, mm_size, ps)); KUNIT_EXPECT_EQ(test, mm.max_order, max_order); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, SZ_4K << max_order, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, SZ_4K << max_order, 4 * ps, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc hit an error size=%lu\n", 4 * ps); - drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, SZ_4K << max_order, + gpu_buddy_free_list(&mm, &allocated, GPU_BUDDY_CLEARED); + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, SZ_4K << max_order, 2 * ps, ps, &allocated, - DRM_BUDDY_CLEAR_ALLOCATION), + GPU_BUDDY_CLEAR_ALLOCATION), "buddy_alloc hit an error size=%lu\n", 2 * ps); - drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, SZ_4K << max_order, mm_size, + gpu_buddy_free_list(&mm, &allocated, GPU_BUDDY_CLEARED); + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, SZ_4K << max_order, mm_size, ps, ps, &allocated, - DRM_BUDDY_RANGE_ALLOCATION), + GPU_BUDDY_RANGE_ALLOCATION), "buddy_alloc hit an error size=%lu\n", ps); - drm_buddy_free_list(&mm, &allocated, DRM_BUDDY_CLEARED); - drm_buddy_fini(&mm); + gpu_buddy_free_list(&mm, &allocated, GPU_BUDDY_CLEARED); + gpu_buddy_fini(&mm); } -static void drm_test_buddy_alloc_contiguous(struct kunit *test) +static void gpu_test_buddy_alloc_contiguous(struct kunit *test) { const unsigned long ps = SZ_4K, mm_size = 16 * 3 * SZ_4K; unsigned long i, n_pages, total; - struct drm_buddy_block *block; - struct drm_buddy mm; + struct gpu_buddy_block *block; + struct gpu_buddy mm; LIST_HEAD(left); LIST_HEAD(middle); LIST_HEAD(right); LIST_HEAD(allocated); - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, mm_size, ps)); + KUNIT_EXPECT_FALSE(test, gpu_buddy_init(&mm, mm_size, ps)); /* * Idea is to fragment the address space by alternating block * allocations between three different lists; one for left, middle and * right. We can then free a list to simulate fragmentation. In - * particular we want to exercise the DRM_BUDDY_CONTIGUOUS_ALLOCATION, + * particular we want to exercise the GPU_BUDDY_CONTIGUOUS_ALLOCATION, * including the try_harder path. */ @@ -548,66 +548,66 @@ static void drm_test_buddy_alloc_contiguous(struct kunit *test) else list = &right; KUNIT_ASSERT_FALSE_MSG(test, - drm_buddy_alloc_blocks(&mm, 0, mm_size, + gpu_buddy_alloc_blocks(&mm, 0, mm_size, ps, ps, list, 0), "buddy_alloc hit an error size=%lu\n", ps); } while (++i < n_pages); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + KUNIT_ASSERT_TRUE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, 3 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), + GPU_BUDDY_CONTIGUOUS_ALLOCATION), "buddy_alloc didn't error size=%lu\n", 3 * ps); - drm_buddy_free_list(&mm, &middle, 0); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + gpu_buddy_free_list(&mm, &middle, 0); + KUNIT_ASSERT_TRUE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, 3 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), + GPU_BUDDY_CONTIGUOUS_ALLOCATION), "buddy_alloc didn't error size=%lu\n", 3 * ps); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + KUNIT_ASSERT_TRUE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, 2 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), + GPU_BUDDY_CONTIGUOUS_ALLOCATION), "buddy_alloc didn't error size=%lu\n", 2 * ps); - drm_buddy_free_list(&mm, &right, 0); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + gpu_buddy_free_list(&mm, &right, 0); + KUNIT_ASSERT_TRUE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, 3 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), + GPU_BUDDY_CONTIGUOUS_ALLOCATION), "buddy_alloc didn't error size=%lu\n", 3 * ps); /* * At this point we should have enough contiguous space for 2 blocks, * however they are never buddies (since we freed middle and right) so * will require the try_harder logic to find them. */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, 2 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), + GPU_BUDDY_CONTIGUOUS_ALLOCATION), "buddy_alloc hit an error size=%lu\n", 2 * ps); - drm_buddy_free_list(&mm, &left, 0); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, + gpu_buddy_free_list(&mm, &left, 0); + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, 3 * ps, ps, &allocated, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), + GPU_BUDDY_CONTIGUOUS_ALLOCATION), "buddy_alloc hit an error size=%lu\n", 3 * ps); total = 0; list_for_each_entry(block, &allocated, link) - total += drm_buddy_block_size(&mm, block); + total += gpu_buddy_block_size(&mm, block); KUNIT_ASSERT_EQ(test, total, ps * 2 + ps * 3); - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); + gpu_buddy_free_list(&mm, &allocated, 0); + gpu_buddy_fini(&mm); } -static void drm_test_buddy_alloc_pathological(struct kunit *test) +static void gpu_test_buddy_alloc_pathological(struct kunit *test) { u64 mm_size, size, start = 0; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; const int max_order = 3; unsigned long flags = 0; int order, top; - struct drm_buddy mm; + struct gpu_buddy mm; LIST_HEAD(blocks); LIST_HEAD(holes); LIST_HEAD(tmp); @@ -620,7 +620,7 @@ static void drm_test_buddy_alloc_pathological(struct kunit *test) */ mm_size = SZ_4K << max_order; - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, mm_size, SZ_4K), "buddy_init failed\n"); KUNIT_EXPECT_EQ(test, mm.max_order, max_order); @@ -630,18 +630,18 @@ static void drm_test_buddy_alloc_pathological(struct kunit *test) block = list_first_entry_or_null(&blocks, typeof(*block), link); if (block) { list_del(&block->link); - drm_buddy_free_block(&mm, block); + gpu_buddy_free_block(&mm, block); } for (order = top; order--;) { size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM with order=%d, top=%d\n", order, top); - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + block = list_first_entry_or_null(&tmp, struct gpu_buddy_block, link); KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); list_move_tail(&block->link, &blocks); @@ -649,45 +649,45 @@ static void drm_test_buddy_alloc_pathological(struct kunit *test) /* There should be one final page for this sub-allocation */ size = get_size(0, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM for hole\n"); - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + block = list_first_entry_or_null(&tmp, struct gpu_buddy_block, link); KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); list_move_tail(&block->link, &holes); size = get_size(top, mm.chunk_size); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_TRUE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc unexpectedly succeeded at top-order %d/%d, it should be full!", top, max_order); } - drm_buddy_free_list(&mm, &holes, 0); + gpu_buddy_free_list(&mm, &holes, 0); /* Nothing larger than blocks of chunk_size now available */ for (order = 1; order <= max_order; order++) { size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_TRUE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc unexpectedly succeeded at order %d, it should be full!", order); } list_splice_tail(&holes, &blocks); - drm_buddy_free_list(&mm, &blocks, 0); - drm_buddy_fini(&mm); + gpu_buddy_free_list(&mm, &blocks, 0); + gpu_buddy_fini(&mm); } -static void drm_test_buddy_alloc_pessimistic(struct kunit *test) +static void gpu_test_buddy_alloc_pessimistic(struct kunit *test) { u64 mm_size, size, start = 0; - struct drm_buddy_block *block, *bn; + struct gpu_buddy_block *block, *bn; const unsigned int max_order = 16; unsigned long flags = 0; - struct drm_buddy mm; + struct gpu_buddy mm; unsigned int order; LIST_HEAD(blocks); LIST_HEAD(tmp); @@ -699,19 +699,19 @@ static void drm_test_buddy_alloc_pessimistic(struct kunit *test) */ mm_size = SZ_4K << max_order; - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, mm_size, SZ_4K), "buddy_init failed\n"); KUNIT_EXPECT_EQ(test, mm.max_order, max_order); for (order = 0; order < max_order; order++) { size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM with order=%d\n", order); - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + block = list_first_entry_or_null(&tmp, struct gpu_buddy_block, link); KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); list_move_tail(&block->link, &blocks); @@ -719,11 +719,11 @@ static void drm_test_buddy_alloc_pessimistic(struct kunit *test) /* And now the last remaining block available */ size = get_size(0, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM on final alloc\n"); - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + block = list_first_entry_or_null(&tmp, struct gpu_buddy_block, link); KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); list_move_tail(&block->link, &blocks); @@ -731,58 +731,58 @@ static void drm_test_buddy_alloc_pessimistic(struct kunit *test) /* Should be completely full! */ for (order = max_order; order--;) { size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_TRUE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc unexpectedly succeeded, it should be full!"); } block = list_last_entry(&blocks, typeof(*block), link); list_del(&block->link); - drm_buddy_free_block(&mm, block); + gpu_buddy_free_block(&mm, block); /* As we free in increasing size, we make available larger blocks */ order = 1; list_for_each_entry_safe(block, bn, &blocks, link) { list_del(&block->link); - drm_buddy_free_block(&mm, block); + gpu_buddy_free_block(&mm, block); size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM with order=%d\n", order); - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + block = list_first_entry_or_null(&tmp, struct gpu_buddy_block, link); KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); list_del(&block->link); - drm_buddy_free_block(&mm, block); + gpu_buddy_free_block(&mm, block); order++; } /* To confirm, now the whole mm should be available */ size = get_size(max_order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc (realloc) hit -ENOMEM with order=%d\n", max_order); - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + block = list_first_entry_or_null(&tmp, struct gpu_buddy_block, link); KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); list_del(&block->link); - drm_buddy_free_block(&mm, block); - drm_buddy_free_list(&mm, &blocks, 0); - drm_buddy_fini(&mm); + gpu_buddy_free_block(&mm, block); + gpu_buddy_free_list(&mm, &blocks, 0); + gpu_buddy_fini(&mm); } -static void drm_test_buddy_alloc_optimistic(struct kunit *test) +static void gpu_test_buddy_alloc_optimistic(struct kunit *test) { u64 mm_size, size, start = 0; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; unsigned long flags = 0; const int max_order = 16; - struct drm_buddy mm; + struct gpu_buddy mm; LIST_HEAD(blocks); LIST_HEAD(tmp); int order; @@ -794,19 +794,19 @@ static void drm_test_buddy_alloc_optimistic(struct kunit *test) mm_size = SZ_4K * ((1 << (max_order + 1)) - 1); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, mm_size, SZ_4K), "buddy_init failed\n"); KUNIT_EXPECT_EQ(test, mm.max_order, max_order); for (order = 0; order <= max_order; order++) { size = get_size(order, mm.chunk_size); - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc hit -ENOMEM with order=%d\n", order); - block = list_first_entry_or_null(&tmp, struct drm_buddy_block, link); + block = list_first_entry_or_null(&tmp, struct gpu_buddy_block, link); KUNIT_ASSERT_TRUE_MSG(test, block, "alloc_blocks has no blocks\n"); list_move_tail(&block->link, &blocks); @@ -814,115 +814,115 @@ static void drm_test_buddy_alloc_optimistic(struct kunit *test) /* Should be completely full! */ size = get_size(0, mm.chunk_size); - KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, start, mm_size, + KUNIT_ASSERT_TRUE_MSG(test, gpu_buddy_alloc_blocks(&mm, start, mm_size, size, size, &tmp, flags), "buddy_alloc unexpectedly succeeded, it should be full!"); - drm_buddy_free_list(&mm, &blocks, 0); - drm_buddy_fini(&mm); + gpu_buddy_free_list(&mm, &blocks, 0); + gpu_buddy_fini(&mm); } -static void drm_test_buddy_alloc_limit(struct kunit *test) +static void gpu_test_buddy_alloc_limit(struct kunit *test) { u64 size = U64_MAX, start = 0; - struct drm_buddy_block *block; + struct gpu_buddy_block *block; unsigned long flags = 0; LIST_HEAD(allocated); - struct drm_buddy mm; + struct gpu_buddy mm; - KUNIT_EXPECT_FALSE(test, drm_buddy_init(&mm, size, SZ_4K)); + KUNIT_EXPECT_FALSE(test, gpu_buddy_init(&mm, size, SZ_4K)); - KUNIT_EXPECT_EQ_MSG(test, mm.max_order, DRM_BUDDY_MAX_ORDER, + KUNIT_EXPECT_EQ_MSG(test, mm.max_order, GPU_BUDDY_MAX_ORDER, "mm.max_order(%d) != %d\n", mm.max_order, - DRM_BUDDY_MAX_ORDER); + GPU_BUDDY_MAX_ORDER); size = mm.chunk_size << mm.max_order; - KUNIT_EXPECT_FALSE(test, drm_buddy_alloc_blocks(&mm, start, size, size, + KUNIT_EXPECT_FALSE(test, gpu_buddy_alloc_blocks(&mm, start, size, size, mm.chunk_size, &allocated, flags)); - block = list_first_entry_or_null(&allocated, struct drm_buddy_block, link); + block = list_first_entry_or_null(&allocated, struct gpu_buddy_block, link); KUNIT_EXPECT_TRUE(test, block); - KUNIT_EXPECT_EQ_MSG(test, drm_buddy_block_order(block), mm.max_order, + KUNIT_EXPECT_EQ_MSG(test, gpu_buddy_block_order(block), mm.max_order, "block order(%d) != %d\n", - drm_buddy_block_order(block), mm.max_order); + gpu_buddy_block_order(block), mm.max_order); - KUNIT_EXPECT_EQ_MSG(test, drm_buddy_block_size(&mm, block), + KUNIT_EXPECT_EQ_MSG(test, gpu_buddy_block_size(&mm, block), BIT_ULL(mm.max_order) * mm.chunk_size, "block size(%llu) != %llu\n", - drm_buddy_block_size(&mm, block), + gpu_buddy_block_size(&mm, block), BIT_ULL(mm.max_order) * mm.chunk_size); - drm_buddy_free_list(&mm, &allocated, 0); - drm_buddy_fini(&mm); + gpu_buddy_free_list(&mm, &allocated, 0); + gpu_buddy_fini(&mm); } -static void drm_test_buddy_alloc_exceeds_max_order(struct kunit *test) +static void gpu_test_buddy_alloc_exceeds_max_order(struct kunit *test) { u64 mm_size = SZ_8G + SZ_2G, size = SZ_8G + SZ_1G, min_block_size = SZ_8G; - struct drm_buddy mm; + struct gpu_buddy mm; LIST_HEAD(blocks); int err; - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_init(&mm, mm_size, SZ_4K), + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_init(&mm, mm_size, SZ_4K), "buddy_init failed\n"); /* CONTIGUOUS allocation should succeed via try_harder fallback */ - KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size, size, + KUNIT_ASSERT_FALSE_MSG(test, gpu_buddy_alloc_blocks(&mm, 0, mm_size, size, SZ_4K, &blocks, - DRM_BUDDY_CONTIGUOUS_ALLOCATION), + GPU_BUDDY_CONTIGUOUS_ALLOCATION), "buddy_alloc hit an error size=%llu\n", size); - drm_buddy_free_list(&mm, &blocks, 0); + gpu_buddy_free_list(&mm, &blocks, 0); /* Non-CONTIGUOUS with large min_block_size should return -EINVAL */ - err = drm_buddy_alloc_blocks(&mm, 0, mm_size, size, min_block_size, &blocks, 0); + err = gpu_buddy_alloc_blocks(&mm, 0, mm_size, size, min_block_size, &blocks, 0); KUNIT_EXPECT_EQ(test, err, -EINVAL); /* Non-CONTIGUOUS + RANGE with large min_block_size should return -EINVAL */ - err = drm_buddy_alloc_blocks(&mm, 0, mm_size, size, min_block_size, &blocks, - DRM_BUDDY_RANGE_ALLOCATION); + err = gpu_buddy_alloc_blocks(&mm, 0, mm_size, size, min_block_size, &blocks, + GPU_BUDDY_RANGE_ALLOCATION); KUNIT_EXPECT_EQ(test, err, -EINVAL); /* CONTIGUOUS + RANGE should return -EINVAL (no try_harder for RANGE) */ - err = drm_buddy_alloc_blocks(&mm, 0, mm_size, size, SZ_4K, &blocks, - DRM_BUDDY_CONTIGUOUS_ALLOCATION | DRM_BUDDY_RANGE_ALLOCATION); + err = gpu_buddy_alloc_blocks(&mm, 0, mm_size, size, SZ_4K, &blocks, + GPU_BUDDY_CONTIGUOUS_ALLOCATION | GPU_BUDDY_RANGE_ALLOCATION); KUNIT_EXPECT_EQ(test, err, -EINVAL); - drm_buddy_fini(&mm); + gpu_buddy_fini(&mm); } -static int drm_buddy_suite_init(struct kunit_suite *suite) +static int gpu_buddy_suite_init(struct kunit_suite *suite) { while (!random_seed) random_seed = get_random_u32(); - kunit_info(suite, "Testing DRM buddy manager, with random_seed=0x%x\n", + kunit_info(suite, "Testing GPU buddy manager, with random_seed=0x%x\n", random_seed); return 0; } -static struct kunit_case drm_buddy_tests[] = { - KUNIT_CASE(drm_test_buddy_alloc_limit), - KUNIT_CASE(drm_test_buddy_alloc_optimistic), - KUNIT_CASE(drm_test_buddy_alloc_pessimistic), - KUNIT_CASE(drm_test_buddy_alloc_pathological), - KUNIT_CASE(drm_test_buddy_alloc_contiguous), - KUNIT_CASE(drm_test_buddy_alloc_clear), - KUNIT_CASE(drm_test_buddy_alloc_range_bias), - KUNIT_CASE(drm_test_buddy_fragmentation_performance), - KUNIT_CASE(drm_test_buddy_alloc_exceeds_max_order), +static struct kunit_case gpu_buddy_tests[] = { + KUNIT_CASE(gpu_test_buddy_alloc_limit), + KUNIT_CASE(gpu_test_buddy_alloc_optimistic), + KUNIT_CASE(gpu_test_buddy_alloc_pessimistic), + KUNIT_CASE(gpu_test_buddy_alloc_pathological), + KUNIT_CASE(gpu_test_buddy_alloc_contiguous), + KUNIT_CASE(gpu_test_buddy_alloc_clear), + KUNIT_CASE(gpu_test_buddy_alloc_range_bias), + KUNIT_CASE(gpu_test_buddy_fragmentation_performance), + KUNIT_CASE(gpu_test_buddy_alloc_exceeds_max_order), {} }; -static struct kunit_suite drm_buddy_test_suite = { - .name = "drm_buddy", - .suite_init = drm_buddy_suite_init, - .test_cases = drm_buddy_tests, +static struct kunit_suite gpu_buddy_test_suite = { + .name = "gpu_buddy", + .suite_init = gpu_buddy_suite_init, + .test_cases = gpu_buddy_tests, }; -kunit_test_suite(drm_buddy_test_suite); +kunit_test_suite(gpu_buddy_test_suite); MODULE_AUTHOR("Intel Corporation"); -MODULE_DESCRIPTION("Kunit test for drm_buddy functions"); +MODULE_DESCRIPTION("Kunit test for gpu_buddy functions"); MODULE_LICENSE("GPL"); diff --git a/drivers/gpu/tests/gpu_random.c b/drivers/gpu/tests/gpu_random.c index ddd1f594b5d5..6356372f7e52 100644 --- a/drivers/gpu/tests/gpu_random.c +++ b/drivers/gpu/tests/gpu_random.c @@ -8,26 +8,26 @@ #include "gpu_random.h" -u32 drm_prandom_u32_max_state(u32 ep_ro, struct rnd_state *state) +u32 gpu_prandom_u32_max_state(u32 ep_ro, struct rnd_state *state) { return upper_32_bits((u64)prandom_u32_state(state) * ep_ro); } -EXPORT_SYMBOL(drm_prandom_u32_max_state); +EXPORT_SYMBOL(gpu_prandom_u32_max_state); -void drm_random_reorder(unsigned int *order, unsigned int count, +void gpu_random_reorder(unsigned int *order, unsigned int count, struct rnd_state *state) { unsigned int i, j; for (i = 0; i < count; ++i) { BUILD_BUG_ON(sizeof(unsigned int) > sizeof(u32)); - j = drm_prandom_u32_max_state(count, state); + j = gpu_prandom_u32_max_state(count, state); swap(order[i], order[j]); } } -EXPORT_SYMBOL(drm_random_reorder); +EXPORT_SYMBOL(gpu_random_reorder); -unsigned int *drm_random_order(unsigned int count, struct rnd_state *state) +unsigned int *gpu_random_order(unsigned int count, struct rnd_state *state) { unsigned int *order, i; @@ -38,7 +38,7 @@ unsigned int *drm_random_order(unsigned int count, struct rnd_state *state) for (i = 0; i < count; i++) order[i] = i; - drm_random_reorder(order, count, state); + gpu_random_reorder(order, count, state); return order; } -EXPORT_SYMBOL(drm_random_order); +EXPORT_SYMBOL(gpu_random_order); diff --git a/drivers/gpu/tests/gpu_random.h b/drivers/gpu/tests/gpu_random.h index 9f827260a89d..b68cf3448264 100644 --- a/drivers/gpu/tests/gpu_random.h +++ b/drivers/gpu/tests/gpu_random.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __DRM_RANDOM_H__ -#define __DRM_RANDOM_H__ +#ifndef __GPU_RANDOM_H__ +#define __GPU_RANDOM_H__ /* This is a temporary home for a couple of utility functions that should * be transposed to lib/ at the earliest convenience. @@ -8,21 +8,21 @@ #include -#define DRM_RND_STATE_INITIALIZER(seed__) ({ \ +#define GPU_RND_STATE_INITIALIZER(seed__) ({ \ struct rnd_state state__; \ prandom_seed_state(&state__, (seed__)); \ state__; \ }) -#define DRM_RND_STATE(name__, seed__) \ - struct rnd_state name__ = DRM_RND_STATE_INITIALIZER(seed__) +#define GPU_RND_STATE(name__, seed__) \ + struct rnd_state name__ = GPU_RND_STATE_INITIALIZER(seed__) -unsigned int *drm_random_order(unsigned int count, +unsigned int *gpu_random_order(unsigned int count, struct rnd_state *state); -void drm_random_reorder(unsigned int *order, +void gpu_random_reorder(unsigned int *order, unsigned int count, struct rnd_state *state); -u32 drm_prandom_u32_max_state(u32 ep_ro, +u32 gpu_prandom_u32_max_state(u32 ep_ro, struct rnd_state *state); -#endif /* !__DRM_RANDOM_H__ */ +#endif /* !__GPU_RANDOM_H__ */ diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig index d51777df12d1..0adb1e2fa533 100644 --- a/drivers/video/Kconfig +++ b/drivers/video/Kconfig @@ -37,6 +37,7 @@ source "drivers/char/agp/Kconfig" source "drivers/gpu/vga/Kconfig" +source "drivers/gpu/Kconfig" source "drivers/gpu/host1x/Kconfig" source "drivers/gpu/ipu-v3/Kconfig" source "drivers/gpu/nova-core/Kconfig" diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h new file mode 100644 index 000000000000..3054369bebff --- /dev/null +++ b/include/drm/drm_buddy.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef __DRM_BUDDY_H__ +#define __DRM_BUDDY_H__ + +#include + +struct drm_printer; + +/* DRM-specific GPU Buddy Allocator print helpers */ +void drm_buddy_print(struct gpu_buddy *mm, struct drm_printer *p); +void drm_buddy_block_print(struct gpu_buddy *mm, + struct gpu_buddy_block *block, + struct drm_printer *p); +#endif diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h index b909fa8f810a..07ac65db6d2e 100644 --- a/include/linux/gpu_buddy.h +++ b/include/linux/gpu_buddy.h @@ -3,8 +3,8 @@ * Copyright © 2021 Intel Corporation */ -#ifndef __DRM_BUDDY_H__ -#define __DRM_BUDDY_H__ +#ifndef __GPU_BUDDY_H__ +#define __GPU_BUDDY_H__ #include #include @@ -12,38 +12,45 @@ #include #include -struct drm_printer; - -#define DRM_BUDDY_RANGE_ALLOCATION BIT(0) -#define DRM_BUDDY_TOPDOWN_ALLOCATION BIT(1) -#define DRM_BUDDY_CONTIGUOUS_ALLOCATION BIT(2) -#define DRM_BUDDY_CLEAR_ALLOCATION BIT(3) -#define DRM_BUDDY_CLEARED BIT(4) -#define DRM_BUDDY_TRIM_DISABLE BIT(5) - -struct drm_buddy_block { -#define DRM_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12) -#define DRM_BUDDY_HEADER_STATE GENMASK_ULL(11, 10) -#define DRM_BUDDY_ALLOCATED (1 << 10) -#define DRM_BUDDY_FREE (2 << 10) -#define DRM_BUDDY_SPLIT (3 << 10) -#define DRM_BUDDY_HEADER_CLEAR GENMASK_ULL(9, 9) +#define GPU_BUDDY_RANGE_ALLOCATION BIT(0) +#define GPU_BUDDY_TOPDOWN_ALLOCATION BIT(1) +#define GPU_BUDDY_CONTIGUOUS_ALLOCATION BIT(2) +#define GPU_BUDDY_CLEAR_ALLOCATION BIT(3) +#define GPU_BUDDY_CLEARED BIT(4) +#define GPU_BUDDY_TRIM_DISABLE BIT(5) + +enum gpu_buddy_free_tree { + GPU_BUDDY_CLEAR_TREE = 0, + GPU_BUDDY_DIRTY_TREE, + GPU_BUDDY_MAX_FREE_TREES, +}; + +#define for_each_free_tree(tree) \ + for ((tree) = 0; (tree) < GPU_BUDDY_MAX_FREE_TREES; (tree)++) + +struct gpu_buddy_block { +#define GPU_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12) +#define GPU_BUDDY_HEADER_STATE GENMASK_ULL(11, 10) +#define GPU_BUDDY_ALLOCATED (1 << 10) +#define GPU_BUDDY_FREE (2 << 10) +#define GPU_BUDDY_SPLIT (3 << 10) +#define GPU_BUDDY_HEADER_CLEAR GENMASK_ULL(9, 9) /* Free to be used, if needed in the future */ -#define DRM_BUDDY_HEADER_UNUSED GENMASK_ULL(8, 6) -#define DRM_BUDDY_HEADER_ORDER GENMASK_ULL(5, 0) +#define GPU_BUDDY_HEADER_UNUSED GENMASK_ULL(8, 6) +#define GPU_BUDDY_HEADER_ORDER GENMASK_ULL(5, 0) u64 header; - struct drm_buddy_block *left; - struct drm_buddy_block *right; - struct drm_buddy_block *parent; + struct gpu_buddy_block *left; + struct gpu_buddy_block *right; + struct gpu_buddy_block *parent; void *private; /* owned by creator */ /* - * While the block is allocated by the user through drm_buddy_alloc*, + * While the block is allocated by the user through gpu_buddy_alloc*, * the user has ownership of the link, for example to maintain within * a list, if so desired. As soon as the block is freed with - * drm_buddy_free* ownership is given back to the mm. + * gpu_buddy_free* ownership is given back to the mm. */ union { struct rb_node rb; @@ -54,15 +61,15 @@ struct drm_buddy_block { }; /* Order-zero must be at least SZ_4K */ -#define DRM_BUDDY_MAX_ORDER (63 - 12) +#define GPU_BUDDY_MAX_ORDER (63 - 12) /* * Binary Buddy System. * * Locking should be handled by the user, a simple mutex around - * drm_buddy_alloc* and drm_buddy_free* should suffice. + * gpu_buddy_alloc* and gpu_buddy_free* should suffice. */ -struct drm_buddy { +struct gpu_buddy { /* Maintain a free list for each order. */ struct rb_root **free_trees; @@ -73,7 +80,7 @@ struct drm_buddy { * block. Nodes are either allocated or free, in which case they will * also exist on the respective free list. */ - struct drm_buddy_block **roots; + struct gpu_buddy_block **roots; /* * Anything from here is public, and remains static for the lifetime of @@ -90,82 +97,81 @@ struct drm_buddy { }; static inline u64 -drm_buddy_block_offset(const struct drm_buddy_block *block) +gpu_buddy_block_offset(const struct gpu_buddy_block *block) { - return block->header & DRM_BUDDY_HEADER_OFFSET; + return block->header & GPU_BUDDY_HEADER_OFFSET; } static inline unsigned int -drm_buddy_block_order(struct drm_buddy_block *block) +gpu_buddy_block_order(struct gpu_buddy_block *block) { - return block->header & DRM_BUDDY_HEADER_ORDER; + return block->header & GPU_BUDDY_HEADER_ORDER; } static inline unsigned int -drm_buddy_block_state(struct drm_buddy_block *block) +gpu_buddy_block_state(struct gpu_buddy_block *block) { - return block->header & DRM_BUDDY_HEADER_STATE; + return block->header & GPU_BUDDY_HEADER_STATE; } static inline bool -drm_buddy_block_is_allocated(struct drm_buddy_block *block) +gpu_buddy_block_is_allocated(struct gpu_buddy_block *block) { - return drm_buddy_block_state(block) == DRM_BUDDY_ALLOCATED; + return gpu_buddy_block_state(block) == GPU_BUDDY_ALLOCATED; } static inline bool -drm_buddy_block_is_clear(struct drm_buddy_block *block) +gpu_buddy_block_is_clear(struct gpu_buddy_block *block) { - return block->header & DRM_BUDDY_HEADER_CLEAR; + return block->header & GPU_BUDDY_HEADER_CLEAR; } static inline bool -drm_buddy_block_is_free(struct drm_buddy_block *block) +gpu_buddy_block_is_free(struct gpu_buddy_block *block) { - return drm_buddy_block_state(block) == DRM_BUDDY_FREE; + return gpu_buddy_block_state(block) == GPU_BUDDY_FREE; } static inline bool -drm_buddy_block_is_split(struct drm_buddy_block *block) +gpu_buddy_block_is_split(struct gpu_buddy_block *block) { - return drm_buddy_block_state(block) == DRM_BUDDY_SPLIT; + return gpu_buddy_block_state(block) == GPU_BUDDY_SPLIT; } static inline u64 -drm_buddy_block_size(struct drm_buddy *mm, - struct drm_buddy_block *block) +gpu_buddy_block_size(struct gpu_buddy *mm, + struct gpu_buddy_block *block) { - return mm->chunk_size << drm_buddy_block_order(block); + return mm->chunk_size << gpu_buddy_block_order(block); } -int drm_buddy_init(struct drm_buddy *mm, u64 size, u64 chunk_size); +int gpu_buddy_init(struct gpu_buddy *mm, u64 size, u64 chunk_size); -void drm_buddy_fini(struct drm_buddy *mm); +void gpu_buddy_fini(struct gpu_buddy *mm); -struct drm_buddy_block * -drm_get_buddy(struct drm_buddy_block *block); +struct gpu_buddy_block * +gpu_get_buddy(struct gpu_buddy_block *block); -int drm_buddy_alloc_blocks(struct drm_buddy *mm, +int gpu_buddy_alloc_blocks(struct gpu_buddy *mm, u64 start, u64 end, u64 size, u64 min_page_size, struct list_head *blocks, unsigned long flags); -int drm_buddy_block_trim(struct drm_buddy *mm, +int gpu_buddy_block_trim(struct gpu_buddy *mm, u64 *start, u64 new_size, struct list_head *blocks); -void drm_buddy_reset_clear(struct drm_buddy *mm, bool is_clear); +void gpu_buddy_reset_clear(struct gpu_buddy *mm, bool is_clear); -void drm_buddy_free_block(struct drm_buddy *mm, struct drm_buddy_block *block); +void gpu_buddy_free_block(struct gpu_buddy *mm, struct gpu_buddy_block *block); -void drm_buddy_free_list(struct drm_buddy *mm, +void gpu_buddy_free_list(struct gpu_buddy *mm, struct list_head *objects, unsigned int flags); -void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p); -void drm_buddy_block_print(struct drm_buddy *mm, - struct drm_buddy_block *block, - struct drm_printer *p); +void gpu_buddy_print(struct gpu_buddy *mm); +void gpu_buddy_block_print(struct gpu_buddy *mm, + struct gpu_buddy_block *block); #endif -- cgit v1.2.3 From a69d1ab971a624c6f112cea61536569d579c3215 Mon Sep 17 00:00:00 2001 From: Thomas Hellström Date: Tue, 10 Feb 2026 12:56:53 +0100 Subject: mm: Fix a hmm_range_fault() livelock / starvation problem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If hmm_range_fault() fails a folio_trylock() in do_swap_page, trying to acquire the lock of a device-private folio for migration, to ram, the function will spin until it succeeds grabbing the lock. However, if the process holding the lock is depending on a work item to be completed, which is scheduled on the same CPU as the spinning hmm_range_fault(), that work item might be starved and we end up in a livelock / starvation situation which is never resolved. This can happen, for example if the process holding the device-private folio lock is stuck in migrate_device_unmap()->lru_add_drain_all() sinc lru_add_drain_all() requires a short work-item to be run on all online cpus to complete. A prerequisite for this to happen is: a) Both zone device and system memory folios are considered in migrate_device_unmap(), so that there is a reason to call lru_add_drain_all() for a system memory folio while a folio lock is held on a zone device folio. b) The zone device folio has an initial mapcount > 1 which causes at least one migration PTE entry insertion to be deferred to try_to_migrate(), which can happen after the call to lru_add_drain_all(). c) No or voluntary only preemption. This all seems pretty unlikely to happen, but indeed is hit by the "xe_exec_system_allocator" igt test. Resolve this by waiting for the folio to be unlocked if the folio_trylock() fails in do_swap_page(). Rename migration_entry_wait_on_locked() to softleaf_entry_wait_unlock() and update its documentation to indicate the new use-case. Future code improvements might consider moving the lru_add_drain_all() call in migrate_device_unmap() to be called *after* all pages have migration entries inserted. That would eliminate also b) above. v2: - Instead of a cond_resched() in hmm_range_fault(), eliminate the problem by waiting for the folio to be unlocked in do_swap_page() (Alistair Popple, Andrew Morton) v3: - Add a stub migration_entry_wait_on_locked() for the !CONFIG_MIGRATION case. (Kernel Test Robot) v4: - Rename migrate_entry_wait_on_locked() to softleaf_entry_wait_on_locked() and update docs (Alistair Popple) v5: - Add a WARN_ON_ONCE() for the !CONFIG_MIGRATION version of softleaf_entry_wait_on_locked(). - Modify wording around function names in the commit message (Andrew Morton) Suggested-by: Alistair Popple Fixes: 1afaeb8293c9 ("mm/migrate: Trylock device page in do_swap_page") Cc: Ralph Campbell Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Andrew Morton Cc: Matthew Brost Cc: John Hubbard Cc: Alistair Popple Cc: linux-mm@kvack.org Cc: Signed-off-by: Thomas Hellström Cc: # v6.15+ Reviewed-by: John Hubbard #v3 Reviewed-by: Alistair Popple Link: https://patch.msgid.link/20260210115653.92413-1-thomas.hellstrom@linux.intel.com --- include/linux/migrate.h | 10 +++++++++- mm/filemap.c | 15 ++++++++++----- mm/memory.c | 3 ++- mm/migrate.c | 8 ++++---- mm/migrate_device.c | 2 +- 5 files changed, 26 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 26ca00c325d9..d5af2b7f577b 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -65,7 +65,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); -void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) +void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, @@ -97,6 +97,14 @@ static inline int set_movable_ops(const struct movable_operations *ops, enum pag return -ENOSYS; } +static inline void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) + __releases(ptl) +{ + WARN_ON_ONCE(1); + + spin_unlock(ptl); +} + #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_NUMA_BALANCING diff --git a/mm/filemap.c b/mm/filemap.c index ebd75684cb0a..d98e4883f13d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1379,14 +1379,16 @@ repeat: #ifdef CONFIG_MIGRATION /** - * migration_entry_wait_on_locked - Wait for a migration entry to be removed - * @entry: migration swap entry. + * softleaf_entry_wait_on_locked - Wait for a migration entry or + * device_private entry to be removed. + * @entry: migration or device_private swap entry. * @ptl: already locked ptl. This function will drop the lock. * - * Wait for a migration entry referencing the given page to be removed. This is + * Wait for a migration entry referencing the given page, or device_private + * entry referencing a dvice_private page to be unlocked. This is * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except * this can be called without taking a reference on the page. Instead this - * should be called while holding the ptl for the migration entry referencing + * should be called while holding the ptl for @entry referencing * the page. * * Returns after unlocking the ptl. @@ -1394,7 +1396,7 @@ repeat: * This follows the same logic as folio_wait_bit_common() so see the comments * there. */ -void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) +void softleaf_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl) { struct wait_page_queue wait_page; @@ -1428,6 +1430,9 @@ void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) * If a migration entry exists for the page the migration path must hold * a valid reference to the page, and it must take the ptl to remove the * migration entry. So the page is valid until the ptl is dropped. + * Similarly any path attempting to drop the last reference to a + * device-private page needs to grab the ptl to remove the device-private + * entry. */ spin_unlock(ptl); diff --git a/mm/memory.c b/mm/memory.c index 2a55edc48a65..0ad50df25846 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4681,7 +4681,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) unlock_page(vmf->page); put_page(vmf->page); } else { - pte_unmap_unlock(vmf->pte, vmf->ptl); + pte_unmap(vmf->pte); + softleaf_entry_wait_on_locked(entry, vmf->ptl); } } else if (softleaf_is_hwpoison(entry)) { ret = VM_FAULT_HWPOISON; diff --git a/mm/migrate.c b/mm/migrate.c index 5169f9717f60..75e384b042ef 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -499,7 +499,7 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, if (!softleaf_is_migration(entry)) goto out; - migration_entry_wait_on_locked(entry, ptl); + softleaf_entry_wait_on_locked(entry, ptl); return; out: spin_unlock(ptl); @@ -531,10 +531,10 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, p * If migration entry existed, safe to release vma lock * here because the pgtable page won't be freed without the * pgtable lock released. See comment right above pgtable - * lock release in migration_entry_wait_on_locked(). + * lock release in softleaf_entry_wait_on_locked(). */ hugetlb_vma_unlock_read(vma); - migration_entry_wait_on_locked(entry, ptl); + softleaf_entry_wait_on_locked(entry, ptl); return; } @@ -552,7 +552,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) ptl = pmd_lock(mm, pmd); if (!pmd_is_migration_entry(*pmd)) goto unlock; - migration_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl); + softleaf_entry_wait_on_locked(softleaf_from_pmd(*pmd), ptl); return; unlock: spin_unlock(ptl); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 23379663b1e1..deab89fd4541 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -176,7 +176,7 @@ static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start, } if (softleaf_is_migration(entry)) { - migration_entry_wait_on_locked(entry, ptl); + softleaf_entry_wait_on_locked(entry, ptl); spin_unlock(ptl); return -EAGAIN; } -- cgit v1.2.3 From 022ac075088366b62e130da5e1b200bc93a47191 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 12 Feb 2026 13:34:22 -0800 Subject: bpf: use reg->var_off instead of reg->off for pointers This commit consolidates static and varying pointer offset tracking logic. All offsets are now represented solely using `.var_off` and min/max fields. The reasons are twofold: - This simplifies pointer tracking code, as each relevant function needs to check the `.var_off` field anyway. - It makes it easier to widen pointer registers for the purpose of loop convergence checks, by forgoing the `regsafe()` logic demanding `.off` fields to be identical. The changes are spread across many functions and are hard to group into smaller patches. Some of the logical changes include: - Checks in __check_ptr_off_reg() are reordered so that the tnum_is_const() check is done before operating on reg->var_off.value. - check_packet_access() now uses check_mem_region_access() to handle possible 'off' overflow cases. - In check_helper_mem_access() utility functions like check_packet_access() are now called with 'off=0', as these utility functions now account for the complete register offset range. - In check_reg_type() a call to __check_ptr_off_reg() is added before a call to btf_struct_ids_match(). This prevents btf_struct_ids_match() from potentially working on non-constant reg->var_off.value. - regsafe() is relaxed to avoid comparing '.off' field for pointers. As a precaution, the changes are verified in [1] by adding a pass checking that no pointer has non-zero '.off' field on each do_check_insn() iteration. [1] https://github.com/eddyz87/bpf/tree/ptrs-off-migration Notable selftests changes: - `.var_off` value changed because it now combines static and varying offsets. Affected tests: - linked_list/incorrect_node_var_off - linked_list/incorrect_head_var_off2 - verifier_align/packet_variable_offset - Overflowing `smax_value` bound leads to a pointer with big negative or positive offset to be rejected immediately (previously overflowing `rX += const` instruction updated `.off` field avoiding the overflow). Affected tests: - verifier_align/dubious_pointer_arithmetic - verifier_bounds/var_off_insn_off_test1 - Invalid access to packet now reports full offset inside a packet. Affected tests: - verifier_direct_packet_access/test23_x_pkt_ptr_4 - A change in check_mem_region_access() behavior: when register `.smin_value` is negative, it reports "rX min value is negative..." before calling into __check_mem_access() which reports "invalid access to ...". In the tests below, the `.off` field was negative, while `.smin_value` remained positive. This is no longer the case after the changes in this commit. Affected tests: - verifier_gotox/jump_table_invalid_mem_acceess_neg - verifier_helper_packet_access/test15_cls_helper_fail_sub - verifier_helper_value_access/imm_out_of_bound_2 - verifier_helper_value_access/reg_out_of_bound_2 - verifier_meta_access/meta_access_test2 - verifier_value_ptr_arith/known_scalar_from_different_maps - lower_oob_arith_test_1 - value_ptr_known_scalar_3 - access_value_ptr_known_scalar - Usage of check_mem_region_access() instead of __check_mem_access() in check_packet_access() changes the reported message from "rX offset is outside ..." to "rX min/max value is outside ...". Affected tests: - verifier_xdp_direct_packet_access/* - In check_func_arg_reg_off() the check for zero offset now operates on `.var_off` field instead of `.off` field. For tests where the pattern looks like `kfunc(reg_with_var_off, ...)`, this changes the reported error: - previously the error "variable ... access ... disallowed" was reported by __check_ptr_off_reg(); - now "R1 must have zero offset ..." is reported by check_func_arg_reg_off() itself. Affected tests: - verifier/calls.c "calls: invalid kfunc call: PTR_TO_BTF_ID with variable offset" Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260212-ptrs-off-migration-v2-2-00820e4d3438@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 +- kernel/bpf/log.c | 2 + kernel/bpf/verifier.c | 317 ++++++++------------- .../testing/selftests/bpf/prog_tests/linked_list.c | 4 +- .../selftests/bpf/progs/exceptions_assert.c | 2 +- tools/testing/selftests/bpf/progs/iters.c | 6 +- .../selftests/bpf/progs/mem_rdonly_untrusted.c | 2 +- tools/testing/selftests/bpf/progs/verifier_align.c | 40 ++- .../testing/selftests/bpf/progs/verifier_bounds.c | 2 +- .../bpf/progs/verifier_direct_packet_access.c | 4 +- tools/testing/selftests/bpf/progs/verifier_gotox.c | 4 +- .../bpf/progs/verifier_helper_packet_access.c | 2 +- .../bpf/progs/verifier_helper_value_access.c | 4 +- .../testing/selftests/bpf/progs/verifier_int_ptr.c | 2 +- .../selftests/bpf/progs/verifier_meta_access.c | 2 +- .../selftests/bpf/progs/verifier_spill_fill.c | 8 +- .../selftests/bpf/progs/verifier_stack_ptr.c | 4 +- .../selftests/bpf/progs/verifier_value_ptr_arith.c | 10 +- .../bpf/progs/verifier_xdp_direct_packet_access.c | 64 ++--- tools/testing/selftests/bpf/verifier/calls.c | 2 +- 20 files changed, 206 insertions(+), 278 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index ef8e45a362d9..a97bdbf3a07b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -38,8 +38,7 @@ struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; /* - * Fixed part of pointer offset, pointer types only. - * Or constant delta between "linked" scalars with the same ID. + * Constant delta between "linked" scalars with the same ID. */ s32 off; union { diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index a0c3b35de2ce..39a731392d65 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -581,6 +581,8 @@ int tnum_strn(char *str, size_t size, struct tnum a) if (a.mask == 0) { if (is_unum_decimal(a.value)) return snprintf(str, size, "%llu", a.value); + if (is_snum_decimal(a.value)) + return snprintf(str, size, "%lld", a.value); else return snprintf(str, size, "%#llx", a.value); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3bf72eacbec2..2c5794dad668 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -654,7 +654,7 @@ static int stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_s return -EINVAL; } - off = reg->off + reg->var_off.value; + off = reg->var_off.value; if (off % BPF_REG_SIZE) { verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off); return -EINVAL; @@ -2281,11 +2281,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) static void mark_reg_graph_node(struct bpf_reg_state *regs, u32 regno, struct btf_field_graph_root *ds_head) { - __mark_reg_known_zero(®s[regno]); + __mark_reg_known(®s[regno], ds_head->node_offset); regs[regno].type = PTR_TO_BTF_ID | MEM_ALLOC; regs[regno].btf = ds_head->btf; regs[regno].btf_id = ds_head->value_btf_id; - regs[regno].off = ds_head->node_offset; } static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) @@ -2316,7 +2315,6 @@ static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, */ return reg->type == which && reg->id == 0 && - reg->off == 0 && tnum_equals_const(reg->var_off, 0); } @@ -5302,7 +5300,6 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, * tracks the effects of the write, considering that each stack slot in the * dynamic range is potentially written to. * - * 'off' includes 'regno->off'. * 'value_regno' can be -1, meaning that an unknown value is being written to * the stack. * @@ -5724,7 +5721,6 @@ static int check_stack_read(struct bpf_verifier_env *env, * check_stack_write_var_off. * * 'ptr_regno' is the register used as a pointer into the stack. - * 'off' includes 'ptr_regno->off', but not its variable offset (if any). * 'value_regno' is the register whose value we're writing to the stack. It can * be -1, meaning that we're not writing from a register. * @@ -5761,14 +5757,14 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, u32 cap = bpf_map_flags_to_cap(map); if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { - verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n", - map->value_size, off, size); + verbose(env, "write into map forbidden, value_size=%d off=%lld size=%d\n", + map->value_size, reg->smin_value + off, size); return -EACCES; } if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { - verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n", - map->value_size, off, size); + verbose(env, "read from map forbidden, value_size=%d off=%lld size=%d\n", + map->value_size, reg->smin_value + off, size); return -EACCES; } @@ -5875,24 +5871,24 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env, * is only allowed in its original, unmodified form. */ - if (reg->off < 0) { - verbose(env, "negative offset %s ptr R%d off=%d disallowed\n", - reg_type_str(env, reg->type), regno, reg->off); + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "variable %s access var_off=%s disallowed\n", + reg_type_str(env, reg->type), tn_buf); return -EACCES; } - if (!fixed_off_ok && reg->off) { - verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", - reg_type_str(env, reg->type), regno, reg->off); + if (reg->smin_value < 0) { + verbose(env, "negative offset %s ptr R%d off=%lld disallowed\n", + reg_type_str(env, reg->type), regno, reg->var_off.value); return -EACCES; } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable %s access var_off=%s disallowed\n", - reg_type_str(env, reg->type), tn_buf); + if (!fixed_off_ok && reg->var_off.value != 0) { + verbose(env, "dereference of modified %s ptr R%d off=%lld disallowed\n", + reg_type_str(env, reg->type), regno, reg->var_off.value); return -EACCES; } @@ -5934,14 +5930,14 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, /* For ref_ptr case, release function check should ensure we get one * referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the * normal store of unreferenced kptr, we must ensure var_off is zero. - * Since ref_ptr cannot be accessed directly by BPF insns, checks for - * reg->off and reg->ref_obj_id are not needed here. + * Since ref_ptr cannot be accessed directly by BPF insns, check for + * reg->ref_obj_id is not needed here. */ if (__check_ptr_off_reg(env, reg, regno, true)) return -EACCES; /* A full type match is needed, as BTF can be vmlinux, module or prog BTF, and - * we also need to take into account the reg->off. + * we also need to take into account the reg->var_off. * * We want to support cases like: * @@ -5952,19 +5948,19 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, * * struct foo *v; * v = func(); // PTR_TO_BTF_ID - * val->foo = v; // reg->off is zero, btf and btf_id match type - * val->bar = &v->br; // reg->off is still zero, but we need to retry with + * val->foo = v; // reg->var_off is zero, btf and btf_id match type + * val->bar = &v->br; // reg->var_off is still zero, but we need to retry with * // first member type of struct after comparison fails - * val->baz = &v->bz; // reg->off is non-zero, so struct needs to be walked + * val->baz = &v->bz; // reg->var_off is non-zero, so struct needs to be walked * // to match type * - * In the kptr_ref case, check_func_arg_reg_off already ensures reg->off + * In the kptr_ref case, check_func_arg_reg_off already ensures reg->var_off * is zero. We must also ensure that btf_struct_ids_match does not walk * the struct to match type against first member of struct, i.e. reject * second case from above. Hence, when type is BPF_KPTR_REF, we set * strict mode to true for type match. */ - if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->var_off.value, kptr_field->kptr.btf, kptr_field->kptr.btf_id, kptr_field->type != BPF_KPTR_UNREF)) goto bad_type; @@ -6273,27 +6269,14 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, struct bpf_reg_state *reg = reg_state(env, regno); int err; - /* We may have added a variable offset to the packet pointer; but any - * reg->range we have comes after that. We are only checking the fixed - * offset. - */ - - /* We don't allow negative numbers, because we aren't tracking enough - * detail to prove they're safe. - */ - if (reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", - regno); - return -EACCES; + if (reg->range < 0) { + verbose(env, "R%d offset is outside of the packet\n", regno); + return -EINVAL; } - err = reg->range < 0 ? -EINVAL : - __check_mem_access(env, regno, off, size, reg->range, - zero_size_allowed); - if (err) { - verbose(env, "R%d offset is outside of the packet\n", regno); + err = check_mem_region_access(env, regno, off, size, reg->range, zero_size_allowed); + if (err) return err; - } /* __check_mem_access has made sure "off + size - 1" is within u16. * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, @@ -6305,7 +6288,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, max_t(u32, env->prog->aux->max_pkt_offset, off + reg->umax_value + size - 1); - return err; + return 0; } /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ @@ -6522,14 +6505,14 @@ static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, */ ip_align = 2; - reg_off = tnum_add(reg->var_off, tnum_const(ip_align + reg->off + off)); + reg_off = tnum_add(reg->var_off, tnum_const(ip_align + off)); if (!tnum_is_aligned(reg_off, size)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, - "misaligned packet access off %d+%s+%d+%d size %d\n", - ip_align, tn_buf, reg->off, off, size); + "misaligned packet access off %d+%s+%d size %d\n", + ip_align, tn_buf, off, size); return -EACCES; } @@ -6547,13 +6530,13 @@ static int check_generic_ptr_alignment(struct bpf_verifier_env *env, if (!strict || size == 1) return 0; - reg_off = tnum_add(reg->var_off, tnum_const(reg->off + off)); + reg_off = tnum_add(reg->var_off, tnum_const(off)); if (!tnum_is_aligned(reg_off, size)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "misaligned %saccess off %s+%d+%d size %d\n", - pointer_desc, tn_buf, reg->off, off, size); + verbose(env, "misaligned %saccess off %s+%d size %d\n", + pointer_desc, tn_buf, off, size); return -EACCES; } @@ -6891,7 +6874,7 @@ static int __check_buffer_access(struct bpf_verifier_env *env, regno, buf_info, off, size); return -EACCES; } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + if (!tnum_is_const(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -6914,8 +6897,8 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env, if (err) return err; - if (off + size > env->prog->aux->max_tp_access) - env->prog->aux->max_tp_access = off + size; + env->prog->aux->max_tp_access = max(reg->var_off.value + off + size, + env->prog->aux->max_tp_access); return 0; } @@ -6933,8 +6916,7 @@ static int check_buffer_access(struct bpf_verifier_env *env, if (err) return err; - if (off + size > *max_access) - *max_access = off + size; + *max_access = max(reg->var_off.value + off + size, *max_access); return 0; } @@ -7327,13 +7309,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, tname); return -EINVAL; } - if (off < 0) { - verbose(env, - "R%d is ptr_%s invalid negative access: off=%d\n", - regno, tname, off); - return -EACCES; - } - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { + + if (!tnum_is_const(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -7343,6 +7320,15 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } + off += reg->var_off.value; + + if (off < 0) { + verbose(env, + "R%d is ptr_%s invalid negative access: off=%d\n", + regno, tname, off); + return -EACCES; + } + if (reg->type & MEM_USER) { verbose(env, "R%d is ptr_%s access user memory: off=%d\n", @@ -7589,8 +7575,8 @@ static int check_stack_access_within_bounds( if (err) { if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid%s stack R%d off=%d size=%d\n", - err_extra, regno, off, access_size); + verbose(env, "invalid%s stack R%d off=%lld size=%d\n", + err_extra, regno, min_off, access_size); } else { char tn_buf[48]; @@ -7636,14 +7622,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (size < 0) return size; - /* alignment checks will add in reg->off themselves */ err = check_ptr_alignment(env, reg, off, size, strict_alignment_once); if (err) return err; - /* for access checks, reg->off is just part of off */ - off += reg->off; - if (reg->type == PTR_TO_MAP_KEY) { if (t == BPF_WRITE) { verbose(env, "write to change key R%d not allowed\n", regno); @@ -8122,8 +8104,6 @@ static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) * on the access type and privileges, that all elements of the stack are * initialized. * - * 'off' includes 'regno->off', but not its dynamic part (if any). - * * All registers that have been spilled on the stack in the slots within the * read offsets are marked as read. */ @@ -8284,7 +8264,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: - return check_packet_access(env, regno, reg->off, access_size, + return check_packet_access(env, regno, 0, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: if (access_type == BPF_WRITE) { @@ -8292,12 +8272,12 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, reg_type_str(env, reg->type)); return -EACCES; } - return check_mem_region_access(env, regno, reg->off, access_size, + return check_mem_region_access(env, regno, 0, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: - if (check_map_access_type(env, regno, reg->off, access_size, access_type)) + if (check_map_access_type(env, regno, 0, access_size, access_type)) return -EACCES; - return check_map_access(env, regno, reg->off, access_size, + return check_map_access(env, regno, 0, access_size, zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if (type_is_rdonly_mem(reg->type)) { @@ -8307,7 +8287,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return -EACCES; } } - return check_mem_region_access(env, regno, reg->off, + return check_mem_region_access(env, regno, 0, access_size, reg->mem_size, zero_size_allowed); case PTR_TO_BUF: @@ -8322,16 +8302,16 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } else { max_access = &env->prog->aux->max_rdwr_access; } - return check_buffer_access(env, reg, regno, reg->off, + return check_buffer_access(env, reg, regno, 0, access_size, zero_size_allowed, max_access); case PTR_TO_STACK: return check_stack_range_initialized( env, - regno, reg->off, access_size, + regno, 0, access_size, zero_size_allowed, access_type, meta); case PTR_TO_BTF_ID: - return check_ptr_to_btf_access(env, regs, regno, reg->off, + return check_ptr_to_btf_access(env, regs, regno, 0, access_size, BPF_READ, -1); case PTR_TO_CTX: /* in case the function doesn't know how to access the context, @@ -8543,9 +8523,9 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, int flags) return -EINVAL; } spin_lock_off = is_res_lock ? rec->res_spin_lock_off : rec->spin_lock_off; - if (spin_lock_off != val + reg->off) { + if (spin_lock_off != val) { verbose(env, "off %lld doesn't point to 'struct %s_lock' that is at %d\n", - val + reg->off, lock_str, spin_lock_off); + val, lock_str, spin_lock_off); return -EINVAL; } if (is_lock) { @@ -8660,9 +8640,9 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, verifier_bug(env, "unsupported BTF field type: %s\n", struct_name); return -EINVAL; } - if (field_off != val + reg->off) { + if (field_off != val) { verbose(env, "off %lld doesn't point to 'struct %s' that is at %d\n", - val + reg->off, struct_name, field_off); + val, struct_name, field_off); return -EINVAL; } if (map_desc->ptr) { @@ -8730,7 +8710,7 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, return -EINVAL; } - kptr_off = reg->off + reg->var_off.value; + kptr_off = reg->var_off.value; kptr_field = btf_record_find(rec, kptr_off, BPF_KPTR); if (!kptr_field) { verbose(env, "off=%d doesn't point to kptr\n", kptr_off); @@ -9373,7 +9353,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, struct bpf_reg_state *reg = reg_state(env, regno); enum bpf_reg_type expected, type = reg->type; const struct bpf_reg_types *compatible; - int i, j; + int i, j, err; compatible = compatible_reg_types[base_type(arg_type)]; if (!compatible) { @@ -9476,8 +9456,12 @@ found: return -EACCES; } - if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - btf_vmlinux, *arg_btf_id, + err = __check_ptr_off_reg(env, reg, regno, true); + if (err) + return err; + + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, + reg->var_off.value, btf_vmlinux, *arg_btf_id, strict_type_match)) { verbose(env, "R%d is of type %s but %s is expected\n", regno, btf_type_name(reg->btf, reg->btf_id), @@ -9555,12 +9539,11 @@ static int check_func_arg_reg_off(struct bpf_verifier_env *env, * because fixed_off_ok is false, but checking here allows us * to give the user a better error message. */ - if (reg->off) { + if (!tnum_is_const(reg->var_off) || reg->var_off.value != 0) { verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n", regno); return -EINVAL; } - return __check_ptr_off_reg(env, reg, regno, false); } switch (type) { @@ -9657,7 +9640,7 @@ static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env, if (reg->type == CONST_PTR_TO_DYNPTR) return reg->dynptr.type; - spi = __get_spi(reg->off); + spi = __get_spi(reg->var_off.value); if (spi < 0) { verbose(env, "verifier internal error: invalid spi when querying dynptr type\n"); return BPF_DYNPTR_TYPE_INVALID; @@ -9698,13 +9681,13 @@ static int check_reg_const_str(struct bpf_verifier_env *env, return -EACCES; } - err = check_map_access(env, regno, reg->off, - map->value_size - reg->off, false, + err = check_map_access(env, regno, 0, + map->value_size - reg->var_off.value, false, ACCESS_HELPER); if (err) return err; - map_off = reg->off + reg->var_off.value; + map_off = reg->var_off.value; err = map->ops->map_direct_value_addr(map, &map_addr, map_off); if (err) { verbose(env, "direct value access on string failed\n"); @@ -9741,7 +9724,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env, if (!tnum_is_const(key->var_off)) return -EOPNOTSUPP; - stack_off = key->off + key->var_off.value; + stack_off = key->var_off.value; slot = -stack_off - 1; spi = slot / BPF_REG_SIZE; off = slot % BPF_REG_SIZE; @@ -11073,7 +11056,8 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, */ struct btf_field *field; - field = reg_find_field_offset(&caller->regs[BPF_REG_1], caller->regs[BPF_REG_1].off, + field = reg_find_field_offset(&caller->regs[BPF_REG_1], + caller->regs[BPF_REG_1].var_off.value, BPF_RB_ROOT); if (!field || !field->graph_root.value_btf_id) return -EFAULT; @@ -11449,7 +11433,7 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, /* fmt being ARG_PTR_TO_CONST_STR guarantees that var_off is const * and map_direct_value_addr is set. */ - fmt_map_off = fmt_reg->off + fmt_reg->var_off.value; + fmt_map_off = fmt_reg->var_off.value; err = fmt_map->ops->map_direct_value_addr(fmt_map, &fmt_addr, fmt_map_off); if (err) { @@ -12755,13 +12739,12 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id)) strict_type_match = true; - WARN_ON_ONCE(is_kfunc_release(meta) && - (reg->off || !tnum_is_const(reg->var_off) || - reg->var_off.value)); + WARN_ON_ONCE(is_kfunc_release(meta) && !tnum_is_const(reg->var_off)); reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, ®_ref_id); reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off); - struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match); + struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->var_off.value, + meta->btf, ref_id, strict_type_match); /* If kfunc is accepting a projection type (ie. __sk_buff), it cannot * actually use it -- it must cast to the underlying type. So we allow * caller to pass in the underlying type. @@ -13133,7 +13116,7 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, } rec = reg_btf_record(reg); - head_off = reg->off + reg->var_off.value; + head_off = reg->var_off.value; field = btf_record_find(rec, head_off, head_field_type); if (!field) { verbose(env, "%s not found at offset=%u\n", head_type_name, head_off); @@ -13200,7 +13183,7 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, return -EINVAL; } - node_off = reg->off + reg->var_off.value; + node_off = reg->var_off.value; field = reg_find_field_offset(reg, node_off, node_field_type); if (!field) { verbose(env, "%s not found at offset=%u\n", node_type_name, node_off); @@ -14228,7 +14211,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, meta.func_id == special_kfunc_list[KF_bpf_list_push_back_impl] || meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) { release_ref_obj_id = regs[BPF_REG_2].ref_obj_id; - insn_aux->insert_off = regs[BPF_REG_2].off; + insn_aux->insert_off = regs[BPF_REG_2].var_off.value; insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id); err = ref_convert_owning_non_owning(env, release_ref_obj_id); if (err) { @@ -14459,11 +14442,13 @@ static bool check_reg_sane_offset_ptr(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, enum bpf_reg_type type) { + bool known = tnum_is_const(reg->var_off); + s64 val = reg->var_off.value; s64 smin = reg->smin_value; - if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { - verbose(env, "%s pointer offset %d is not allowed\n", - reg_type_str(env, type), reg->off); + if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { + verbose(env, "%s pointer offset %lld is not allowed\n", + reg_type_str(env, type), val); return false; } @@ -14497,13 +14482,11 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, * currently prohibited for unprivileged. */ max = MAX_BPF_STACK + mask_to_left; - ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off); + ptr_limit = -ptr_reg->var_off.value; break; case PTR_TO_MAP_VALUE: max = ptr_reg->map_ptr->value_size; - ptr_limit = (mask_to_left ? - ptr_reg->smin_value : - ptr_reg->umax_value) + ptr_reg->off; + ptr_limit = mask_to_left ? ptr_reg->smin_value : ptr_reg->umax_value; break; default: return REASON_TYPE; @@ -14734,9 +14717,6 @@ static int sanitize_err(struct bpf_verifier_env *env, * Variable offset is prohibited for unprivileged mode for simplicity since it * requires corresponding support in Spectre masking for stack ALU. See also * retrieve_ptr_limit(). - * - * - * 'off' includes 'reg->off'. */ static int check_stack_access_for_ptr_arithmetic( struct bpf_verifier_env *env, @@ -14777,11 +14757,11 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, switch (dst_reg->type) { case PTR_TO_STACK: if (check_stack_access_for_ptr_arithmetic(env, dst, dst_reg, - dst_reg->off + dst_reg->var_off.value)) + dst_reg->var_off.value)) return -EACCES; break; case PTR_TO_MAP_VALUE: - if (check_map_access(env, dst, dst_reg->off, 1, false, ACCESS_HELPER)) { + if (check_map_access(env, dst, 0, 1, false, ACCESS_HELPER)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " "prohibited for !root\n", dst); return -EACCES; @@ -14905,23 +14885,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: - /* We can take a fixed offset as long as it doesn't overflow - * the s32 'off' field - */ - if (known && (ptr_reg->off + smin_val == - (s64)(s32)(ptr_reg->off + smin_val))) { - /* pointer += K. Accumulate it into fixed offset */ - dst_reg->smin_value = smin_ptr; - dst_reg->smax_value = smax_ptr; - dst_reg->umin_value = umin_ptr; - dst_reg->umax_value = umax_ptr; - dst_reg->var_off = ptr_reg->var_off; - dst_reg->off = ptr_reg->off + smin_val; - dst_reg->raw = ptr_reg->raw; - break; - } - /* A new variable offset is created. Note that off_reg->off - * == 0, since it's a scalar. + /* * dst_reg gets the pointer type and since some positive * integer value was added to the pointer, give it a new 'id' * if it's a PTR_TO_PACKET. @@ -14940,9 +14904,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->umax_value = U64_MAX; } dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); - dst_reg->off = ptr_reg->off; dst_reg->raw = ptr_reg->raw; - if (reg_is_pkt_pointer(ptr_reg)) { + if (!known && reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ memset(&dst_reg->raw, 0, sizeof(dst_reg->raw)); @@ -14964,19 +14927,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst); return -EACCES; } - if (known && (ptr_reg->off - smin_val == - (s64)(s32)(ptr_reg->off - smin_val))) { - /* pointer -= K. Subtract it from fixed offset */ - dst_reg->smin_value = smin_ptr; - dst_reg->smax_value = smax_ptr; - dst_reg->umin_value = umin_ptr; - dst_reg->umax_value = umax_ptr; - dst_reg->var_off = ptr_reg->var_off; - dst_reg->id = ptr_reg->id; - dst_reg->off = ptr_reg->off - smin_val; - dst_reg->raw = ptr_reg->raw; - break; - } /* A new variable offset is created. If the subtrahend is known * nonnegative, then any reg->range we had before is still good. */ @@ -14996,9 +14946,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->umax_value = umax_ptr - umin_val; } dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); - dst_reg->off = ptr_reg->off; dst_reg->raw = ptr_reg->raw; - if (reg_is_pkt_pointer(ptr_reg)) { + if (!known && reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) @@ -16542,19 +16491,17 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *reg; int new_range; - if (dst_reg->off < 0 || - (dst_reg->off == 0 && range_right_open)) + if (dst_reg->umax_value == 0 && range_right_open) /* This doesn't give us any range */ return; - if (dst_reg->umax_value > MAX_PACKET_OFF || - dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF) + if (dst_reg->umax_value > MAX_PACKET_OFF) /* Risk of overflow. For instance, ptr + (1<<63) may be less * than pkt_end, but that's because it's also less than pkt. */ return; - new_range = dst_reg->off; + new_range = dst_reg->umax_value; if (range_right_open) new_range++; @@ -16603,7 +16550,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, /* If our ids match, then we must have the same max_value. And we * don't care about the other reg's fixed offset, since if it's too big * the range won't allow anything. - * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. + * dst_reg->umax_value is known < MAX_PACKET_OFF, therefore it fits in a u16. */ bpf_for_each_reg_in_vstate(vstate, state, reg, ({ if (reg->type == type && reg->id == dst_reg->id) @@ -17129,29 +17076,24 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, { if (type_may_be_null(reg->type) && reg->id == id && (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) { - /* Old offset (both fixed and variable parts) should have been - * known-zero, because we don't allow pointer arithmetic on - * pointers that might be NULL. If we see this happening, don't - * convert the register. + /* Old offset should have been known-zero, because we don't + * allow pointer arithmetic on pointers that might be NULL. + * If we see this happening, don't convert the register. * * But in some cases, some helpers that return local kptrs - * advance offset for the returned pointer. In those cases, it - * is fine to expect to see reg->off. + * advance offset for the returned pointer. In those cases, + * it is fine to expect to see reg->var_off. */ - if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0))) - return; if (!(type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type)) && - WARN_ON_ONCE(reg->off)) + WARN_ON_ONCE(!tnum_equals_const(reg->var_off, 0))) return; - if (is_null) { - reg->type = SCALAR_VALUE; /* We don't need id and ref_obj_id from this point * onwards anymore, thus we should better reset it, * so that state pruning has chances to take effect. */ - reg->id = 0; - reg->ref_obj_id = 0; + __mark_reg_known_zero(reg); + reg->type = SCALAR_VALUE; return; } @@ -17731,22 +17673,24 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) } map = env->used_maps[aux->map_index]; - dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE || insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { if (map->map_type == BPF_MAP_TYPE_ARENA) { __mark_reg_unknown(env, dst_reg); + dst_reg->map_ptr = map; return 0; } + __mark_reg_known(dst_reg, aux->map_off); dst_reg->type = PTR_TO_MAP_VALUE; - dst_reg->off = aux->map_off; + dst_reg->map_ptr = map; WARN_ON_ONCE(map->map_type != BPF_MAP_TYPE_INSN_ARRAY && map->max_entries != 1); /* We want reg->id to be same (0) as map_value is not distinct */ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_IDX) { dst_reg->type = CONST_PTR_TO_MAP; + dst_reg->map_ptr = map; } else { verifier_bug(env, "unexpected src reg value for ldimm64"); return -EFAULT; @@ -19852,11 +19796,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, */ if (rold->range > rcur->range) return false; - /* If the offsets don't match, we can't trust our alignment; - * nor can we be sure that we won't fall out of range. - */ - if (rold->off != rcur->off) - return false; /* id relations must be preserved */ if (!check_ids(rold->id, rcur->id, idmap)) return false; @@ -19872,8 +19811,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return true; case PTR_TO_INSN: return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 && - rold->off == rcur->off && range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off); + range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); default: return regs_exact(rold, rcur, idmap); } @@ -20486,7 +20424,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * so we can assume valid iter and reg state, * no need for extra (re-)validations */ - spi = __get_spi(iter_reg->off + iter_reg->var_off.value); + spi = __get_spi(iter_reg->var_off.value); iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) { loop = true; @@ -20892,19 +20830,16 @@ static int indirect_jump_min_max_index(struct bpf_verifier_env *env, u32 *pmin_index, u32 *pmax_index) { struct bpf_reg_state *reg = reg_state(env, regno); - u64 min_index, max_index; + u64 min_index = reg->umin_value; + u64 max_index = reg->umax_value; const u32 size = 8; - if (check_add_overflow(reg->umin_value, reg->off, &min_index) || - (min_index > (u64) U32_MAX * size)) { - verbose(env, "the sum of R%u umin_value %llu and off %u is too big\n", - regno, reg->umin_value, reg->off); + if (min_index > (u64) U32_MAX * size) { + verbose(env, "the sum of R%u umin_value %llu is too big\n", regno, reg->umin_value); return -ERANGE; } - if (check_add_overflow(reg->umax_value, reg->off, &max_index) || - (max_index > (u64) U32_MAX * size)) { - verbose(env, "the sum of R%u umax_value %llu and off %u is too big\n", - regno, reg->umax_value, reg->off); + if (max_index > (u64) U32_MAX * size) { + verbose(env, "the sum of R%u umax_value %llu is too big\n", regno, reg->umax_value); return -ERANGE; } diff --git a/tools/testing/selftests/bpf/prog_tests/linked_list.c b/tools/testing/selftests/bpf/prog_tests/linked_list.c index 14c5a7ef0e87..6f25b5f39a79 100644 --- a/tools/testing/selftests/bpf/prog_tests/linked_list.c +++ b/tools/testing/selftests/bpf/prog_tests/linked_list.c @@ -87,12 +87,12 @@ static struct { { "incorrect_value_type", "operation on bpf_list_head expects arg#1 bpf_list_node at offset=48 in struct foo, " "but arg is at offset=0 in struct bar" }, - { "incorrect_node_var_off", "variable ptr_ access var_off=(0x0; 0xffffffff) disallowed" }, + { "incorrect_node_var_off", "variable ptr_ access var_off=(0x0; 0x1ffffffff) disallowed" }, { "incorrect_node_off1", "bpf_list_node not found at offset=49" }, { "incorrect_node_off2", "arg#1 offset=0, but expected bpf_list_node at offset=48 in struct foo" }, { "no_head_type", "bpf_list_head not found at offset=0" }, { "incorrect_head_var_off1", "R1 doesn't have constant offset" }, - { "incorrect_head_var_off2", "variable ptr_ access var_off=(0x0; 0xffffffff) disallowed" }, + { "incorrect_head_var_off2", "variable ptr_ access var_off=(0x0; 0x1ffffffff) disallowed" }, { "incorrect_head_off1", "bpf_list_head not found at offset=25" }, { "incorrect_head_off2", "bpf_list_head not found at offset=1" }, { "pop_front_off", "off 48 doesn't point to 'struct bpf_spin_lock' that is at 40" }, diff --git a/tools/testing/selftests/bpf/progs/exceptions_assert.c b/tools/testing/selftests/bpf/progs/exceptions_assert.c index a01c2736890f..ed00dd551ffb 100644 --- a/tools/testing/selftests/bpf/progs/exceptions_assert.c +++ b/tools/testing/selftests/bpf/progs/exceptions_assert.c @@ -114,7 +114,7 @@ int check_assert_single_range_u64(struct __sk_buff *ctx) SEC("?tc") __log_level(2) __failure -__msg(": R1=pkt(off=64,r=64) R2=pkt_end() R6=pkt(r=64) R10=fp0") +__msg(": R1=pkt(r=64,imm=64) R2=pkt_end() R6=pkt(r=64) R10=fp0") int check_assert_generic(struct __sk_buff *ctx) { u8 *data_end = (void *)(long)ctx->data_end; diff --git a/tools/testing/selftests/bpf/progs/iters.c b/tools/testing/selftests/bpf/progs/iters.c index 7f27b517d5d5..86b74e3579d9 100644 --- a/tools/testing/selftests/bpf/progs/iters.c +++ b/tools/testing/selftests/bpf/progs/iters.c @@ -1651,7 +1651,7 @@ int clean_live_states(const void *ctx) SEC("?raw_tp") __flag(BPF_F_TEST_STATE_FREQ) -__failure __msg("misaligned stack access off 0+-31+0 size 8") +__failure __msg("misaligned stack access off -31+0 size 8") __naked int absent_mark_in_the_middle_state(void) { /* This is equivalent to C program below. @@ -1726,7 +1726,7 @@ static int noop(void) SEC("?raw_tp") __flag(BPF_F_TEST_STATE_FREQ) -__failure __msg("misaligned stack access off 0+-31+0 size 8") +__failure __msg("misaligned stack access off -31+0 size 8") __naked int absent_mark_in_the_middle_state2(void) { /* This is equivalent to C program below. @@ -1802,7 +1802,7 @@ __naked int absent_mark_in_the_middle_state2(void) SEC("?raw_tp") __flag(BPF_F_TEST_STATE_FREQ) -__failure __msg("misaligned stack access off 0+-31+0 size 8") +__failure __msg("misaligned stack access off -31+0 size 8") __naked int absent_mark_in_the_middle_state3(void) { /* diff --git a/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c b/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c index 3b984b6ae7c0..5b4453747c23 100644 --- a/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c +++ b/tools/testing/selftests/bpf/progs/mem_rdonly_untrusted.c @@ -8,7 +8,7 @@ SEC("tp_btf/sys_enter") __success __log_level(2) -__msg("r8 = *(u64 *)(r7 +0) ; R7=ptr_nameidata(off={{[0-9]+}}) R8=rdonly_untrusted_mem(sz=0)") +__msg("r8 = *(u64 *)(r7 +0) ; R7=ptr_nameidata(imm={{[0-9]+}}) R8=rdonly_untrusted_mem(sz=0)") __msg("r9 = *(u8 *)(r8 +0) ; R8=rdonly_untrusted_mem(sz=0) R9=scalar") int btf_id_to_ptr_mem(void *ctx) { diff --git a/tools/testing/selftests/bpf/progs/verifier_align.c b/tools/testing/selftests/bpf/progs/verifier_align.c index 90362d61f1fe..24553ce62881 100644 --- a/tools/testing/selftests/bpf/progs/verifier_align.c +++ b/tools/testing/selftests/bpf/progs/verifier_align.c @@ -131,7 +131,7 @@ LBL ":" \ SEC("tc") __success __log_level(2) __flag(BPF_F_ANY_ALIGNMENT) -__msg("6: R0=pkt(off=8,r=8)") +__msg("6: R0=pkt(r=8,imm=8)") __msg("6: {{.*}} R3={{[^)]*}}var_off=(0x0; 0xff)") __msg("7: {{.*}} R3={{[^)]*}}var_off=(0x0; 0x1fe)") __msg("8: {{.*}} R3={{[^)]*}}var_off=(0x0; 0x3fc)") @@ -203,10 +203,10 @@ __naked void unknown_mul(void) SEC("tc") __success __log_level(2) __msg("2: {{.*}} R5=pkt(r=0)") -__msg("4: {{.*}} R5=pkt(off=14,r=0)") -__msg("5: {{.*}} R4=pkt(off=14,r=0)") +__msg("4: {{.*}} R5=pkt(r=0,imm=14)") +__msg("5: {{.*}} R4=pkt(r=0,imm=14)") __msg("9: {{.*}} R2=pkt(r=18)") -__msg("10: {{.*}} R4={{[^)]*}}var_off=(0x0; 0xff){{.*}} R5=pkt(off=14,r=18)") +__msg("10: {{.*}} R4={{[^)]*}}var_off=(0x0; 0xff){{.*}} R5=pkt(r=18,imm=14)") __msg("13: {{.*}} R4={{[^)]*}}var_off=(0x0; 0xffff)") __msg("14: {{.*}} R4={{[^)]*}}var_off=(0x0; 0xffff)") __naked void packet_const_offset(void) @@ -247,14 +247,14 @@ __msg("7: {{.*}} R6={{[^)]*}}var_off=(0x0; 0x3fc)") /* Offset is added to packet pointer R5, resulting in * known fixed offset, and variable offset from R6. */ -__msg("11: {{.*}} R5=pkt(id=1,off=14,") +__msg("11: {{.*}} R5=pkt(id=1,{{[^)]*}},var_off=(0x2; 0x7fc)") /* At the time the word size load is performed from R5, * it's total offset is NET_IP_ALIGN + reg->off (0) + * reg->aux_off (14) which is 16. Then the variable * offset is considered using reg->aux_off_align which * is 4 and meets the load's requirements. */ -__msg("15: {{.*}} R4={{[^)]*}}var_off=(0x0; 0x3fc){{.*}} R5={{[^)]*}}var_off=(0x0; 0x3fc)") +__msg("15: {{.*}} R4={{[^)]*}}var_off=(0x2; 0x7fc){{.*}} R5={{[^)]*}}var_off=(0x2; 0x7fc)") /* Variable offset is added to R5 packet pointer, * resulting in auxiliary alignment of 4. To avoid BPF * verifier's precision backtracking logging @@ -266,37 +266,37 @@ __msg("18: {{.*}} R4={{[^)]*}}var_off=(0x0; 0x3fc){{.*}} R5={{[^)]*}}var_off=(0x /* Constant offset is added to R5, resulting in * reg->off of 14. */ -__msg("19: {{.*}} R5=pkt(id=2,off=14,") +__msg("19: {{.*}} R5=pkt(id=2,{{[^)]*}}var_off=(0x2; 0x7fc)") /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off * (14) which is 16. Then the variable offset is 4-byte * aligned, so the total offset is 4-byte aligned and * meets the load's requirements. */ -__msg("24: {{.*}} R4={{[^)]*}}var_off=(0x0; 0x3fc){{.*}} R5={{[^)]*}}var_off=(0x0; 0x3fc)") +__msg("24: {{.*}} R4={{[^)]*}}var_off=(0x2; 0x7fc){{.*}} R5={{[^)]*}}var_off=(0x2; 0x7fc)") /* Constant offset is added to R5 packet pointer, * resulting in reg->off value of 14. */ -__msg("26: {{.*}} R5=pkt(off=14,r=8)") +__msg("26: {{.*}} R5=pkt(r=8,imm=14)") /* Variable offset is added to R5, resulting in a * variable offset of (4n). See comment for insn #18 * for R4 = R5 trick. */ -__msg("28: {{.*}} R4={{[^)]*}}var_off=(0x0; 0x3fc){{.*}} R5={{[^)]*}}var_off=(0x0; 0x3fc)") +__msg("28: {{.*}} R4={{[^)]*}}var_off=(0x2; 0x7fc){{.*}} R5={{[^)]*}}var_off=(0x2; 0x7fc)") /* Constant is added to R5 again, setting reg->off to 18. */ -__msg("29: {{.*}} R5=pkt(id=3,off=18,") +__msg("29: {{.*}} R5=pkt(id=3,{{[^)]*}}var_off=(0x2; 0x7fc)") /* And once more we add a variable; resulting {{[^)]*}}var_off * is still (4n), fixed offset is not changed. * Also, we create a new reg->id. */ -__msg("31: {{.*}} R4={{[^)]*}}var_off=(0x0; 0x7fc){{.*}} R5={{[^)]*}}var_off=(0x0; 0x7fc)") +__msg("31: {{.*}} R4={{[^)]*}}var_off=(0x2; 0xffc){{.*}} R5={{[^)]*}}var_off=(0x2; 0xffc)") /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (18) * which is 20. Then the variable offset is (4n), so * the total offset is 4-byte aligned and meets the * load's requirements. */ -__msg("35: {{.*}} R4={{[^)]*}}var_off=(0x0; 0x7fc){{.*}} R5={{[^)]*}}var_off=(0x0; 0x7fc)") +__msg("35: {{.*}} R4={{[^)]*}}var_off=(0x2; 0xffc){{.*}} R5={{[^)]*}}var_off=(0x2; 0xffc)") __naked void packet_variable_offset(void) { asm volatile (" \ @@ -430,16 +430,10 @@ __msg("6: {{.*}} R5={{[^)]*}}var_off=(0x2; 0xfffffffffffffffc)") /* Checked s>=0 */ __msg("9: {{.*}} R5={{[^)]*}}var_off=(0x2; 0x7ffffffffffffffc)") /* packet pointer + nonnegative (4n+2) */ -__msg("11: {{.*}} R6={{[^)]*}}var_off=(0x2; 0x7ffffffffffffffc)") -__msg("12: {{.*}} R4={{[^)]*}}var_off=(0x2; 0x7ffffffffffffffc)") -/* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. - * We checked the bounds, but it might have been able - * to overflow if the packet pointer started in the - * upper half of the address space. - * So we did not get a 'range' on R6, and the access - * attempt will fail. - */ -__msg("15: {{.*}} R6={{[^)]*}}var_off=(0x2; 0x7ffffffffffffffc)") +__msg("11: {{.*}} R4={{[^)]*}}var_off=(0x2; 0x7ffffffffffffffc){{.*}} R6={{[^)]*}}var_off=(0x2; 0x7ffffffffffffffc)") +__msg("12: (07) r4 += 4") +/* packet smax bound overflow */ +__msg("pkt pointer offset -9223372036854775808 is not allowed") __naked void dubious_pointer_arithmetic(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_bounds.c b/tools/testing/selftests/bpf/progs/verifier_bounds.c index 560531404bce..d195eaa67d75 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bounds.c +++ b/tools/testing/selftests/bpf/progs/verifier_bounds.c @@ -202,7 +202,7 @@ l0_%=: /* exit */ \ SEC("tc") __description("bounds check based on reg_off + var_off + insn_off. test1") -__failure __msg("value_size=8 off=1073741825") +__failure __msg("map_value pointer offset 1073741822 is not allowed") __naked void var_off_insn_off_test1(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c b/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c index 911caa8fd1b7..4ee3b7a708f7 100644 --- a/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_direct_packet_access.c @@ -412,7 +412,7 @@ l0_%=: r0 = 0; \ SEC("tc") __description("direct packet access: test17 (pruning, alignment)") -__failure __msg("misaligned packet access off 2+0+15+-4 size 4") +__failure __msg("misaligned packet access off 2+15+-4 size 4") __flag(BPF_F_STRICT_ALIGNMENT) __naked void packet_access_test17_pruning_alignment(void) { @@ -569,7 +569,7 @@ l0_%=: r0 = 0; \ SEC("tc") __description("direct packet access: test23 (x += pkt_ptr, 4)") -__failure __msg("invalid access to packet, off=0 size=8, R5(id=3,off=0,r=0)") +__failure __msg("invalid access to packet, off=31 size=8, R5(id=3,off=31,r=0)") __flag(BPF_F_ANY_ALIGNMENT) __naked void test23_x_pkt_ptr_4(void) { diff --git a/tools/testing/selftests/bpf/progs/verifier_gotox.c b/tools/testing/selftests/bpf/progs/verifier_gotox.c index 607dad058ca1..548dce00f5fb 100644 --- a/tools/testing/selftests/bpf/progs/verifier_gotox.c +++ b/tools/testing/selftests/bpf/progs/verifier_gotox.c @@ -131,7 +131,7 @@ DEFINE_INVALID_SIZE_PROG(u16, __failure __msg("Invalid read of 2 bytes from insn DEFINE_INVALID_SIZE_PROG(u8, __failure __msg("Invalid read of 1 bytes from insn_array")) SEC("socket") -__failure __msg("misaligned value access off 0+1+0 size 8") +__failure __msg("misaligned value access off 1+0 size 8") __naked void jump_table_misaligned_access(void) { asm volatile (" \ @@ -187,7 +187,7 @@ jt0_%=: \ } SEC("socket") -__failure __msg("invalid access to map value, value_size=16 off=-24 size=8") +__failure __msg("R0 min value is negative") __naked void jump_table_invalid_mem_acceess_neg(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_packet_access.c b/tools/testing/selftests/bpf/progs/verifier_helper_packet_access.c index 74f5f9cd153d..71cee3f58324 100644 --- a/tools/testing/selftests/bpf/progs/verifier_helper_packet_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_helper_packet_access.c @@ -360,7 +360,7 @@ l0_%=: r0 = 0; \ SEC("tc") __description("helper access to packet: test15, cls helper fail sub") -__failure __msg("invalid access to packet") +__failure __msg("R1 min value is negative") __naked void test15_cls_helper_fail_sub(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c b/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c index 886498b5e6f3..6d2a38597c34 100644 --- a/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_helper_value_access.c @@ -1100,7 +1100,7 @@ l0_%=: exit; \ SEC("tracepoint") __description("map helper access to adjusted map (via const imm): out-of-bound 2") -__failure __msg("invalid access to map value, value_size=16 off=-4 size=8") +__failure __msg("R2 min value is negative") __naked void imm_out_of_bound_2(void) { asm volatile (" \ @@ -1176,7 +1176,7 @@ l0_%=: exit; \ SEC("tracepoint") __description("map helper access to adjusted map (via const reg): out-of-bound 2") -__failure __msg("invalid access to map value, value_size=16 off=-4 size=8") +__failure __msg("R2 min value is negative") __naked void reg_out_of_bound_2(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_int_ptr.c b/tools/testing/selftests/bpf/progs/verifier_int_ptr.c index 59e34d558654..6627f44faf4b 100644 --- a/tools/testing/selftests/bpf/progs/verifier_int_ptr.c +++ b/tools/testing/selftests/bpf/progs/verifier_int_ptr.c @@ -65,7 +65,7 @@ __naked void ptr_to_long_half_uninitialized(void) SEC("cgroup/sysctl") __description("arg pointer to long misaligned") -__failure __msg("misaligned stack access off 0+-20+0 size 8") +__failure __msg("misaligned stack access off -20+0 size 8") __naked void arg_ptr_to_long_misaligned(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_meta_access.c b/tools/testing/selftests/bpf/progs/verifier_meta_access.c index d81722fb5f19..62235f032ffe 100644 --- a/tools/testing/selftests/bpf/progs/verifier_meta_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_meta_access.c @@ -27,7 +27,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("meta access, test2") -__failure __msg("invalid access to packet, off=-8") +__failure __msg("R0 min value is negative") __naked void meta_access_test2(void) { asm volatile (" \ diff --git a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c index 7a13dbd794b2..893d3bb024a0 100644 --- a/tools/testing/selftests/bpf/progs/verifier_spill_fill.c +++ b/tools/testing/selftests/bpf/progs/verifier_spill_fill.c @@ -656,7 +656,7 @@ __msg("mark_precise: frame0: regs= stack=-8 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-8 before 5: (7b) *(u64 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r0 = 1") __msg("mark_precise: frame0: regs= stack=-8 before 3: (7a) *(u64 *)(r10 -8) = 1") -__msg("10: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") +__msg("10: R1=map_value(map=.data.two_byte_,ks=4,vs=2,imm=1) R2=1") /* validate load from fp-16, which was initialized using BPF_STX_MEM */ __msg("12: (79) r2 = *(u64 *)(r10 -16) ; R2=1 R10=fp0 fp-16=1") __msg("13: (0f) r1 += r2") @@ -673,7 +673,7 @@ __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-16 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-16 before 5: (7b) *(u64 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs=r0 stack= before 4: (b7) r0 = 1") -__msg("14: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") +__msg("14: R1=map_value(map=.data.two_byte_,ks=4,vs=2,imm=1) R2=1") __naked void stack_load_preserves_const_precision(void) { asm volatile ( @@ -732,7 +732,7 @@ __msg("mark_precise: frame0: regs= stack=-8 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-8 before 5: (63) *(u32 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs= stack=-8 before 4: (b7) r0 = 1") __msg("mark_precise: frame0: regs= stack=-8 before 3: (62) *(u32 *)(r10 -8) = 1") -__msg("10: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") +__msg("10: R1=map_value(map=.data.two_byte_,ks=4,vs=2,imm=1) R2=1") /* validate load from fp-16, which was initialized using BPF_STX_MEM */ __msg("12: (61) r2 = *(u32 *)(r10 -16) ; R2=1 R10=fp0 fp-16=????1") __msg("13: (0f) r1 += r2") @@ -748,7 +748,7 @@ __msg("mark_precise: frame0: last_idx 6 first_idx 3 subseq_idx 7") __msg("mark_precise: frame0: regs= stack=-16 before 6: (05) goto pc+0") __msg("mark_precise: frame0: regs= stack=-16 before 5: (63) *(u32 *)(r10 -16) = r0") __msg("mark_precise: frame0: regs=r0 stack= before 4: (b7) r0 = 1") -__msg("14: R1=map_value(map=.data.two_byte_,ks=4,vs=2,off=1) R2=1") +__msg("14: R1=map_value(map=.data.two_byte_,ks=4,vs=2,imm=1) R2=1") __naked void stack_load_preserves_const_precision_subreg(void) { asm volatile ( diff --git a/tools/testing/selftests/bpf/progs/verifier_stack_ptr.c b/tools/testing/selftests/bpf/progs/verifier_stack_ptr.c index 24aabc6083fd..8e8cf8232255 100644 --- a/tools/testing/selftests/bpf/progs/verifier_stack_ptr.c +++ b/tools/testing/selftests/bpf/progs/verifier_stack_ptr.c @@ -37,7 +37,7 @@ __naked void ptr_to_stack_store_load(void) SEC("socket") __description("PTR_TO_STACK store/load - bad alignment on off") -__failure __msg("misaligned stack access off 0+-8+2 size 8") +__failure __msg("misaligned stack access off -8+2 size 8") __failure_unpriv __naked void load_bad_alignment_on_off(void) { @@ -53,7 +53,7 @@ __naked void load_bad_alignment_on_off(void) SEC("socket") __description("PTR_TO_STACK store/load - bad alignment on reg") -__failure __msg("misaligned stack access off 0+-10+8 size 8") +__failure __msg("misaligned stack access off -10+8 size 8") __failure_unpriv __naked void load_bad_alignment_on_reg(void) { diff --git a/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c b/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c index af7938ce56cb..b3b701b44550 100644 --- a/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c +++ b/tools/testing/selftests/bpf/progs/verifier_value_ptr_arith.c @@ -346,7 +346,7 @@ l2_%=: r0 = 1; \ SEC("socket") __description("map access: value_ptr -= known scalar from different maps") __success __failure_unpriv -__msg_unpriv("R0 min value is outside of the allowed memory range") +__msg_unpriv("R0 min value is negative") __retval(1) __naked void known_scalar_from_different_maps(void) { @@ -683,9 +683,7 @@ l0_%=: r0 = 1; \ SEC("socket") __description("map access: value_ptr -= known scalar, lower oob arith, test 1") -__failure __msg("R0 min value is outside of the allowed memory range") -__failure_unpriv -__msg_unpriv("R0 pointer arithmetic of map value goes out of range") +__failure __msg("R0 min value is negative") __naked void lower_oob_arith_test_1(void) { asm volatile (" \ @@ -840,7 +838,7 @@ l0_%=: r0 = 1; \ SEC("socket") __description("map access: value_ptr += known scalar, 3") -__failure __msg("invalid access to map value") +__failure __msg("R0 min value is negative") __failure_unpriv __naked void value_ptr_known_scalar_3(void) { @@ -1207,7 +1205,7 @@ l0_%=: r0 = 1; \ SEC("socket") __description("map access: value_ptr -= known scalar") -__failure __msg("R0 min value is outside of the allowed memory range") +__failure __msg("R0 min value is negative") __failure_unpriv __naked void access_value_ptr_known_scalar(void) { diff --git a/tools/testing/selftests/bpf/progs/verifier_xdp_direct_packet_access.c b/tools/testing/selftests/bpf/progs/verifier_xdp_direct_packet_access.c index df2dfd1b15d1..0b86d95a4133 100644 --- a/tools/testing/selftests/bpf/progs/verifier_xdp_direct_packet_access.c +++ b/tools/testing/selftests/bpf/progs/verifier_xdp_direct_packet_access.c @@ -69,7 +69,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data' > pkt_end, bad access 1") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_end_bad_access_1_1(void) { @@ -131,7 +131,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data' > pkt_end, corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_1(void) { @@ -173,7 +173,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_end > pkt_data', corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_2(void) { @@ -279,7 +279,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data' < pkt_end, corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_3(void) { @@ -384,7 +384,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_end < pkt_data', bad access 1") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_data_bad_access_1_1(void) { @@ -446,7 +446,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_end < pkt_data', corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_4(void) { @@ -487,7 +487,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data' >= pkt_end, corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_5(void) { @@ -590,7 +590,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_end >= pkt_data', bad access 1") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_data_bad_access_1_2(void) { @@ -654,7 +654,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_end >= pkt_data', corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_6(void) { @@ -697,7 +697,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data' <= pkt_end, bad access 1") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_end_bad_access_1_2(void) { @@ -761,7 +761,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data' <= pkt_end, corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_7(void) { @@ -803,7 +803,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_end <= pkt_data', corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_8(void) { @@ -905,7 +905,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' > pkt_data, bad access 1") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_data_bad_access_1_3(void) { @@ -926,7 +926,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' > pkt_data, bad access 2") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_data_bad_access_2_5(void) { @@ -967,7 +967,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' > pkt_data, corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_9(void) { @@ -1009,7 +1009,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data > pkt_meta', corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_10(void) { @@ -1031,7 +1031,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data > pkt_meta', bad access 2") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_meta_bad_access_2_1(void) { @@ -1115,7 +1115,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' < pkt_data, corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_11(void) { @@ -1137,7 +1137,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' < pkt_data, bad access 2") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_data_bad_access_2_6(void) { @@ -1220,7 +1220,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data < pkt_meta', bad access 1") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_meta_bad_access_1_1(void) { @@ -1241,7 +1241,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data < pkt_meta', bad access 2") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_meta_bad_access_2_2(void) { @@ -1282,7 +1282,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data < pkt_meta', corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_12(void) { @@ -1323,7 +1323,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' >= pkt_data, corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_13(void) { @@ -1344,7 +1344,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' >= pkt_data, bad access 2") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_data_bad_access_2_7(void) { @@ -1426,7 +1426,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data >= pkt_meta', bad access 1") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_meta_bad_access_1_2(void) { @@ -1448,7 +1448,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data >= pkt_meta', bad access 2") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_meta_bad_access_2_3(void) { @@ -1490,7 +1490,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data >= pkt_meta', corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_14(void) { @@ -1533,7 +1533,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' <= pkt_data, bad access 1") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_data_bad_access_1_4(void) { @@ -1555,7 +1555,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' <= pkt_data, bad access 2") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_data_bad_access_2_8(void) { @@ -1597,7 +1597,7 @@ l1_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_meta' <= pkt_data, corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_15(void) { @@ -1639,7 +1639,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data <= pkt_meta', corner case -1, bad access") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void corner_case_1_bad_access_16(void) { @@ -1660,7 +1660,7 @@ l0_%=: r0 = 0; \ SEC("xdp") __description("XDP pkt read, pkt_data <= pkt_meta', bad access 2") -__failure __msg("R1 offset is outside of the packet") +__failure __msg("R1 {{min|max}} value is outside of the allowed memory range") __flag(BPF_F_ANY_ALIGNMENT) __naked void pkt_meta_bad_access_2_4(void) { diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 9ca83dce100d..86887130a0ef 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -220,7 +220,7 @@ }, .result_unpriv = REJECT, .result = REJECT, - .errstr = "variable trusted_ptr_ access var_off=(0x0; 0x7) disallowed", + .errstr = "R1 must have zero offset when passed to release func or trusted arg to kfunc", }, { "calls: invalid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID", -- cgit v1.2.3 From 3d91c618aca403a7f7d2064272f528a97b849475 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Thu, 12 Feb 2026 13:34:24 -0800 Subject: bpf: rename bpf_reg_state->off to bpf_reg_state->delta This field is now used only for linked scalar registers tracking. Rename it to 'delta' to better describe it's purpose: constant delta between "linked" scalars with the same ID. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20260212-ptrs-off-migration-v2-4-00820e4d3438@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 6 +++--- kernel/bpf/log.c | 8 ++++---- kernel/bpf/verifier.c | 18 +++++++++--------- 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index a97bdbf3a07b..c1e30096ea7b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -40,7 +40,7 @@ struct bpf_reg_state { /* * Constant delta between "linked" scalars with the same ID. */ - s32 off; + s32 delta; union { /* valid when type == PTR_TO_PACKET */ int range; @@ -145,9 +145,9 @@ struct bpf_reg_state { * Upper bit of ID is used to remember relationship between "linked" * registers. Example: * r1 = r2; both will have r1->id == r2->id == N - * r1 += 10; r1->id == N | BPF_ADD_CONST and r1->off == 10 + * r1 += 10; r1->id == N | BPF_ADD_CONST and r1->delta == 10 * r3 = r2; both will have r3->id == r2->id == N - * w3 += 10; r3->id == N | BPF_ADD_CONST32 and r3->off == 10 + * w3 += 10; r3->id == N | BPF_ADD_CONST32 and r3->delta == 10 */ #define BPF_ADD_CONST64 (1U << 31) #define BPF_ADD_CONST32 (1U << 30) diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 39a731392d65..37d72b052192 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -694,7 +694,7 @@ static void print_reg_state(struct bpf_verifier_env *env, if (state->frameno != reg->frameno) verbose(env, "[%d]", reg->frameno); if (tnum_is_const(reg->var_off)) { - verbose_snum(env, reg->var_off.value + reg->off); + verbose_snum(env, reg->var_off.value + reg->delta); return; } } @@ -704,7 +704,7 @@ static void print_reg_state(struct bpf_verifier_env *env, if (reg->id) verbose_a("id=%d", reg->id & ~BPF_ADD_CONST); if (reg->id & BPF_ADD_CONST) - verbose(env, "%+d", reg->off); + verbose(env, "%+d", reg->delta); if (reg->ref_obj_id) verbose_a("ref_obj_id=%d", reg->ref_obj_id); if (type_is_non_owning_ref(reg->type)) @@ -716,9 +716,9 @@ static void print_reg_state(struct bpf_verifier_env *env, reg->map_ptr->key_size, reg->map_ptr->value_size); } - if (t != SCALAR_VALUE && reg->off) { + if (t != SCALAR_VALUE && reg->delta) { verbose_a("off="); - verbose_snum(env, reg->off); + verbose_snum(env, reg->delta); } if (type_is_pkt_pointer(t)) { verbose_a("r="); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2c5794dad668..0162f946032f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5095,7 +5095,7 @@ static void assign_scalar_id_before_mov(struct bpf_verifier_env *env, * Cleared it, since multiple rX += const are not supported. */ src_reg->id = 0; - src_reg->off = 0; + src_reg->delta = 0; } if (!src_reg->id && !tnum_is_const(src_reg->var_off)) @@ -16219,14 +16219,14 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * we cannot accumulate another val into rx->off. */ clear_id: - dst_reg->off = 0; + dst_reg->delta = 0; dst_reg->id = 0; } else { if (alu32) dst_reg->id |= BPF_ADD_CONST32; else dst_reg->id |= BPF_ADD_CONST64; - dst_reg->off = off; + dst_reg->delta = off; } } else { /* @@ -17305,18 +17305,18 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST)) continue; if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) || - reg->off == known_reg->off) { + reg->delta == known_reg->delta) { s32 saved_subreg_def = reg->subreg_def; copy_register_state(reg, known_reg); reg->subreg_def = saved_subreg_def; } else { s32 saved_subreg_def = reg->subreg_def; - s32 saved_off = reg->off; + s32 saved_off = reg->delta; u32 saved_id = reg->id; fake_reg.type = SCALAR_VALUE; - __mark_reg_known(&fake_reg, (s64)reg->off - (s64)known_reg->off); + __mark_reg_known(&fake_reg, (s64)reg->delta - (s64)known_reg->delta); /* reg = known_reg; reg += delta */ copy_register_state(reg, known_reg); @@ -17324,7 +17324,7 @@ static void sync_linked_regs(struct bpf_verifier_env *env, struct bpf_verifier_s * Must preserve off, id and subreg_def flag, * otherwise another sync_linked_regs() will be incorrect. */ - reg->off = saved_off; + reg->delta = saved_off; reg->id = saved_id; reg->subreg_def = saved_subreg_def; @@ -19629,7 +19629,7 @@ static void clear_singular_ids(struct bpf_verifier_env *env, continue; if (idset_cnt_get(idset, reg->id & ~BPF_ADD_CONST) == 1) { reg->id = 0; - reg->off = 0; + reg->delta = 0; } })); } @@ -19766,7 +19766,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return false; /* Both have offset linkage: offsets must match */ - if ((rold->id & BPF_ADD_CONST) && rold->off != rcur->off) + if ((rold->id & BPF_ADD_CONST) && rold->delta != rcur->delta) return false; if (!check_scalar_ids(rold->id, rcur->id, idmap)) -- cgit v1.2.3 From a235d3bcd28ba2d472f5b4cb6b259baeab75bafd Mon Sep 17 00:00:00 2001 From: Kundan Kumar Date: Fri, 13 Feb 2026 11:16:31 +0530 Subject: writeback: prep helpers for dirty-limit and writeback accounting Add helper APIs needed by filesystems to avoid poking into writeback internals. Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Reviewed-by: Andreas Gruenbacher Suggested-by: Christoph Hellwig Signed-off-by: Kundan Kumar Signed-off-by: Anuj Gupta Link: https://patch.msgid.link/20260213054634.79785-2-kundan.kumar@samsung.com Signed-off-by: Christian Brauner --- include/linux/backing-dev.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0c8342747cab..5b7d12b40d5e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -136,6 +136,19 @@ static inline bool mapping_can_writeback(struct address_space *mapping) return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; } +/* Must not be used by file systems that support cgroup writeback */ +static inline int bdi_wb_dirty_exceeded(struct backing_dev_info *bdi) +{ + return bdi->wb.dirty_exceeded; +} + +/* Must not be used by file systems that support cgroup writeback */ +static inline void bdi_wb_stat_mod(struct inode *inode, enum wb_stat_item item, + s64 amount) +{ + wb_stat_mod(&inode_to_bdi(inode)->wb, item, amount); +} + #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, -- cgit v1.2.3 From 0d799df5b147e08828887fe7299efd7a9e0eea40 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 19 Feb 2026 07:50:01 +0100 Subject: fs: mark bool_names static The bool_names array is only used in fs_parser.c so mark it static. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260219065014.3550402-2-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs_parser.c | 3 +-- include/linux/fs_parser.h | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/fs_parser.c b/fs/fs_parser.c index c092a9f79e32..46993e31137d 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -13,7 +13,7 @@ #include #include "internal.h" -const struct constant_table bool_names[] = { +static const struct constant_table bool_names[] = { { "0", false }, { "1", true }, { "false", false }, @@ -22,7 +22,6 @@ const struct constant_table bool_names[] = { { "yes", true }, { }, }; -EXPORT_SYMBOL(bool_names); static const struct constant_table * __lookup_constant(const struct constant_table *tbl, const char *name) diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 5e8a3b546033..ac8253cca2bc 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -84,8 +84,6 @@ extern int fs_lookup_param(struct fs_context *fc, extern int lookup_constant(const struct constant_table tbl[], const char *name, int not_found); -extern const struct constant_table bool_names[]; - #ifdef CONFIG_VALIDATE_FS_PARSER extern bool fs_validate_description(const char *name, const struct fs_parameter_spec *desc); -- cgit v1.2.3 From 8823db29744fceda9f94e674f74deea446c620b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 19 Feb 2026 07:50:02 +0100 Subject: fs: remove fsparam_blob / fs_param_is_blob These are not used anywhere even after the fs_context conversion is finished, so remove them. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260219065014.3550402-3-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- Documentation/filesystems/mount_api.rst | 2 -- fs/fs_parser.c | 9 --------- include/linux/fs_parser.h | 3 +-- 3 files changed, 1 insertion(+), 13 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/mount_api.rst b/Documentation/filesystems/mount_api.rst index a064234fed5b..b4a0f23914a6 100644 --- a/Documentation/filesystems/mount_api.rst +++ b/Documentation/filesystems/mount_api.rst @@ -647,7 +647,6 @@ The members are as follows: fs_param_is_u64 64-bit unsigned int result->uint_64 fs_param_is_enum Enum value name result->uint_32 fs_param_is_string Arbitrary string param->string - fs_param_is_blob Binary blob param->blob fs_param_is_blockdev Blockdev path * Needs lookup fs_param_is_path Path * Needs lookup fs_param_is_fd File descriptor result->int_32 @@ -681,7 +680,6 @@ The members are as follows: fsparam_u64() fs_param_is_u64 fsparam_enum() fs_param_is_enum fsparam_string() fs_param_is_string - fsparam_blob() fs_param_is_blob fsparam_bdev() fs_param_is_blockdev fsparam_path() fs_param_is_path fsparam_fd() fs_param_is_fd diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 46993e31137d..79e8fe9176fa 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -277,15 +277,6 @@ int fs_param_is_string(struct p_log *log, const struct fs_parameter_spec *p, } EXPORT_SYMBOL(fs_param_is_string); -int fs_param_is_blob(struct p_log *log, const struct fs_parameter_spec *p, - struct fs_parameter *param, struct fs_parse_result *result) -{ - if (param->type != fs_value_is_blob) - return fs_param_bad_value(log, param); - return 0; -} -EXPORT_SYMBOL(fs_param_is_blob); - int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p, struct fs_parameter *param, struct fs_parse_result *result) { diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index ac8253cca2bc..961562b101c5 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -27,7 +27,7 @@ typedef int fs_param_type(struct p_log *, * The type of parameter expected. */ fs_param_type fs_param_is_bool, fs_param_is_u32, fs_param_is_s32, fs_param_is_u64, - fs_param_is_enum, fs_param_is_string, fs_param_is_blob, fs_param_is_blockdev, + fs_param_is_enum, fs_param_is_string, fs_param_is_blockdev, fs_param_is_path, fs_param_is_fd, fs_param_is_uid, fs_param_is_gid, fs_param_is_file_or_string; @@ -125,7 +125,6 @@ static inline bool fs_validate_description(const char *name, #define fsparam_enum(NAME, OPT, array) __fsparam(fs_param_is_enum, NAME, OPT, 0, array) #define fsparam_string(NAME, OPT) \ __fsparam(fs_param_is_string, NAME, OPT, 0, NULL) -#define fsparam_blob(NAME, OPT) __fsparam(fs_param_is_blob, NAME, OPT, 0, NULL) #define fsparam_bdev(NAME, OPT) __fsparam(fs_param_is_blockdev, NAME, OPT, 0, NULL) #define fsparam_path(NAME, OPT) __fsparam(fs_param_is_path, NAME, OPT, 0, NULL) #define fsparam_fd(NAME, OPT) __fsparam(fs_param_is_fd, NAME, OPT, 0, NULL) -- cgit v1.2.3 From d2f2f7cf8e898f7b80fe031a263df9c7de94b0b7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 19 Feb 2026 07:50:03 +0100 Subject: fs: remove fsparam_path / fs_param_is_path These are not used anywhere even after the fs_context conversion is finished, so remove them. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260219065014.3550402-4-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- Documentation/filesystems/mount_api.rst | 2 -- fs/fs_parser.c | 7 ------- include/linux/fs_parser.h | 3 +-- 3 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/mount_api.rst b/Documentation/filesystems/mount_api.rst index b4a0f23914a6..e8b94357b4df 100644 --- a/Documentation/filesystems/mount_api.rst +++ b/Documentation/filesystems/mount_api.rst @@ -648,7 +648,6 @@ The members are as follows: fs_param_is_enum Enum value name result->uint_32 fs_param_is_string Arbitrary string param->string fs_param_is_blockdev Blockdev path * Needs lookup - fs_param_is_path Path * Needs lookup fs_param_is_fd File descriptor result->int_32 fs_param_is_uid User ID (u32) result->uid fs_param_is_gid Group ID (u32) result->gid @@ -681,7 +680,6 @@ The members are as follows: fsparam_enum() fs_param_is_enum fsparam_string() fs_param_is_string fsparam_bdev() fs_param_is_blockdev - fsparam_path() fs_param_is_path fsparam_fd() fs_param_is_fd fsparam_uid() fs_param_is_uid fsparam_gid() fs_param_is_gid diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 79e8fe9176fa..b4cc4cce518a 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -361,13 +361,6 @@ int fs_param_is_blockdev(struct p_log *log, const struct fs_parameter_spec *p, } EXPORT_SYMBOL(fs_param_is_blockdev); -int fs_param_is_path(struct p_log *log, const struct fs_parameter_spec *p, - struct fs_parameter *param, struct fs_parse_result *result) -{ - return 0; -} -EXPORT_SYMBOL(fs_param_is_path); - #ifdef CONFIG_VALIDATE_FS_PARSER /** * fs_validate_description - Validate a parameter specification array diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 961562b101c5..98b83708f92b 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -28,7 +28,7 @@ typedef int fs_param_type(struct p_log *, */ fs_param_type fs_param_is_bool, fs_param_is_u32, fs_param_is_s32, fs_param_is_u64, fs_param_is_enum, fs_param_is_string, fs_param_is_blockdev, - fs_param_is_path, fs_param_is_fd, fs_param_is_uid, fs_param_is_gid, + fs_param_is_fd, fs_param_is_uid, fs_param_is_gid, fs_param_is_file_or_string; /* @@ -126,7 +126,6 @@ static inline bool fs_validate_description(const char *name, #define fsparam_string(NAME, OPT) \ __fsparam(fs_param_is_string, NAME, OPT, 0, NULL) #define fsparam_bdev(NAME, OPT) __fsparam(fs_param_is_blockdev, NAME, OPT, 0, NULL) -#define fsparam_path(NAME, OPT) __fsparam(fs_param_is_path, NAME, OPT, 0, NULL) #define fsparam_fd(NAME, OPT) __fsparam(fs_param_is_fd, NAME, OPT, 0, NULL) #define fsparam_file_or_string(NAME, OPT) \ __fsparam(fs_param_is_file_or_string, NAME, OPT, 0, NULL) -- cgit v1.2.3 From 95cef38e70250234a254e6228eb7342b6deaaffa Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 17 Feb 2026 16:56:18 +0100 Subject: firmware: google: Export coreboot table entries Move types for coreboot table entries to . Allows drivers in other subsystems to use these structures. Signed-off-by: Thomas Zimmermann Acked-by: Tzung-Bi Shih Acked-by: Julius Werner Link: https://patch.msgid.link/20260217155836.96267-9-tzimmermann@suse.de --- MAINTAINERS | 1 + drivers/firmware/google/coreboot_table.c | 10 ++++ drivers/firmware/google/coreboot_table.h | 60 +---------------------- drivers/firmware/google/framebuffer-coreboot.c | 2 - include/linux/coreboot.h | 66 ++++++++++++++++++++++++++ 5 files changed, 78 insertions(+), 61 deletions(-) create mode 100644 include/linux/coreboot.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 4a2d5e8f0f63..d0dfcfd15e59 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10753,6 +10753,7 @@ L: chrome-platform@lists.linux.dev S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/chrome-platform/linux.git F: drivers/firmware/google/ +F: include/linux/coreboot.h GOOGLE TENSOR SoC SUPPORT M: Peter Griffin diff --git a/drivers/firmware/google/coreboot_table.c b/drivers/firmware/google/coreboot_table.c index a031d6fe6bc5..c769631ea15d 100644 --- a/drivers/firmware/google/coreboot_table.c +++ b/drivers/firmware/google/coreboot_table.c @@ -22,6 +22,16 @@ #include "coreboot_table.h" +/* Coreboot table header structure */ +struct coreboot_table_header { + char signature[4]; + u32 header_bytes; + u32 header_checksum; + u32 table_bytes; + u32 table_checksum; + u32 table_entries; +}; + #define CB_DEV(d) container_of(d, struct coreboot_device, dev) #define CB_DRV(d) container_of_const(d, struct coreboot_driver, drv) diff --git a/drivers/firmware/google/coreboot_table.h b/drivers/firmware/google/coreboot_table.h index 17e9e5c3f6e1..616ca3903e5c 100644 --- a/drivers/firmware/google/coreboot_table.h +++ b/drivers/firmware/google/coreboot_table.h @@ -12,67 +12,9 @@ #ifndef __COREBOOT_TABLE_H #define __COREBOOT_TABLE_H +#include #include -struct coreboot_device_id; - -/* Coreboot table header structure */ -struct coreboot_table_header { - char signature[4]; - u32 header_bytes; - u32 header_checksum; - u32 table_bytes; - u32 table_checksum; - u32 table_entries; -}; - -/* List of coreboot entry structures that is used */ -/* Generic */ -struct coreboot_table_entry { - u32 tag; - u32 size; -}; - -/* Points to a CBMEM entry */ -struct lb_cbmem_ref { - u32 tag; - u32 size; - - u64 cbmem_addr; -}; - -#define LB_TAG_CBMEM_ENTRY 0x31 - -/* Corresponds to LB_TAG_CBMEM_ENTRY */ -struct lb_cbmem_entry { - u32 tag; - u32 size; - - u64 address; - u32 entry_size; - u32 id; -}; - -/* Describes framebuffer setup by coreboot */ -struct lb_framebuffer { - u32 tag; - u32 size; - - u64 physical_address; - u32 x_resolution; - u32 y_resolution; - u32 bytes_per_line; - u8 bits_per_pixel; - u8 red_mask_pos; - u8 red_mask_size; - u8 green_mask_pos; - u8 green_mask_size; - u8 blue_mask_pos; - u8 blue_mask_size; - u8 reserved_mask_pos; - u8 reserved_mask_size; -}; - /* A device, additionally with information from coreboot. */ struct coreboot_device { struct device dev; diff --git a/drivers/firmware/google/framebuffer-coreboot.c b/drivers/firmware/google/framebuffer-coreboot.c index 81aa522edb1e..fab3f28655d3 100644 --- a/drivers/firmware/google/framebuffer-coreboot.c +++ b/drivers/firmware/google/framebuffer-coreboot.c @@ -21,8 +21,6 @@ #include "coreboot_table.h" -#define CB_TAG_FRAMEBUFFER 0x12 - #if defined(CONFIG_PCI) static bool framebuffer_pci_dev_is_enabled(struct pci_dev *pdev) { diff --git a/include/linux/coreboot.h b/include/linux/coreboot.h new file mode 100644 index 000000000000..48705b439c6e --- /dev/null +++ b/include/linux/coreboot.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * coreboot.h + * + * Coreboot device and driver interfaces. + * + * Copyright 2014 Gerd Hoffmann + * Copyright 2017 Google Inc. + * Copyright 2017 Samuel Holland + */ + +#ifndef _LINUX_COREBOOT_H +#define _LINUX_COREBOOT_H + +#include + +/* List of coreboot entry structures that is used */ + +#define CB_TAG_FRAMEBUFFER 0x12 +#define LB_TAG_CBMEM_ENTRY 0x31 + +/* Generic */ +struct coreboot_table_entry { + u32 tag; + u32 size; +}; + +/* Points to a CBMEM entry */ +struct lb_cbmem_ref { + u32 tag; + u32 size; + + u64 cbmem_addr; +}; + +/* Corresponds to LB_TAG_CBMEM_ENTRY */ +struct lb_cbmem_entry { + u32 tag; + u32 size; + + u64 address; + u32 entry_size; + u32 id; +}; + +/* Describes framebuffer setup by coreboot */ +struct lb_framebuffer { + u32 tag; + u32 size; + + u64 physical_address; + u32 x_resolution; + u32 y_resolution; + u32 bytes_per_line; + u8 bits_per_pixel; + u8 red_mask_pos; + u8 red_mask_size; + u8 green_mask_pos; + u8 green_mask_size; + u8 blue_mask_pos; + u8 blue_mask_size; + u8 reserved_mask_pos; + u8 reserved_mask_size; +}; + +#endif /* _LINUX_COREBOOT_H */ -- cgit v1.2.3 From 27fc52b5505a3acca96b884a4bf1345344e5a566 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 17 Feb 2026 16:56:19 +0100 Subject: firmware: google: Pack structures for coreboot table entries Pack the fields in the coreboot table entries. These entries are part of the coreboot ABI, so they don't follow regular calling conventions. Fields of type u64 are aligned to boundaries of 4 bytes instead of 8. [1] So far this has not been a problem. In the future, padding bytes should be added where explicit alignment is required. Signed-off-by: Thomas Zimmermann Link: https://github.com/coreboot/coreboot/blob/main/payloads/libpayload/include/coreboot_tables.h#L96 # [1] Suggested-by: Julius Werner Acked-by: Julius Werner Acked-by: Tzung-Bi Shih Link: https://patch.msgid.link/20260217155836.96267-10-tzimmermann@suse.de --- include/linux/coreboot.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coreboot.h b/include/linux/coreboot.h index 48705b439c6e..5746b99a070d 100644 --- a/include/linux/coreboot.h +++ b/include/linux/coreboot.h @@ -12,8 +12,11 @@ #ifndef _LINUX_COREBOOT_H #define _LINUX_COREBOOT_H +#include #include +typedef __aligned(4) u64 cb_u64; + /* List of coreboot entry structures that is used */ #define CB_TAG_FRAMEBUFFER 0x12 @@ -30,7 +33,7 @@ struct lb_cbmem_ref { u32 tag; u32 size; - u64 cbmem_addr; + cb_u64 cbmem_addr; }; /* Corresponds to LB_TAG_CBMEM_ENTRY */ @@ -38,7 +41,7 @@ struct lb_cbmem_entry { u32 tag; u32 size; - u64 address; + cb_u64 address; u32 entry_size; u32 id; }; @@ -48,7 +51,7 @@ struct lb_framebuffer { u32 tag; u32 size; - u64 physical_address; + cb_u64 physical_address; u32 x_resolution; u32 y_resolution; u32 bytes_per_line; -- cgit v1.2.3 From a29a1f0ec8d69ee917a9d4c84b844df0decff0ef Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 17 Feb 2026 16:56:21 +0100 Subject: drm/sysfb: corebootdrm: Add DRM driver for coreboot framebuffers Add corebootdrm, a DRM driver for coreboot framebuffers. The driver supports a pre-initialized framebuffer with various packed RGB formats. The driver code is fairly small and uses the same logic as the other sysfb drivers. Most of the implementation comes from existing sysfb helpers. Until now, coreboot relied on simpledrm or simplefb for boot-up graphics output. Initialize the platform device for corebootdrm in the same place in framebuffer_probe(). With a later commit, the simple-framebuffer should be removed. v4: - sort include statements (Tzung-Bi) v3: - comment on _HAS_LFB semantics (Tzung-Bi) - fix typo in commit description (Tzung-Bi) - comment on simple-framebuffer being obsolete for coreboot v2: - reimplement as platform driver - limit resources and mappings to known framebuffer memory; no page alignment - create corebootdrm device from coreboot framebuffer code Signed-off-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Acked-by: Julius Werner Acked-by: Tzung-Bi Shih # coreboot Link: https://patch.msgid.link/20260217155836.96267-12-tzimmermann@suse.de --- drivers/firmware/google/Kconfig | 3 +- drivers/firmware/google/framebuffer-coreboot.c | 22 +- drivers/gpu/drm/sysfb/Kconfig | 16 + drivers/gpu/drm/sysfb/Makefile | 1 + drivers/gpu/drm/sysfb/corebootdrm.c | 412 +++++++++++++++++++++++++ include/linux/coreboot.h | 8 + 6 files changed, 458 insertions(+), 4 deletions(-) create mode 100644 drivers/gpu/drm/sysfb/corebootdrm.c (limited to 'include/linux') diff --git a/drivers/firmware/google/Kconfig b/drivers/firmware/google/Kconfig index 3ab3e089328b..b78c644fa253 100644 --- a/drivers/firmware/google/Kconfig +++ b/drivers/firmware/google/Kconfig @@ -63,7 +63,8 @@ config GOOGLE_FRAMEBUFFER_COREBOOT help This option enables the kernel to search for a framebuffer in the coreboot table. If found, it is registered with a platform - device of type simple-framebuffer. + device of type coreboot-framebuffer. Using the old device of + type simple-framebuffer is deprecated. config GOOGLE_MEMCONSOLE_COREBOOT tristate "Firmware Memory Console" diff --git a/drivers/firmware/google/framebuffer-coreboot.c b/drivers/firmware/google/framebuffer-coreboot.c index fab3f28655d3..2c63a9bd0dcb 100644 --- a/drivers/firmware/google/framebuffer-coreboot.c +++ b/drivers/firmware/google/framebuffer-coreboot.c @@ -76,22 +76,23 @@ static struct device *framebuffer_parent_dev(struct resource *res) return NULL; } -static const struct simplefb_format formats[] = SIMPLEFB_FORMATS; - static int framebuffer_probe(struct coreboot_device *dev) { - int i; struct lb_framebuffer *fb = &dev->framebuffer; struct device *parent; struct platform_device *pdev; struct resource res; int ret; +#if !IS_ENABLED(CONFIG_DRM_COREBOOTDRM) struct simplefb_platform_data pdata = { .width = fb->x_resolution, .height = fb->y_resolution, .stride = fb->bytes_per_line, .format = NULL, }; + int i; + static const struct simplefb_format formats[] = SIMPLEFB_FORMATS; +#endif /* * On coreboot systems, the advertised LB_TAG_FRAMEBUFFER entry @@ -118,6 +119,20 @@ static int framebuffer_probe(struct coreboot_device *dev) if (IS_ERR(parent)) return PTR_ERR(parent); +#if IS_ENABLED(CONFIG_DRM_COREBOOTDRM) + pdev = platform_device_register_resndata(parent, "coreboot-framebuffer", 0, + &res, 1, fb, fb->size); + if (IS_ERR(pdev)) { + pr_warn("coreboot: could not register framebuffer\n"); + ret = PTR_ERR(pdev); + goto out_put_device_parent; + } +#else + /* + * FIXME: Coreboot systems should use a driver that binds to + * coreboot-framebuffer devices. Remove support for + * simple-framebuffer at some point. + */ for (i = 0; i < ARRAY_SIZE(formats); ++i) { if (fb->bits_per_pixel == formats[i].bits_per_pixel && fb->red_mask_pos == formats[i].red.offset && @@ -142,6 +157,7 @@ static int framebuffer_probe(struct coreboot_device *dev) pr_warn("coreboot: could not register framebuffer\n"); goto out_put_device_parent; } +#endif ret = 0; diff --git a/drivers/gpu/drm/sysfb/Kconfig b/drivers/gpu/drm/sysfb/Kconfig index 9c9884c7efc6..2559ead6cf1f 100644 --- a/drivers/gpu/drm/sysfb/Kconfig +++ b/drivers/gpu/drm/sysfb/Kconfig @@ -7,6 +7,22 @@ config DRM_SYSFB_HELPER tristate depends on DRM +config DRM_COREBOOTDRM + tristate "Coreboot framebuffer driver" + depends on DRM && MMU + depends on GOOGLE_FRAMEBUFFER_COREBOOT + select APERTURE_HELPERS + select DRM_CLIENT_SELECTION + select DRM_GEM_SHMEM_HELPER + select DRM_KMS_HELPER + select DRM_SYSFB_HELPER + help + DRM driver for coreboot-provided framebuffers. + + This driver assumes that the display hardware has been initialized + by coreboot firmware before the kernel boots. Scanout buffer, size, + and display format must be provided via coreboot framebuffer device. + config DRM_EFIDRM tristate "EFI framebuffer driver" depends on DRM && MMU && EFI && (!SYSFB_SIMPLEFB || COMPILE_TEST) diff --git a/drivers/gpu/drm/sysfb/Makefile b/drivers/gpu/drm/sysfb/Makefile index a156c496413d..85c9087ab03d 100644 --- a/drivers/gpu/drm/sysfb/Makefile +++ b/drivers/gpu/drm/sysfb/Makefile @@ -6,6 +6,7 @@ drm_sysfb_helper-y := \ drm_sysfb_helper-$(CONFIG_SCREEN_INFO) += drm_sysfb_screen_info.o obj-$(CONFIG_DRM_SYSFB_HELPER) += drm_sysfb_helper.o +obj-$(CONFIG_DRM_COREBOOTDRM) += corebootdrm.o obj-$(CONFIG_DRM_EFIDRM) += efidrm.o obj-$(CONFIG_DRM_OFDRM) += ofdrm.o obj-$(CONFIG_DRM_SIMPLEDRM) += simpledrm.o diff --git a/drivers/gpu/drm/sysfb/corebootdrm.c b/drivers/gpu/drm/sysfb/corebootdrm.c new file mode 100644 index 000000000000..745318580a5d --- /dev/null +++ b/drivers/gpu/drm/sysfb/corebootdrm.c @@ -0,0 +1,412 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "drm_sysfb_helper.h" + +#define DRIVER_NAME "corebootdrm" +#define DRIVER_DESC "DRM driver for Coreboot framebuffers" +#define DRIVER_MAJOR 1 +#define DRIVER_MINOR 0 + +static const struct drm_format_info * +corebootdrm_get_format_fb(struct drm_device *dev, const struct lb_framebuffer *fb) +{ + static const struct drm_sysfb_format formats[] = { + { PIXEL_FORMAT_XRGB1555, DRM_FORMAT_XRGB1555, }, + { PIXEL_FORMAT_RGB565, DRM_FORMAT_RGB565, }, + { PIXEL_FORMAT_RGB888, DRM_FORMAT_RGB888, }, + { PIXEL_FORMAT_XRGB8888, DRM_FORMAT_XRGB8888, }, + { PIXEL_FORMAT_XBGR8888, DRM_FORMAT_XBGR8888, }, + { PIXEL_FORMAT_XRGB2101010, DRM_FORMAT_XRGB2101010, }, + }; + const struct pixel_format pixel = { + .bits_per_pixel = fb->bits_per_pixel, + .indexed = false, + .alpha = { + .offset = 0, + .length = 0, + }, + .red = { + .offset = fb->red_mask_pos, + .length = fb->red_mask_size, + }, + .green = { + .offset = fb->green_mask_pos, + .length = fb->green_mask_size, + }, + .blue = { + .offset = fb->blue_mask_pos, + .length = fb->blue_mask_size, + }, + }; + + return drm_sysfb_get_format(dev, formats, ARRAY_SIZE(formats), &pixel); +} + +static int corebootdrm_get_width_fb(struct drm_device *dev, const struct lb_framebuffer *fb) +{ + return drm_sysfb_get_validated_int0(dev, "width", fb->x_resolution, INT_MAX); +} + +static int corebootdrm_get_height_fb(struct drm_device *dev, const struct lb_framebuffer *fb) +{ + return drm_sysfb_get_validated_int0(dev, "height", fb->y_resolution, INT_MAX); +} + +static int corebootdrm_get_pitch_fb(struct drm_device *dev, const struct drm_format_info *format, + unsigned int width, const struct lb_framebuffer *fb) +{ + u64 bytes_per_line = fb->bytes_per_line; + + if (!bytes_per_line) + bytes_per_line = drm_format_info_min_pitch(format, 0, width); + + return drm_sysfb_get_validated_int0(dev, "pitch", bytes_per_line, INT_MAX); +} + +static resource_size_t corebootdrm_get_size_fb(struct drm_device *dev, unsigned int height, + unsigned int pitch, + const struct lb_framebuffer *fb) +{ + resource_size_t size; + + if (check_mul_overflow(height, pitch, &size)) + return 0; + + return size; +} + +static phys_addr_t corebootdrm_get_address_fb(struct drm_device *dev, resource_size_t size, + const struct lb_framebuffer *fb) +{ + if (size > PHYS_ADDR_MAX) + return 0; + if (!fb->physical_address) + return 0; + if (fb->physical_address > (PHYS_ADDR_MAX - size)) + return 0; + + return fb->physical_address; +} + +/* + * Simple Framebuffer device + */ + +struct corebootdrm_device { + struct drm_sysfb_device sysfb; + + /* modesetting */ + u32 formats[DRM_SYSFB_PLANE_NFORMATS(1)]; + struct drm_plane primary_plane; + struct drm_crtc crtc; + struct drm_encoder encoder; + struct drm_connector connector; +}; + +/* + * Modesetting + */ + +static const u64 corebootdrm_primary_plane_format_modifiers[] = { + DRM_SYSFB_PLANE_FORMAT_MODIFIERS, +}; + +static const struct drm_plane_helper_funcs corebootdrm_primary_plane_helper_funcs = { + DRM_SYSFB_PLANE_HELPER_FUNCS, +}; + +static const struct drm_plane_funcs corebootdrm_primary_plane_funcs = { + DRM_SYSFB_PLANE_FUNCS, + .destroy = drm_plane_cleanup, +}; + +static const struct drm_crtc_helper_funcs corebootdrm_crtc_helper_funcs = { + DRM_SYSFB_CRTC_HELPER_FUNCS, +}; + +static const struct drm_crtc_funcs corebootdrm_crtc_funcs = { + DRM_SYSFB_CRTC_FUNCS, + .destroy = drm_crtc_cleanup, +}; + +static const struct drm_encoder_funcs corebootdrm_encoder_funcs = { + .destroy = drm_encoder_cleanup, +}; + +static const struct drm_connector_helper_funcs corebootdrm_connector_helper_funcs = { + DRM_SYSFB_CONNECTOR_HELPER_FUNCS, +}; + +static const struct drm_connector_funcs corebootdrm_connector_funcs = { + DRM_SYSFB_CONNECTOR_FUNCS, + .destroy = drm_connector_cleanup, +}; + +static const struct drm_mode_config_funcs corebootdrm_mode_config_funcs = { + DRM_SYSFB_MODE_CONFIG_FUNCS, +}; + +static int corebootdrm_mode_config_init(struct corebootdrm_device *cdev) +{ + struct drm_sysfb_device *sysfb = &cdev->sysfb; + struct drm_device *dev = &sysfb->dev; + const struct drm_format_info *format = sysfb->fb_format; + unsigned int width = sysfb->fb_mode.hdisplay; + unsigned int height = sysfb->fb_mode.vdisplay; + struct drm_plane *primary_plane; + struct drm_crtc *crtc; + struct drm_encoder *encoder; + struct drm_connector *connector; + size_t nformats; + int ret; + + ret = drmm_mode_config_init(dev); + if (ret) + return ret; + + dev->mode_config.min_width = width; + dev->mode_config.max_width = max_t(unsigned int, width, DRM_SHADOW_PLANE_MAX_WIDTH); + dev->mode_config.min_height = height; + dev->mode_config.max_height = max_t(unsigned int, height, DRM_SHADOW_PLANE_MAX_HEIGHT); + dev->mode_config.funcs = &corebootdrm_mode_config_funcs; + dev->mode_config.preferred_depth = format->depth; + + /* Primary plane */ + + nformats = drm_sysfb_build_fourcc_list(dev, &format->format, 1, + cdev->formats, ARRAY_SIZE(cdev->formats)); + + primary_plane = &cdev->primary_plane; + ret = drm_universal_plane_init(dev, primary_plane, 0, &corebootdrm_primary_plane_funcs, + cdev->formats, nformats, + corebootdrm_primary_plane_format_modifiers, + DRM_PLANE_TYPE_PRIMARY, NULL); + if (ret) + return ret; + drm_plane_helper_add(primary_plane, &corebootdrm_primary_plane_helper_funcs); + drm_plane_enable_fb_damage_clips(primary_plane); + + /* CRTC */ + + crtc = &cdev->crtc; + ret = drm_crtc_init_with_planes(dev, crtc, primary_plane, NULL, + &corebootdrm_crtc_funcs, NULL); + if (ret) + return ret; + drm_crtc_helper_add(crtc, &corebootdrm_crtc_helper_funcs); + + /* Encoder */ + + encoder = &cdev->encoder; + ret = drm_encoder_init(dev, encoder, &corebootdrm_encoder_funcs, + DRM_MODE_ENCODER_NONE, NULL); + if (ret) + return ret; + encoder->possible_crtcs = drm_crtc_mask(crtc); + + /* Connector */ + + connector = &cdev->connector; + ret = drm_connector_init(dev, connector, &corebootdrm_connector_funcs, + DRM_MODE_CONNECTOR_Unknown); + if (ret) + return ret; + drm_connector_helper_add(connector, &corebootdrm_connector_helper_funcs); + drm_connector_set_panel_orientation_with_quirk(connector, + DRM_MODE_PANEL_ORIENTATION_UNKNOWN, + width, height); + + ret = drm_connector_attach_encoder(connector, encoder); + if (ret) + return ret; + + return 0; +} + +/* + * DRM driver + */ + +DEFINE_DRM_GEM_FOPS(corebootdrm_fops); + +static struct drm_driver corebootdrm_drm_driver = { + DRM_GEM_SHMEM_DRIVER_OPS, + DRM_FBDEV_SHMEM_DRIVER_OPS, + .name = DRIVER_NAME, + .desc = DRIVER_DESC, + .major = DRIVER_MAJOR, + .minor = DRIVER_MINOR, + .driver_features = DRIVER_ATOMIC | DRIVER_GEM | DRIVER_MODESET, + .fops = &corebootdrm_fops, +}; + +/* + * Coreboot driver + */ + +static int corebootdrm_probe(struct platform_device *pdev) +{ + const struct lb_framebuffer *fb = dev_get_platdata(&pdev->dev); + struct corebootdrm_device *cdev; + struct drm_sysfb_device *sysfb; + struct drm_device *dev; + const struct drm_format_info *format; + int width, height, pitch; + resource_size_t size; + phys_addr_t address; + struct resource *res, *mem = NULL; + struct resource aperture; + void __iomem *screen_base; + int ret; + + cdev = devm_drm_dev_alloc(&pdev->dev, &corebootdrm_drm_driver, + struct corebootdrm_device, sysfb.dev); + if (IS_ERR(cdev)) + return PTR_ERR(cdev); + platform_set_drvdata(pdev, cdev); + + sysfb = &cdev->sysfb; + dev = &sysfb->dev; + + if (!fb) { + drm_err(dev, "coreboot framebuffer not found\n"); + return -EINVAL; + } else if (!LB_FRAMEBUFFER_HAS_LFB(fb)) { + drm_err(dev, "coreboot framebuffer entry too small\n"); + return -EINVAL; + } + + /* + * Hardware settings + */ + + format = corebootdrm_get_format_fb(dev, fb); + if (!format) + return -EINVAL; + width = corebootdrm_get_width_fb(dev, fb); + if (width < 0) + return width; + height = corebootdrm_get_height_fb(dev, fb); + if (height < 0) + return height; + pitch = corebootdrm_get_pitch_fb(dev, format, width, fb); + if (pitch < 0) + return pitch; + size = corebootdrm_get_size_fb(dev, height, pitch, fb); + if (!size) + return -EINVAL; + address = corebootdrm_get_address_fb(dev, size, fb); + if (!address) + return -EINVAL; + + sysfb->fb_mode = drm_sysfb_mode(width, height, 0, 0); + sysfb->fb_format = format; + sysfb->fb_pitch = pitch; + + drm_dbg(dev, "display mode={" DRM_MODE_FMT "}\n", DRM_MODE_ARG(&sysfb->fb_mode)); + drm_dbg(dev, "framebuffer format=%p4cc, size=%dx%d, pitch=%d byte\n", + &format->format, width, height, pitch); + + /* + * Memory management + */ + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) { + drm_err(dev, "memory resource not found\n"); + return -EINVAL; + } + + mem = devm_request_mem_region(&pdev->dev, res->start, resource_size(res), + dev->driver->name); + if (!mem) { + drm_warn(dev, "could not acquire memory resource at %pr\n", res); + /* + * We cannot make this fatal. Sometimes this comes from magic + * spaces our resource handlers simply don't know about. Use + * the memory resource as-is and try to map that instead. + */ + mem = res; + } + + drm_dbg(dev, "using memory resource at %pr\n", mem); + + aperture = DEFINE_RES_MEM(address, size); + if (!resource_contains(mem, &aperture)) { + drm_err(dev, "framebuffer aperture at invalid memory range %pr\n", &aperture); + return -EINVAL; + } + + ret = devm_aperture_acquire_for_platform_device(pdev, address, size); + if (ret) { + drm_err(dev, "could not acquire framebuffer aperture: %d\n", ret); + return ret; + } + + screen_base = devm_ioremap_wc(&pdev->dev, address, size); + if (!screen_base) + return -ENOMEM; + + iosys_map_set_vaddr_iomem(&sysfb->fb_addr, screen_base); + + /* + * DRM mode setting and registration + */ + + ret = corebootdrm_mode_config_init(cdev); + if (ret) + return ret; + + drm_mode_config_reset(dev); + + ret = drm_dev_register(dev, 0); + if (ret) + return ret; + + drm_client_setup(dev, sysfb->fb_format); + + return 0; +} + +static void corebootdrm_remove(struct platform_device *pdev) +{ + struct corebootdrm_device *cdev = platform_get_drvdata(pdev); + struct drm_device *dev = &cdev->sysfb.dev; + + drm_dev_unplug(dev); +} + +static struct platform_driver corebootdrm_platform_driver = { + .driver = { + .name = "coreboot-framebuffer", + }, + .probe = corebootdrm_probe, + .remove = corebootdrm_remove, +}; + +module_platform_driver(corebootdrm_platform_driver); + +MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_LICENSE("GPL"); diff --git a/include/linux/coreboot.h b/include/linux/coreboot.h index 5746b99a070d..885da106fee3 100644 --- a/include/linux/coreboot.h +++ b/include/linux/coreboot.h @@ -13,6 +13,7 @@ #define _LINUX_COREBOOT_H #include +#include #include typedef __aligned(4) u64 cb_u64; @@ -66,4 +67,11 @@ struct lb_framebuffer { u8 reserved_mask_size; }; +/* + * True if the coreboot-provided data is large enough to hold information + * on the linear framebuffer. False otherwise. + */ +#define LB_FRAMEBUFFER_HAS_LFB(__fb) \ + ((__fb)->size >= offsetofend(struct lb_framebuffer, reserved_mask_size)) + #endif /* _LINUX_COREBOOT_H */ -- cgit v1.2.3 From 058fc04b8587ad07a86dfa8f99d8d99db0a55443 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 17 Feb 2026 16:56:22 +0100 Subject: drm/sysfb: corebootdrm: Support panel orientation Add fields and constants for coreboot framebuffer orientation. Set corebootdrm's DRM connector state from the values. Not all firmware provides orientation, so make it optional. Systems without, continue to use unknown orientation. v3: - comment on _HAS_ORIENTATION semantics (Tzung-Bi) Signed-off-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Acked-by: Julius Werner Acked-by: Tzung-Bi Shih # coreboot Link: https://patch.msgid.link/20260217155836.96267-13-tzimmermann@suse.de --- drivers/gpu/drm/sysfb/corebootdrm.c | 30 ++++++++++++++++++++++++++---- include/linux/coreboot.h | 13 +++++++++++++ 2 files changed, 39 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/sysfb/corebootdrm.c b/drivers/gpu/drm/sysfb/corebootdrm.c index 745318580a5d..5dc6f3c76f7b 100644 --- a/drivers/gpu/drm/sysfb/corebootdrm.c +++ b/drivers/gpu/drm/sysfb/corebootdrm.c @@ -110,6 +110,26 @@ static phys_addr_t corebootdrm_get_address_fb(struct drm_device *dev, resource_s return fb->physical_address; } +static enum drm_panel_orientation corebootdrm_get_orientation_fb(struct drm_device *dev, + const struct lb_framebuffer *fb) +{ + if (!LB_FRAMEBUFFER_HAS_ORIENTATION(fb)) + return DRM_MODE_PANEL_ORIENTATION_UNKNOWN; + + switch (fb->orientation) { + case LB_FRAMEBUFFER_ORIENTATION_NORMAL: + return DRM_MODE_PANEL_ORIENTATION_NORMAL; + case LB_FRAMEBUFFER_ORIENTATION_BOTTOM_UP: + return DRM_MODE_PANEL_ORIENTATION_BOTTOM_UP; + case LB_FRAMEBUFFER_ORIENTATION_LEFT_UP: + return DRM_MODE_PANEL_ORIENTATION_LEFT_UP; + case LB_FRAMEBUFFER_ORIENTATION_RIGHT_UP: + return DRM_MODE_PANEL_ORIENTATION_RIGHT_UP; + } + + return DRM_MODE_PANEL_ORIENTATION_UNKNOWN; +} + /* * Simple Framebuffer device */ @@ -168,7 +188,8 @@ static const struct drm_mode_config_funcs corebootdrm_mode_config_funcs = { DRM_SYSFB_MODE_CONFIG_FUNCS, }; -static int corebootdrm_mode_config_init(struct corebootdrm_device *cdev) +static int corebootdrm_mode_config_init(struct corebootdrm_device *cdev, + enum drm_panel_orientation orientation) { struct drm_sysfb_device *sysfb = &cdev->sysfb; struct drm_device *dev = &sysfb->dev; @@ -234,8 +255,7 @@ static int corebootdrm_mode_config_init(struct corebootdrm_device *cdev) if (ret) return ret; drm_connector_helper_add(connector, &corebootdrm_connector_helper_funcs); - drm_connector_set_panel_orientation_with_quirk(connector, - DRM_MODE_PANEL_ORIENTATION_UNKNOWN, + drm_connector_set_panel_orientation_with_quirk(connector, orientation, width, height); ret = drm_connector_attach_encoder(connector, encoder); @@ -276,6 +296,7 @@ static int corebootdrm_probe(struct platform_device *pdev) int width, height, pitch; resource_size_t size; phys_addr_t address; + enum drm_panel_orientation orientation; struct resource *res, *mem = NULL; struct resource aperture; void __iomem *screen_base; @@ -320,6 +341,7 @@ static int corebootdrm_probe(struct platform_device *pdev) address = corebootdrm_get_address_fb(dev, size, fb); if (!address) return -EINVAL; + orientation = corebootdrm_get_orientation_fb(dev, fb); sysfb->fb_mode = drm_sysfb_mode(width, height, 0, 0); sysfb->fb_format = format; @@ -375,7 +397,7 @@ static int corebootdrm_probe(struct platform_device *pdev) * DRM mode setting and registration */ - ret = corebootdrm_mode_config_init(cdev); + ret = corebootdrm_mode_config_init(cdev, orientation); if (ret) return ret; diff --git a/include/linux/coreboot.h b/include/linux/coreboot.h index 885da106fee3..5d40ca7a1d89 100644 --- a/include/linux/coreboot.h +++ b/include/linux/coreboot.h @@ -47,6 +47,11 @@ struct lb_cbmem_entry { u32 id; }; +#define LB_FRAMEBUFFER_ORIENTATION_NORMAL 0 +#define LB_FRAMEBUFFER_ORIENTATION_BOTTOM_UP 1 +#define LB_FRAMEBUFFER_ORIENTATION_LEFT_UP 2 +#define LB_FRAMEBUFFER_ORIENTATION_RIGHT_UP 3 + /* Describes framebuffer setup by coreboot */ struct lb_framebuffer { u32 tag; @@ -65,6 +70,7 @@ struct lb_framebuffer { u8 blue_mask_size; u8 reserved_mask_pos; u8 reserved_mask_size; + u8 orientation; }; /* @@ -74,4 +80,11 @@ struct lb_framebuffer { #define LB_FRAMEBUFFER_HAS_LFB(__fb) \ ((__fb)->size >= offsetofend(struct lb_framebuffer, reserved_mask_size)) +/* + * True if the coreboot-provided data is large enough to hold information + * on the display orientation. False otherwise. + */ +#define LB_FRAMEBUFFER_HAS_ORIENTATION(__fb) \ + ((__fb)->size >= offsetofend(struct lb_framebuffer, orientation)) + #endif /* _LINUX_COREBOOT_H */ -- cgit v1.2.3 From dc652a33cf08ecd7c9935bf9168a1a27c9a246f0 Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Fri, 12 Dec 2025 08:41:42 +0900 Subject: clk: remove round_rate() clk ops The round_rate() clk ops is deprecated, and all in tree drivers have been converted, so let's go ahead and remove any references to the round_rate() clk ops. Signed-off-by: Brian Masney --- Documentation/driver-api/clk.rst | 9 +-------- drivers/clk/clk.c | 39 ++++++++++++++------------------------- include/linux/clk-provider.h | 18 ++++++------------ 3 files changed, 21 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/clk.rst b/Documentation/driver-api/clk.rst index 93bab5336dfd..c6aca8186a78 100644 --- a/Documentation/driver-api/clk.rst +++ b/Documentation/driver-api/clk.rst @@ -77,9 +77,6 @@ the operations defined in clk-provider.h:: void (*disable_unused)(struct clk_hw *hw); unsigned long (*recalc_rate)(struct clk_hw *hw, unsigned long parent_rate); - long (*round_rate)(struct clk_hw *hw, - unsigned long rate, - unsigned long *parent_rate); int (*determine_rate)(struct clk_hw *hw, struct clk_rate_request *req); int (*set_parent)(struct clk_hw *hw, u8 index); @@ -220,9 +217,7 @@ optional or must be evaluated on a case-by-case basis. +----------------+------+-------------+---------------+-------------+------+ |.recalc_rate | | y | | | | +----------------+------+-------------+---------------+-------------+------+ - |.round_rate | | y [1]_ | | | | - +----------------+------+-------------+---------------+-------------+------+ - |.determine_rate | | y [1]_ | | | | + |.determine_rate | | y | | | | +----------------+------+-------------+---------------+-------------+------+ |.set_rate | | y | | | | +----------------+------+-------------+---------------+-------------+------+ @@ -238,8 +233,6 @@ optional or must be evaluated on a case-by-case basis. |.init | | | | | | +----------------+------+-------------+---------------+-------------+------+ -.. [1] either one of round_rate or determine_rate is required. - Finally, register your clock at run-time with a hardware-specific registration function. This function simply populates struct clk_foo's data and then passes the common struct clk parameters to the framework diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index 47093cda9df3..fd418dc988b1 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -1560,8 +1560,6 @@ late_initcall_sync(clk_disable_unused); static int clk_core_determine_round_nolock(struct clk_core *core, struct clk_rate_request *req) { - long rate; - lockdep_assert_held(&prepare_lock); if (!core) @@ -1591,13 +1589,6 @@ static int clk_core_determine_round_nolock(struct clk_core *core, req->rate = core->rate; } else if (core->ops->determine_rate) { return core->ops->determine_rate(core->hw, req); - } else if (core->ops->round_rate) { - rate = core->ops->round_rate(core->hw, req->rate, - &req->best_parent_rate); - if (rate < 0) - return rate; - - req->rate = rate; } else { return -EINVAL; } @@ -1682,7 +1673,7 @@ EXPORT_SYMBOL_GPL(clk_hw_forward_rate_request); static bool clk_core_can_round(struct clk_core * const core) { - return core->ops->determine_rate || core->ops->round_rate; + return core->ops->determine_rate; } static int clk_core_round_rate_nolock(struct clk_core *core, @@ -1750,11 +1741,11 @@ EXPORT_SYMBOL_GPL(__clk_determine_rate); * use. * * Context: prepare_lock must be held. - * For clk providers to call from within clk_ops such as .round_rate, + * For clk providers to call from within clk_ops such as * .determine_rate. * - * Return: returns rounded rate of hw clk if clk supports round_rate operation - * else returns the parent rate. + * Return: returns rounded rate of hw clk if clk supports determine_rate + * operation; else returns the parent rate. */ unsigned long clk_hw_round_rate(struct clk_hw *hw, unsigned long rate) { @@ -2569,12 +2560,13 @@ err: * * Setting the CLK_SET_RATE_PARENT flag allows the rate change operation to * propagate up to clk's parent; whether or not this happens depends on the - * outcome of clk's .round_rate implementation. If *parent_rate is unchanged - * after calling .round_rate then upstream parent propagation is ignored. If - * *parent_rate comes back with a new rate for clk's parent then we propagate - * up to clk's parent and set its rate. Upward propagation will continue - * until either a clk does not support the CLK_SET_RATE_PARENT flag or - * .round_rate stops requesting changes to clk's parent_rate. + * outcome of clk's .determine_rate implementation. If req->best_parent_rate + * is unchanged after calling .determine_rate then upstream parent propagation + * is ignored. If req->best_parent_rate comes back with a new rate for clk's + * parent then we propagate up to clk's parent and set its rate. Upward + * propagation will continue until either a clk does not support the + * CLK_SET_RATE_PARENT flag or .determine_rate stops requesting changes to + * clk's parent_rate. * * Rate changes are accomplished via tree traversal that also recalculates the * rates for the clocks and fires off POST_RATE_CHANGE notifiers. @@ -2703,8 +2695,6 @@ static int clk_set_rate_range_nolock(struct clk *clk, * FIXME: * There is a catch. It may fail for the usual reason (clock * broken, clock protected, etc) but also because: - * - round_rate() was not favorable and fell on the wrong - * side of the boundary * - the determine_rate() callback does not really check for * this corner case when determining the rate */ @@ -3915,10 +3905,9 @@ static int __clk_core_init(struct clk_core *core) } /* check that clk_ops are sane. See Documentation/driver-api/clk.rst */ - if (core->ops->set_rate && - !((core->ops->round_rate || core->ops->determine_rate) && - core->ops->recalc_rate)) { - pr_err("%s: %s must implement .round_rate or .determine_rate in addition to .recalc_rate\n", + if (core->ops->set_rate && !core->ops->determine_rate && + core->ops->recalc_rate) { + pr_err("%s: %s must implement .determine_rate in addition to .recalc_rate\n", __func__, core->name); ret = -EINVAL; goto out; diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 630705a47129..1cda2c78dffa 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -136,10 +136,6 @@ struct clk_duty { * 0. Returns the calculated rate. Optional, but recommended - if * this op is not set then clock rate will be initialized to 0. * - * @round_rate: Given a target rate as input, returns the closest rate actually - * supported by the clock. The parent rate is an input/output - * parameter. - * * @determine_rate: Given a target rate as input, returns the closest rate * actually supported by the clock, and optionally the parent clock * that should be used to provide the clock rate. @@ -163,13 +159,13 @@ struct clk_duty { * * @set_rate: Change the rate of this clock. The requested rate is specified * by the second argument, which should typically be the return - * of .round_rate call. The third argument gives the parent rate - * which is likely helpful for most .set_rate implementation. + * of .determine_rate call. The third argument gives the parent + * rate which is likely helpful for most .set_rate implementation. * Returns 0 on success, -EERROR otherwise. * * @set_rate_and_parent: Change the rate and the parent of this clock. The * requested rate is specified by the second argument, which - * should typically be the return of .round_rate call. The + * should typically be the return of clk_round_rate() call. The * third argument gives the parent rate which is likely helpful * for most .set_rate_and_parent implementation. The fourth * argument gives the parent index. This callback is optional (and @@ -244,8 +240,6 @@ struct clk_ops { void (*restore_context)(struct clk_hw *hw); unsigned long (*recalc_rate)(struct clk_hw *hw, unsigned long parent_rate); - long (*round_rate)(struct clk_hw *hw, unsigned long rate, - unsigned long *parent_rate); int (*determine_rate)(struct clk_hw *hw, struct clk_rate_request *req); int (*set_parent)(struct clk_hw *hw, u8 index); @@ -679,7 +673,7 @@ struct clk_div_table { * @lock: register lock * * Clock with an adjustable divider affecting its output frequency. Implements - * .recalc_rate, .set_rate and .round_rate + * .recalc_rate, .set_rate and .determine_rate * * @flags: * CLK_DIVIDER_ONE_BASED - by default the divisor is the value read from the @@ -1126,7 +1120,7 @@ void of_fixed_factor_clk_setup(struct device_node *node); * * Clock with a fixed multiplier and divider. The output frequency is the * parent clock rate divided by div and multiplied by mult. - * Implements .recalc_rate, .set_rate, .round_rate and .recalc_accuracy + * Implements .recalc_rate, .set_rate, .determine_rate and .recalc_accuracy * * Flags: * * CLK_FIXED_FACTOR_FIXED_ACCURACY - Use the value in @acc instead of the @@ -1254,7 +1248,7 @@ void clk_hw_unregister_fractional_divider(struct clk_hw *hw); * @lock: register lock * * Clock with an adjustable multiplier affecting its output frequency. - * Implements .recalc_rate, .set_rate and .round_rate + * Implements .recalc_rate, .set_rate and .determine_rate * * @flags: * CLK_MULTIPLIER_ZERO_BYPASS - By default, the multiplier is the value read -- cgit v1.2.3 From 4b5231d608d00749a2346a3dd11bd6d05c0662e3 Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Thu, 8 Jan 2026 16:16:44 -0500 Subject: clk: divider: remove divider_ro_round_rate_parent() There are no remaining users of divider_ro_round_rate_parent(), so let's go ahead and remove it. Signed-off-by: Brian Masney --- drivers/clk/clk-divider.c | 22 ---------------------- include/linux/clk-provider.h | 15 --------------- 2 files changed, 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk-divider.c b/drivers/clk/clk-divider.c index 45e7ebde4a8b..26610dd976ec 100644 --- a/drivers/clk/clk-divider.c +++ b/drivers/clk/clk-divider.c @@ -409,28 +409,6 @@ long divider_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent, } EXPORT_SYMBOL_GPL(divider_round_rate_parent); -long divider_ro_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent, - unsigned long rate, unsigned long *prate, - const struct clk_div_table *table, u8 width, - unsigned long flags, unsigned int val) -{ - struct clk_rate_request req; - int ret; - - clk_hw_init_rate_request(hw, &req, rate); - req.best_parent_rate = *prate; - req.best_parent_hw = parent; - - ret = divider_ro_determine_rate(hw, &req, table, width, flags, val); - if (ret) - return ret; - - *prate = req.best_parent_rate; - - return req.rate; -} -EXPORT_SYMBOL_GPL(divider_ro_round_rate_parent); - static int clk_divider_determine_rate(struct clk_hw *hw, struct clk_rate_request *req) { diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 1cda2c78dffa..0d31077749fb 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -737,10 +737,6 @@ long divider_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent, unsigned long rate, unsigned long *prate, const struct clk_div_table *table, u8 width, unsigned long flags); -long divider_ro_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent, - unsigned long rate, unsigned long *prate, - const struct clk_div_table *table, u8 width, - unsigned long flags, unsigned int val); int divider_determine_rate(struct clk_hw *hw, struct clk_rate_request *req, const struct clk_div_table *table, u8 width, unsigned long flags); @@ -1440,17 +1436,6 @@ static inline long divider_round_rate(struct clk_hw *hw, unsigned long rate, rate, prate, table, width, flags); } -static inline long divider_ro_round_rate(struct clk_hw *hw, unsigned long rate, - unsigned long *prate, - const struct clk_div_table *table, - u8 width, unsigned long flags, - unsigned int val) -{ - return divider_ro_round_rate_parent(hw, clk_hw_get_parent(hw), - rate, prate, table, width, flags, - val); -} - /* * FIXME clock api without lock protection */ -- cgit v1.2.3 From d4851759742c1322f498021dab882d322fc34a1d Mon Sep 17 00:00:00 2001 From: Brian Masney Date: Thu, 8 Jan 2026 16:16:45 -0500 Subject: clk: divider: remove divider_round_rate() and divider_round_rate_parent() There are no remaining users of divider_round_rate() and divider_round_rate_parent(), so let's go ahead and remove them. Signed-off-by: Brian Masney --- drivers/clk/clk-divider.c | 22 ---------------------- include/linux/clk-provider.h | 13 ------------- 2 files changed, 35 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk-divider.c b/drivers/clk/clk-divider.c index 26610dd976ec..b3b485d23ea8 100644 --- a/drivers/clk/clk-divider.c +++ b/drivers/clk/clk-divider.c @@ -387,28 +387,6 @@ int divider_ro_determine_rate(struct clk_hw *hw, struct clk_rate_request *req, } EXPORT_SYMBOL_GPL(divider_ro_determine_rate); -long divider_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent, - unsigned long rate, unsigned long *prate, - const struct clk_div_table *table, - u8 width, unsigned long flags) -{ - struct clk_rate_request req; - int ret; - - clk_hw_init_rate_request(hw, &req, rate); - req.best_parent_rate = *prate; - req.best_parent_hw = parent; - - ret = divider_determine_rate(hw, &req, table, width, flags); - if (ret) - return ret; - - *prate = req.best_parent_rate; - - return req.rate; -} -EXPORT_SYMBOL_GPL(divider_round_rate_parent); - static int clk_divider_determine_rate(struct clk_hw *hw, struct clk_rate_request *req) { diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 0d31077749fb..4d21602d7dbd 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -733,10 +733,6 @@ extern const struct clk_ops clk_divider_ro_ops; unsigned long divider_recalc_rate(struct clk_hw *hw, unsigned long parent_rate, unsigned int val, const struct clk_div_table *table, unsigned long flags, unsigned long width); -long divider_round_rate_parent(struct clk_hw *hw, struct clk_hw *parent, - unsigned long rate, unsigned long *prate, - const struct clk_div_table *table, - u8 width, unsigned long flags); int divider_determine_rate(struct clk_hw *hw, struct clk_rate_request *req, const struct clk_div_table *table, u8 width, unsigned long flags); @@ -1427,15 +1423,6 @@ static inline void __clk_hw_set_clk(struct clk_hw *dst, struct clk_hw *src) dst->core = src->core; } -static inline long divider_round_rate(struct clk_hw *hw, unsigned long rate, - unsigned long *prate, - const struct clk_div_table *table, - u8 width, unsigned long flags) -{ - return divider_round_rate_parent(hw, clk_hw_get_parent(hw), - rate, prate, table, width, flags); -} - /* * FIXME clock api without lock protection */ -- cgit v1.2.3 From 5cab6d386bd30c3bb4efceb05b25842a6f144693 Mon Sep 17 00:00:00 2001 From: Sanjay Yadav Date: Thu, 12 Feb 2026 14:55:29 +0530 Subject: drm/buddy: Add kernel-doc for allocator structures and flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add missing kernel-doc for GPU buddy allocator flags, gpu_buddy_block, and gpu_buddy. The documentation covers block header fields, allocator roots, free trees, and allocation flags such as RANGE, TOPDOWN, CONTIGUOUS, CLEAR, and TRIM_DISABLE. Private members are marked with kernel-doc private markers and documented with regular comments. No functional changes. v2: - Corrected GPU_BUDDY_CLEAR_TREE and GPU_BUDDY_DIRTY_TREE index values (Arun) - Rebased after DRM buddy allocator moved to drivers/gpu/ - Updated commit message v3: - Document reserved bits 8:6 in header layout (Arun) - Fix checkpatch warning Cc: Christian König Cc: Arunpravin Paneer Selvam Suggested-by: Matthew Auld Signed-off-by: Sanjay Yadav Reviewed-by: Arunpravin Paneer Selvam Signed-off-by: Arunpravin Paneer Selvam Link: https://patch.msgid.link/20260212092527.718455-5-sanjay.kumar.yadav@intel.com --- include/linux/gpu_buddy.h | 123 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 103 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h index 07ac65db6d2e..bf2a42256536 100644 --- a/include/linux/gpu_buddy.h +++ b/include/linux/gpu_buddy.h @@ -12,11 +12,58 @@ #include #include +/** + * GPU_BUDDY_RANGE_ALLOCATION - Allocate within a specific address range + * + * When set, allocation is restricted to the range [start, end) specified + * in gpu_buddy_alloc_blocks(). Without this flag, start/end are ignored + * and allocation can use any free space. + */ #define GPU_BUDDY_RANGE_ALLOCATION BIT(0) + +/** + * GPU_BUDDY_TOPDOWN_ALLOCATION - Allocate from top of address space + * + * Allocate starting from high addresses and working down. Useful for + * separating different allocation types (e.g., kernel vs userspace) + * to reduce fragmentation. + */ #define GPU_BUDDY_TOPDOWN_ALLOCATION BIT(1) + +/** + * GPU_BUDDY_CONTIGUOUS_ALLOCATION - Require physically contiguous blocks + * + * The allocation must be satisfied with a single contiguous block. + * If the requested size cannot be allocated contiguously, the + * allocation fails with -ENOSPC. + */ #define GPU_BUDDY_CONTIGUOUS_ALLOCATION BIT(2) + +/** + * GPU_BUDDY_CLEAR_ALLOCATION - Prefer pre-cleared (zeroed) memory + * + * Attempt to allocate from the clear tree first. If insufficient clear + * memory is available, falls back to dirty memory. Useful when the + * caller needs zeroed memory and wants to avoid GPU clear operations. + */ #define GPU_BUDDY_CLEAR_ALLOCATION BIT(3) + +/** + * GPU_BUDDY_CLEARED - Mark returned blocks as cleared + * + * Used with gpu_buddy_free_list() to indicate that the memory being + * freed has been cleared (zeroed). The blocks will be placed in the + * clear tree for future GPU_BUDDY_CLEAR_ALLOCATION requests. + */ #define GPU_BUDDY_CLEARED BIT(4) + +/** + * GPU_BUDDY_TRIM_DISABLE - Disable automatic block trimming + * + * By default, if an allocation is smaller than the allocated block, + * excess memory is trimmed and returned to the free pool. This flag + * disables trimming, keeping the full power-of-two block size. + */ #define GPU_BUDDY_TRIM_DISABLE BIT(5) enum gpu_buddy_free_tree { @@ -28,7 +75,28 @@ enum gpu_buddy_free_tree { #define for_each_free_tree(tree) \ for ((tree) = 0; (tree) < GPU_BUDDY_MAX_FREE_TREES; (tree)++) +/** + * struct gpu_buddy_block - Block within a buddy allocator + * + * Each block in the buddy allocator is represented by this structure. + * Blocks are organized in a binary tree where each parent block can be + * split into two children (left and right buddies). The allocator manages + * blocks at various orders (power-of-2 sizes) from chunk_size up to the + * largest contiguous region. + * + * @private: Private data owned by the allocator user (e.g., driver-specific data) + * @link: List node for user ownership while block is allocated + */ struct gpu_buddy_block { +/* private: */ + /* + * Header bit layout: + * - Bits 63:12: block offset within the address space + * - Bits 11:10: state (ALLOCATED, FREE, or SPLIT) + * - Bit 9: clear bit (1 if memory is zeroed) + * - Bits 8:6: reserved + * - Bits 5:0: order (log2 of size relative to chunk_size) + */ #define GPU_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12) #define GPU_BUDDY_HEADER_STATE GENMASK_ULL(11, 10) #define GPU_BUDDY_ALLOCATED (1 << 10) @@ -43,7 +111,7 @@ struct gpu_buddy_block { struct gpu_buddy_block *left; struct gpu_buddy_block *right; struct gpu_buddy_block *parent; - +/* public: */ void *private; /* owned by creator */ /* @@ -53,43 +121,58 @@ struct gpu_buddy_block { * gpu_buddy_free* ownership is given back to the mm. */ union { +/* private: */ struct rb_node rb; +/* public: */ struct list_head link; }; - +/* private: */ struct list_head tmp_link; }; /* Order-zero must be at least SZ_4K */ #define GPU_BUDDY_MAX_ORDER (63 - 12) -/* - * Binary Buddy System. +/** + * struct gpu_buddy - GPU binary buddy allocator + * + * The buddy allocator provides efficient power-of-two memory allocation + * with fast allocation and free operations. It is commonly used for GPU + * memory management where allocations can be split into power-of-two + * block sizes. * - * Locking should be handled by the user, a simple mutex around - * gpu_buddy_alloc* and gpu_buddy_free* should suffice. + * Locking should be handled by the user; a simple mutex around + * gpu_buddy_alloc_blocks() and gpu_buddy_free_block()/gpu_buddy_free_list() + * should suffice. + * + * @n_roots: Number of root blocks in the roots array. + * @max_order: Maximum block order (log2 of largest block size / chunk_size). + * @chunk_size: Minimum allocation granularity in bytes. Must be at least SZ_4K. + * @size: Total size of the address space managed by this allocator in bytes. + * @avail: Total free space currently available for allocation in bytes. + * @clear_avail: Free space available in the clear tree (zeroed memory) in bytes. + * This is a subset of @avail. */ struct gpu_buddy { - /* Maintain a free list for each order. */ - struct rb_root **free_trees; - +/* private: */ /* - * Maintain explicit binary tree(s) to track the allocation of the - * address space. This gives us a simple way of finding a buddy block - * and performing the potentially recursive merge step when freeing a - * block. Nodes are either allocated or free, in which case they will - * also exist on the respective free list. + * Array of red-black trees for free block management. + * Indexed as free_trees[clear/dirty][order] where: + * - Index 0 (GPU_BUDDY_CLEAR_TREE): blocks with zeroed content + * - Index 1 (GPU_BUDDY_DIRTY_TREE): blocks with unknown content + * Each tree holds free blocks of the corresponding order. */ - struct gpu_buddy_block **roots; - + struct rb_root **free_trees; /* - * Anything from here is public, and remains static for the lifetime of - * the mm. Everything above is considered do-not-touch. + * Array of root blocks representing the top-level blocks of the + * binary tree(s). Multiple roots exist when the total size is not + * a power of two, with each root being the largest power-of-two + * that fits in the remaining space. */ + struct gpu_buddy_block **roots; +/* public: */ unsigned int n_roots; unsigned int max_order; - - /* Must be at least SZ_4K */ u64 chunk_size; u64 size; u64 avail; -- cgit v1.2.3 From df8c7892e06efa5df2aa780a338f33a4f666370b Mon Sep 17 00:00:00 2001 From: Sanjay Yadav Date: Thu, 12 Feb 2026 14:55:30 +0530 Subject: drm/buddy: Move internal helpers to buddy.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move gpu_buddy_block_state(), gpu_buddy_block_is_allocated(), and gpu_buddy_block_is_split() from gpu_buddy.h to gpu_buddy.c as static functions since they have no external callers. Remove gpu_get_buddy() as it was an unused exported wrapper around the internal __get_buddy(). No functional changes. v2: - Rebased after DRM buddy allocator moved to drivers/gpu/ - Keep gpu_buddy_block_is_free() in header since it's now used by drm_buddy.c - Updated commit message Cc: Christian König Cc: Arunpravin Paneer Selvam Suggested-by: Matthew Auld Signed-off-by: Sanjay Yadav Reviewed-by: Arunpravin Paneer Selvam Signed-off-by: Arunpravin Paneer Selvam Link: https://patch.msgid.link/20260212092527.718455-6-sanjay.kumar.yadav@intel.com --- drivers/gpu/buddy.c | 35 ++++++++++++++++++----------------- include/linux/gpu_buddy.h | 25 ++----------------------- 2 files changed, 20 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/buddy.c b/drivers/gpu/buddy.c index 603c59a2013a..b27761246d4b 100644 --- a/drivers/gpu/buddy.c +++ b/drivers/gpu/buddy.c @@ -14,6 +14,24 @@ static struct kmem_cache *slab_blocks; +static unsigned int +gpu_buddy_block_state(struct gpu_buddy_block *block) +{ + return block->header & GPU_BUDDY_HEADER_STATE; +} + +static bool +gpu_buddy_block_is_allocated(struct gpu_buddy_block *block) +{ + return gpu_buddy_block_state(block) == GPU_BUDDY_ALLOCATED; +} + +static bool +gpu_buddy_block_is_split(struct gpu_buddy_block *block) +{ + return gpu_buddy_block_state(block) == GPU_BUDDY_SPLIT; +} + static struct gpu_buddy_block *gpu_block_alloc(struct gpu_buddy *mm, struct gpu_buddy_block *parent, unsigned int order, @@ -449,23 +467,6 @@ static int split_block(struct gpu_buddy *mm, return 0; } -/** - * gpu_get_buddy - get buddy address - * - * @block: GPU buddy block - * - * Returns the corresponding buddy block for @block, or NULL - * if this is a root block and can't be merged further. - * Requires some kind of locking to protect against - * any concurrent allocate and free operations. - */ -struct gpu_buddy_block * -gpu_get_buddy(struct gpu_buddy_block *block) -{ - return __get_buddy(block); -} -EXPORT_SYMBOL(gpu_get_buddy); - /** * gpu_buddy_reset_clear - reset blocks clear state * diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h index bf2a42256536..f1fb6eff604a 100644 --- a/include/linux/gpu_buddy.h +++ b/include/linux/gpu_buddy.h @@ -191,16 +191,10 @@ gpu_buddy_block_order(struct gpu_buddy_block *block) return block->header & GPU_BUDDY_HEADER_ORDER; } -static inline unsigned int -gpu_buddy_block_state(struct gpu_buddy_block *block) -{ - return block->header & GPU_BUDDY_HEADER_STATE; -} - static inline bool -gpu_buddy_block_is_allocated(struct gpu_buddy_block *block) +gpu_buddy_block_is_free(struct gpu_buddy_block *block) { - return gpu_buddy_block_state(block) == GPU_BUDDY_ALLOCATED; + return (block->header & GPU_BUDDY_HEADER_STATE) == GPU_BUDDY_FREE; } static inline bool @@ -209,18 +203,6 @@ gpu_buddy_block_is_clear(struct gpu_buddy_block *block) return block->header & GPU_BUDDY_HEADER_CLEAR; } -static inline bool -gpu_buddy_block_is_free(struct gpu_buddy_block *block) -{ - return gpu_buddy_block_state(block) == GPU_BUDDY_FREE; -} - -static inline bool -gpu_buddy_block_is_split(struct gpu_buddy_block *block) -{ - return gpu_buddy_block_state(block) == GPU_BUDDY_SPLIT; -} - static inline u64 gpu_buddy_block_size(struct gpu_buddy *mm, struct gpu_buddy_block *block) @@ -232,9 +214,6 @@ int gpu_buddy_init(struct gpu_buddy *mm, u64 size, u64 chunk_size); void gpu_buddy_fini(struct gpu_buddy *mm); -struct gpu_buddy_block * -gpu_get_buddy(struct gpu_buddy_block *block); - int gpu_buddy_alloc_blocks(struct gpu_buddy *mm, u64 start, u64 end, u64 size, u64 min_page_size, -- cgit v1.2.3 From 8167d7f674648cfd428ed49773522f9df5c4fdfd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 15 Feb 2026 21:44:18 -0800 Subject: soundwire: sdw.h: repair names and format of kernel-doc comments Fix all kernel-doc warnings in sdw.h: Warning: include/linux/soundwire/sdw.h:538 cannot understand function prototype: 'enum sdw_reg_bank' Warning: include/linux/soundwire/sdw.h:779 struct member 'port_num' not described in 'sdw_transport_params' Warning: include/linux/soundwire/sdw.h:792 struct member 'port_num' not described in 'sdw_enable_ch' Warning: include/linux/soundwire/sdw.h:892 cannot understand function prototype: 'struct sdw_port_config' Warning: include/linux/soundwire/sdw.h:906 cannot understand function prototype: 'struct sdw_stream_config' Warning: include/linux/soundwire/sdw.h:925 cannot understand function prototype: 'enum sdw_stream_state' Warning: include/linux/soundwire/sdw.h:942 cannot understand function prototype: 'struct sdw_stream_params' Warning: include/linux/soundwire/sdw.h:960 cannot understand function prototype: 'struct sdw_stream_runtime' Warning: include/linux/soundwire/sdw.h:1047 struct member 'bpt_stream_refcount' not described in 'sdw_bus' Signed-off-by: Randy Dunlap Reviewed-by: Charles Keepax Link: https://patch.msgid.link/20260216054418.2766846-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/soundwire/sdw.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h index f462717acf20..6147eb1fb210 100644 --- a/include/linux/soundwire/sdw.h +++ b/include/linux/soundwire/sdw.h @@ -532,7 +532,7 @@ struct sdw_slave_intr_status { }; /** - * sdw_reg_bank - SoundWire register banks + * enum sdw_reg_bank - SoundWire register banks * @SDW_BANK0: Soundwire register bank 0 * @SDW_BANK1: Soundwire register bank 1 */ @@ -751,7 +751,7 @@ struct sdw_port_params { * struct sdw_transport_params: Data Port Transport Parameters * * @blk_grp_ctrl_valid: Port implements block group control - * @num: Port number + * @port_num: Port number * @blk_grp_ctrl: Block group control value * @sample_interval: Sample interval * @offset1: Blockoffset of the payload data @@ -782,7 +782,7 @@ struct sdw_transport_params { /** * struct sdw_enable_ch: Enable/disable Data Port channel * - * @num: Port number + * @port_num: Port number * @ch_mask: Active channel mask * @enable: Enable (true) /disable (false) channel */ @@ -885,7 +885,7 @@ void sdw_bus_master_delete(struct sdw_bus *bus); void sdw_show_ping_status(struct sdw_bus *bus, bool sync_delay); /** - * sdw_port_config: Master or Slave Port configuration + * struct sdw_port_config: Master or Slave Port configuration * * @num: Port number * @ch_mask: channels mask for port @@ -896,7 +896,7 @@ struct sdw_port_config { }; /** - * sdw_stream_config: Master or Slave stream configuration + * struct sdw_stream_config: Master or Slave stream configuration * * @frame_rate: Audio frame rate of the stream, in Hz * @ch_count: Channel count of the stream @@ -913,7 +913,7 @@ struct sdw_stream_config { }; /** - * sdw_stream_state: Stream states + * enum sdw_stream_state: Stream states * * @SDW_STREAM_ALLOCATED: New stream allocated. * @SDW_STREAM_CONFIGURED: Stream configured @@ -934,7 +934,7 @@ enum sdw_stream_state { }; /** - * sdw_stream_params: Stream parameters + * struct sdw_stream_params: Stream parameters * * @rate: Sampling frequency, in Hz * @ch_count: Number of channels @@ -947,7 +947,7 @@ struct sdw_stream_params { }; /** - * sdw_stream_runtime: Runtime stream parameters + * struct sdw_stream_runtime: Runtime stream parameters * * @name: SoundWire stream name * @params: Stream parameters @@ -983,7 +983,7 @@ struct sdw_stream_runtime { * @defer_msg: Defer message * @params: Current bus parameters * @stream_refcount: number of streams currently using this bus - * @btp_stream_refcount: number of BTP streams currently using this bus (should + * @bpt_stream_refcount: number of BTP streams currently using this bus (should * be zero or one, multiple streams per link is not supported). * @bpt_stream: pointer stored to handle BTP streams. * @ops: Master callback ops -- cgit v1.2.3 From 88440208c6074e639a7ccc038c6a7ed4b6f8bb99 Mon Sep 17 00:00:00 2001 From: Tomas Melin Date: Tue, 10 Feb 2026 10:53:34 +0000 Subject: iio: industrialio-backend: support backend capabilities MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not all backends support the full set of capabilities provided by the industrialio-backend framework. Capability bits can be used in frontends and backends for checking for a certain feature set, or if using related functions can be expected to fail. Capability bits should be set by a compatible backend and provided when registering the backend. Reviewed-by: Nuno Sá Reviewed-by: David Lechner Signed-off-by: Tomas Melin Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-backend.c | 16 ++++++++++++++++ include/linux/iio/backend.h | 24 ++++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'include/linux') diff --git a/drivers/iio/industrialio-backend.c b/drivers/iio/industrialio-backend.c index 447b694d6d5f..1afd00763da9 100644 --- a/drivers/iio/industrialio-backend.c +++ b/drivers/iio/industrialio-backend.c @@ -56,6 +56,7 @@ struct iio_backend { void *priv; const char *name; unsigned int cached_reg_addr; + u32 caps; /* * This index is relative to the frontend. Meaning that for * frontends with multiple backends, this will be the index of this @@ -774,6 +775,20 @@ int iio_backend_extend_chan_spec(struct iio_backend *back, } EXPORT_SYMBOL_NS_GPL(iio_backend_extend_chan_spec, "IIO_BACKEND"); +/** + * iio_backend_has_caps - Check if backend has specific capabilities + * @back: Backend device + * @caps: Capabilities to check + * + * RETURNS: + * True if backend has all the requested capabilities, false otherwise. + */ +bool iio_backend_has_caps(struct iio_backend *back, u32 caps) +{ + return (back->caps & caps) == caps; +} +EXPORT_SYMBOL_NS_GPL(iio_backend_has_caps, "IIO_BACKEND"); + static void iio_backend_release(void *arg) { struct iio_backend *back = arg; @@ -1114,6 +1129,7 @@ int devm_iio_backend_register(struct device *dev, back->ops = info->ops; back->name = info->name; + back->caps = info->caps; back->owner = dev->driver->owner; back->dev = dev; back->priv = priv; diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h index 7f815f3fed6a..4d15c2a9802c 100644 --- a/include/linux/iio/backend.h +++ b/include/linux/iio/backend.h @@ -84,6 +84,27 @@ enum iio_backend_filter_type { IIO_BACKEND_FILTER_TYPE_MAX }; +/** + * enum iio_backend_capabilities - Backend capabilities + * Backend capabilities can be used by frontends to check if a given + * functionality is supported by the backend. This is useful for frontend + * devices which are expected to work with alternative backend + * implementations. Capabilities are loosely coupled with operations, + * meaning that a capability requires certain operations to be implemented + * by the backend. A capability might be mapped to a single operation or + * multiple operations. + * + * @IIO_BACKEND_CAP_CALIBRATION: Backend supports digital interface + * calibration. Calibration procedure is device specific. + * @IIO_BACKEND_CAP_BUFFER: Support for IIO buffer interface. + * @IIO_BACKEND_CAP_ENABLE: Backend can be explicitly enabled/disabled. + */ +enum iio_backend_capabilities { + IIO_BACKEND_CAP_CALIBRATION = BIT(0), + IIO_BACKEND_CAP_BUFFER = BIT(1), + IIO_BACKEND_CAP_ENABLE = BIT(2), +}; + /** * struct iio_backend_ops - operations structure for an iio_backend * @enable: Enable backend. @@ -179,10 +200,12 @@ struct iio_backend_ops { * struct iio_backend_info - info structure for an iio_backend * @name: Backend name. * @ops: Backend operations. + * @caps: Backend capabilities. (bitmask of enum iio_backend_capabilities). */ struct iio_backend_info { const char *name; const struct iio_backend_ops *ops; + u32 caps; }; int iio_backend_chan_enable(struct iio_backend *back, unsigned int chan); @@ -235,6 +258,7 @@ int iio_backend_read_raw(struct iio_backend *back, long mask); int iio_backend_extend_chan_spec(struct iio_backend *back, struct iio_chan_spec *chan); +bool iio_backend_has_caps(struct iio_backend *back, u32 caps); void *iio_backend_get_priv(const struct iio_backend *conv); struct iio_backend *devm_iio_backend_get(struct device *dev, const char *name); struct iio_backend *devm_iio_backend_fwnode_get(struct device *dev, -- cgit v1.2.3 From 8b65eb52d93e4e496bd26e6867152344554eb39e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 17 Feb 2026 11:15:10 -0800 Subject: locking/mutex: Rename mutex_init_lockep() Typo, this wants to be _lockdep(). Fixes: 51d7a054521d ("locking/mutex: Redo __mutex_init() to reduce generated code size") Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260217191512.1180151-2-dave@stgolabs.net --- include/linux/mutex.h | 4 ++-- kernel/locking/mutex.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index ecaa0440f6ec..8126da959088 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -87,12 +87,12 @@ do { \ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) #ifdef CONFIG_DEBUG_LOCK_ALLOC -void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key); +void mutex_init_lockdep(struct mutex *lock, const char *name, struct lock_class_key *key); static inline void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { - mutex_init_lockep(lock, name, key); + mutex_init_lockdep(lock, name, key); } #else extern void mutex_init_generic(struct mutex *lock); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 2a1d165b3167..c867f6c15530 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -171,7 +171,7 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock) #else /* !CONFIG_DEBUG_LOCK_ALLOC */ -void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key) +void mutex_init_lockdep(struct mutex *lock, const char *name, struct lock_class_key *key) { __mutex_init_generic(lock); @@ -181,7 +181,7 @@ void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_k debug_check_no_locks_freed((void *)lock, sizeof(*lock)); lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); } -EXPORT_SYMBOL(mutex_init_lockep); +EXPORT_SYMBOL(mutex_init_lockdep); #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag) -- cgit v1.2.3 From babcde3be8c9148aa60a14b17831e8f249854963 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 17 Feb 2026 11:15:11 -0800 Subject: locking/mutex: Fix wrong comment for CONFIG_DEBUG_LOCK_ALLOC ... that endif block should be CONFIG_DEBUG_LOCK_ALLOC, not CONFIG_LOCKDEP. Fixes: 51d7a054521d ("locking/mutex: Redo __mutex_init() to reduce generated code size") Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260217191512.1180151-3-dave@stgolabs.net --- include/linux/mutex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 8126da959088..f57d2a97da57 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -146,7 +146,7 @@ static inline void __mutex_init(struct mutex *lock, const char *name, { mutex_rt_init_generic(lock); } -#endif /* !CONFIG_LOCKDEP */ +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ #endif /* CONFIG_PREEMPT_RT */ #ifdef CONFIG_DEBUG_MUTEXES -- cgit v1.2.3 From 50214dc4382055352fb1d7b9779550dabf5059e5 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 17 Feb 2026 11:15:12 -0800 Subject: locking/mutex: Add killable flavor to guard definitions The mutex guard family defines _try and _intr variants but is missing the killable one. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260217191512.1180151-4-dave@stgolabs.net --- include/linux/mutex.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index f57d2a97da57..2f648ee204e7 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -253,6 +253,7 @@ extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) __cond_a DEFINE_LOCK_GUARD_1(mutex, struct mutex, mutex_lock(_T->lock), mutex_unlock(_T->lock)) DEFINE_LOCK_GUARD_1_COND(mutex, _try, mutex_trylock(_T->lock)) DEFINE_LOCK_GUARD_1_COND(mutex, _intr, mutex_lock_interruptible(_T->lock), _RET == 0) +DEFINE_LOCK_GUARD_1_COND(mutex, _kill, mutex_lock_killable(_T->lock), _RET == 0) DEFINE_LOCK_GUARD_1(mutex_init, struct mutex, mutex_init(_T->lock), /* */) DECLARE_LOCK_GUARD_1_ATTRS(mutex, __acquires(_T), __releases(*(struct mutex **)_T)) @@ -261,6 +262,8 @@ DECLARE_LOCK_GUARD_1_ATTRS(mutex_try, __acquires(_T), __releases(*(struct mutex #define class_mutex_try_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_try, _T) DECLARE_LOCK_GUARD_1_ATTRS(mutex_intr, __acquires(_T), __releases(*(struct mutex **)_T)) #define class_mutex_intr_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_intr, _T) +DECLARE_LOCK_GUARD_1_ATTRS(mutex_kill, __acquires(_T), __releases(*(struct mutex **)_T)) +#define class_mutex_kill_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_kill, _T) DECLARE_LOCK_GUARD_1_ATTRS(mutex_init, __acquires(_T), __releases(*(struct mutex **)_T)) #define class_mutex_init_constructor(_T) WITH_LOCK_GUARD_1_ATTRS(mutex_init, _T) -- cgit v1.2.3 From 263ff314cc5602599d481b0912a381555fcbad28 Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Fri, 28 Nov 2025 07:20:11 +0200 Subject: mmc: core: Add quirk for incorrect manufacturing date Some eMMC vendors need to report manufacturing dates beyond 2025 but are reluctant to update the EXT_CSD revision from 8 to 9. Changing the Updating the EXT_CSD revision may involve additional testing or qualification steps with customers. To ease this transition and avoid a full re-qualification process, a workaround is needed. This patch introduces a temporary quirk that re-purposes the year codes corresponding to 2010, 2011, and 2012 to represent the years 2026, 2027, and 2028, respectively. This solution is only valid for this three-year period. After 2028, vendors must update their firmware to set EXT_CSD_REV=9 to continue reporting the correct manufacturing date in compliance with the JEDEC standard. The `MMC_QUIRK_BROKEN_MDT` is introduced and enabled for all Sandisk devices to handle this behavior. Signed-off-by: Avri Altman Signed-off-by: Ulf Hansson --- drivers/mmc/core/card.h | 6 ++++++ drivers/mmc/core/mmc.c | 5 +++++ drivers/mmc/core/quirks.h | 3 +++ include/linux/mmc/card.h | 1 + 4 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/core/card.h b/drivers/mmc/core/card.h index 1200951bab08..a9619dd45270 100644 --- a/drivers/mmc/core/card.h +++ b/drivers/mmc/core/card.h @@ -89,6 +89,7 @@ struct mmc_fixup { #define CID_MANFID_MICRON 0x13 #define CID_MANFID_SAMSUNG 0x15 #define CID_MANFID_APACER 0x27 +#define CID_MANFID_SANDISK_MMC 0x45 #define CID_MANFID_SWISSBIT 0x5D #define CID_MANFID_KINGSTON 0x70 #define CID_MANFID_HYNIX 0x90 @@ -305,4 +306,9 @@ static inline int mmc_card_no_uhs_ddr50_tuning(const struct mmc_card *c) return c->quirks & MMC_QUIRK_NO_UHS_DDR50_TUNING; } +static inline int mmc_card_broken_mdt(const struct mmc_card *c) +{ + return c->quirks & MMC_QUIRK_BROKEN_MDT; +} + #endif diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index f744dd501842..8846550a8892 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -676,6 +676,11 @@ static int mmc_decode_ext_csd(struct mmc_card *card, u8 *ext_csd) /* Adjust production date as per JEDEC JESD84-B51B September 2025 */ if (card->cid.year < 2023) card->cid.year += 16; + } else { + /* Handle vendors with broken MDT reporting */ + if (mmc_card_broken_mdt(card) && card->cid.year >= 2010 && + card->cid.year <= 2012) + card->cid.year += 16; } } diff --git a/drivers/mmc/core/quirks.h b/drivers/mmc/core/quirks.h index c417ed34c057..f5e8a0f6d11b 100644 --- a/drivers/mmc/core/quirks.h +++ b/drivers/mmc/core/quirks.h @@ -170,6 +170,9 @@ static const struct mmc_fixup __maybe_unused mmc_ext_csd_fixups[] = { MMC_FIXUP_EXT_CSD_REV(CID_NAME_ANY, CID_MANFID_NUMONYX, 0x014e, add_quirk, MMC_QUIRK_BROKEN_HPI, 6), + MMC_FIXUP(CID_NAME_ANY, CID_MANFID_SANDISK_MMC, CID_OEMID_ANY, add_quirk_mmc, + MMC_QUIRK_BROKEN_MDT), + END_FIXUP }; diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index e9e964c20e53..4722dd7e46ce 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -329,6 +329,7 @@ struct mmc_card { #define MMC_QUIRK_BROKEN_CACHE_FLUSH (1<<16) /* Don't flush cache until the write has occurred */ #define MMC_QUIRK_BROKEN_SD_POWEROFF_NOTIFY (1<<17) /* Disable broken SD poweroff notify support */ #define MMC_QUIRK_NO_UHS_DDR50_TUNING (1<<18) /* Disable DDR50 tuning */ +#define MMC_QUIRK_BROKEN_MDT (1<<19) /* Wrong manufacturing year */ bool written_flag; /* Indicates eMMC has been written since power on */ bool reenable_cmdq; /* Re-enable Command Queue */ -- cgit v1.2.3 From c0b68bc25efeaca8a1ef3a0cfab5fbf5f81d002d Mon Sep 17 00:00:00 2001 From: Jeff Chen Date: Tue, 13 Jan 2026 11:15:17 +0800 Subject: mmc: sdio: add NXP vendor and IW61x device IDs Add NXP's SDIO vendor ID (0x0471) and IW61x device ID (0x0205) to sdio_ids.h for future support of NXP Wi-Fi chips over SDIO. Signed-off-by: Jeff Chen Signed-off-by: Ulf Hansson --- include/linux/mmc/sdio_ids.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 673cbdf43453..39ac2b612e4a 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -116,6 +116,9 @@ #define SDIO_VENDOR_ID_MICROCHIP_WILC 0x0296 #define SDIO_DEVICE_ID_MICROCHIP_WILC1000 0x5347 +#define SDIO_VENDOR_ID_NXP 0x0471 +#define SDIO_DEVICE_ID_NXP_IW61X 0x0205 + #define SDIO_VENDOR_ID_REALTEK 0x024c #define SDIO_DEVICE_ID_REALTEK_RTW8723BS 0xb723 #define SDIO_DEVICE_ID_REALTEK_RTW8821BS 0xb821 -- cgit v1.2.3 From d6bf2e64dec87322f2b11565ddb59c0e967f96e3 Mon Sep 17 00:00:00 2001 From: Luke Wang Date: Wed, 4 Feb 2026 11:40:03 +0800 Subject: mmc: core: Optimize time for secure erase/trim for some Kingston eMMCs Kingston eMMC IY2964 and IB2932 takes a fixed ~2 seconds for each secure erase/trim operation regardless of size - that is, a single secure erase/trim operation of 1MB takes the same time as 1GB. With default calculated 3.5MB max discard size, secure erase 1GB requires ~300 separate operations taking ~10 minutes total. Add a card quirk, MMC_QUIRK_FIXED_SECURE_ERASE_TRIM_TIME, to set maximum secure erase size for those devices. This allows 1GB secure erase to complete in a single operation, reducing time from 10 minutes to just 2 seconds. Signed-off-by: Luke Wang Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/core/card.h | 5 +++++ drivers/mmc/core/queue.c | 9 +++++++-- drivers/mmc/core/quirks.h | 9 +++++++++ include/linux/mmc/card.h | 1 + 4 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/card.h b/drivers/mmc/core/card.h index a9619dd45270..a7c364d0030a 100644 --- a/drivers/mmc/core/card.h +++ b/drivers/mmc/core/card.h @@ -311,4 +311,9 @@ static inline int mmc_card_broken_mdt(const struct mmc_card *c) return c->quirks & MMC_QUIRK_BROKEN_MDT; } +static inline int mmc_card_fixed_secure_erase_trim_time(const struct mmc_card *c) +{ + return c->quirks & MMC_QUIRK_FIXED_SECURE_ERASE_TRIM_TIME; +} + #endif diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 13000fc57e2e..39fcb662c43f 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -184,8 +184,13 @@ static void mmc_queue_setup_discard(struct mmc_card *card, return; lim->max_hw_discard_sectors = max_discard; - if (mmc_card_can_secure_erase_trim(card)) - lim->max_secure_erase_sectors = max_discard; + if (mmc_card_can_secure_erase_trim(card)) { + if (mmc_card_fixed_secure_erase_trim_time(card)) + lim->max_secure_erase_sectors = UINT_MAX >> card->erase_shift; + else + lim->max_secure_erase_sectors = max_discard; + } + if (mmc_card_can_trim(card) && card->erased_byte == 0) lim->max_write_zeroes_sectors = max_discard; diff --git a/drivers/mmc/core/quirks.h b/drivers/mmc/core/quirks.h index f5e8a0f6d11b..6f727b4a60a5 100644 --- a/drivers/mmc/core/quirks.h +++ b/drivers/mmc/core/quirks.h @@ -153,6 +153,15 @@ static const struct mmc_fixup __maybe_unused mmc_blk_fixups[] = { MMC_FIXUP("M62704", CID_MANFID_KINGSTON, 0x0100, add_quirk_mmc, MMC_QUIRK_TRIM_BROKEN), + /* + * On Some Kingston eMMCs, secure erase/trim time is independent + * of erase size, fixed at approximately 2 seconds. + */ + MMC_FIXUP("IY2964", CID_MANFID_KINGSTON, 0x0100, add_quirk_mmc, + MMC_QUIRK_FIXED_SECURE_ERASE_TRIM_TIME), + MMC_FIXUP("IB2932", CID_MANFID_KINGSTON, 0x0100, add_quirk_mmc, + MMC_QUIRK_FIXED_SECURE_ERASE_TRIM_TIME), + END_FIXUP }; diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 4722dd7e46ce..9dc4750296af 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -330,6 +330,7 @@ struct mmc_card { #define MMC_QUIRK_BROKEN_SD_POWEROFF_NOTIFY (1<<17) /* Disable broken SD poweroff notify support */ #define MMC_QUIRK_NO_UHS_DDR50_TUNING (1<<18) /* Disable DDR50 tuning */ #define MMC_QUIRK_BROKEN_MDT (1<<19) /* Wrong manufacturing year */ +#define MMC_QUIRK_FIXED_SECURE_ERASE_TRIM_TIME (1<<20) /* Secure erase/trim time is fixed regardless of size */ bool written_flag; /* Indicates eMMC has been written since power on */ bool reenable_cmdq; /* Re-enable Command Queue */ -- cgit v1.2.3 From 94d709be8c0dc875dfc9ebb64d3b8093d0790c15 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 16 Feb 2026 14:31:57 +0100 Subject: xattr: add rcu_head and rhash_head to struct simple_xattr In preparation for converting simple_xattrs from rbtree to rhashtable, add rhash_head and rcu_head members to struct simple_xattr. The rhashtable implementation will use rhash_head for hash table linkage and RCU-based lockless reads, requiring that replaced or removed xattr entries be freed via call_rcu() rather than immediately. Add simple_xattr_free_rcu() which schedules RCU-deferred freeing of an xattr entry. This will be used by callers of simple_xattr_set() once they switch to the rhashtable-based xattr store. No functional changes. Link: https://patch.msgid.link/20260216-work-xattr-socket-v1-1-c2efa4f74cb7@kernel.org Acked-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/xattr.c | 23 +++++++++++++++++++++++ include/linux/xattr.h | 4 ++++ 2 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/fs/xattr.c b/fs/xattr.c index 3e49e612e1ba..9cbb1917bcb2 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1197,6 +1197,29 @@ void simple_xattr_free(struct simple_xattr *xattr) kvfree(xattr); } +static void simple_xattr_rcu_free(struct rcu_head *head) +{ + struct simple_xattr *xattr; + + xattr = container_of(head, struct simple_xattr, rcu); + simple_xattr_free(xattr); +} + +/** + * simple_xattr_free_rcu - free an xattr object after an RCU grace period + * @xattr: the xattr object + * + * Schedule RCU-deferred freeing of an xattr entry. This is used by + * rhashtable-based callers of simple_xattr_set() that replace or remove + * an existing entry while concurrent RCU readers may still be accessing + * it. + */ +void simple_xattr_free_rcu(struct simple_xattr *xattr) +{ + if (xattr) + call_rcu(&xattr->rcu, simple_xattr_rcu_free); +} + /** * simple_xattr_alloc - allocate new xattr object * @value: value of the xattr object diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 296b5ee5c979..fdbd2095414a 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -112,6 +113,8 @@ struct simple_xattrs { struct simple_xattr { struct rb_node rb_node; + struct rhash_head hash_node; + struct rcu_head rcu; char *name; size_t size; char value[] __counted_by(size); @@ -122,6 +125,7 @@ void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space); size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); void simple_xattr_free(struct simple_xattr *xattr); +void simple_xattr_free_rcu(struct simple_xattr *xattr); int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size); struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, -- cgit v1.2.3 From b32c4a213698ab351b44da2fd1b2a5976c7fa033 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 16 Feb 2026 14:31:58 +0100 Subject: xattr: add rhashtable-based simple_xattr infrastructure Add rhashtable support to the simple_xattr subsystem while keeping the existing rbtree code fully functional. This allows consumers to be migrated one at a time without breaking any intermediate build. struct simple_xattrs gains a dispatch flag and a union holding either the rbtree (rb_root + rwlock) or rhashtable state: struct simple_xattrs { bool use_rhashtable; union { struct { struct rb_root rb_root; rwlock_t lock; }; struct rhashtable ht; }; }; simple_xattrs_init() continues to set up the rbtree path for existing embedded-struct callers. Add simple_xattrs_alloc() which dynamically allocates a simple_xattrs and initializes the rhashtable path. This is the entry point for consumers switching to pointer-based lazy allocation. The five core functions (get, set, list, add, free) dispatch based on the use_rhashtable flag. Existing callers continue to use the rbtree path unchanged. As each consumer is converted it will switch to simple_xattrs_alloc() and the rhashtable path. Once all consumers are converted a follow-up patch will remove the rbtree code. Link: https://patch.msgid.link/20260216-work-xattr-socket-v1-2-c2efa4f74cb7@kernel.org Acked-by: Darrick J. Wong Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/xattr.c | 439 ++++++++++++++++++++++++++++++++++++++------------ include/linux/xattr.h | 25 ++- mm/shmem.c | 2 +- 3 files changed, 357 insertions(+), 109 deletions(-) (limited to 'include/linux') diff --git a/fs/xattr.c b/fs/xattr.c index 9cbb1917bcb2..1d98ea459b7b 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -1228,22 +1229,25 @@ void simple_xattr_free_rcu(struct simple_xattr *xattr) * Allocate a new xattr object and initialize respective members. The caller is * responsible for handling the name of the xattr. * - * Return: On success a new xattr object is returned. On failure NULL is - * returned. + * Return: New xattr object on success, NULL if @value is NULL, ERR_PTR on + * failure. */ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) { struct simple_xattr *new_xattr; size_t len; + if (!value) + return NULL; + /* wrap around? */ len = sizeof(*new_xattr) + size; if (len < sizeof(*new_xattr)) - return NULL; + return ERR_PTR(-ENOMEM); new_xattr = kvmalloc(len, GFP_KERNEL_ACCOUNT); if (!new_xattr) - return NULL; + return ERR_PTR(-ENOMEM); new_xattr->size = size; memcpy(new_xattr->value, value, size); @@ -1287,6 +1291,33 @@ static int rbtree_simple_xattr_node_cmp(struct rb_node *new_node, return rbtree_simple_xattr_cmp(xattr->name, node); } +static u32 simple_xattr_hashfn(const void *data, u32 len, u32 seed) +{ + const char *name = data; + return jhash(name, strlen(name), seed); +} + +static u32 simple_xattr_obj_hashfn(const void *obj, u32 len, u32 seed) +{ + const struct simple_xattr *xattr = obj; + return jhash(xattr->name, strlen(xattr->name), seed); +} + +static int simple_xattr_obj_cmpfn(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct simple_xattr *xattr = obj; + return strcmp(xattr->name, arg->key); +} + +static const struct rhashtable_params simple_xattr_params = { + .head_offset = offsetof(struct simple_xattr, hash_node), + .hashfn = simple_xattr_hashfn, + .obj_hashfn = simple_xattr_obj_hashfn, + .obj_cmpfn = simple_xattr_obj_cmpfn, + .automatic_shrinking = true, +}; + /** * simple_xattr_get - get an xattr object * @xattrs: the header of the xattr object @@ -1306,22 +1337,41 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size) { struct simple_xattr *xattr = NULL; - struct rb_node *rbp; int ret = -ENODATA; - read_lock(&xattrs->lock); - rbp = rb_find(name, &xattrs->rb_root, rbtree_simple_xattr_cmp); - if (rbp) { - xattr = rb_entry(rbp, struct simple_xattr, rb_node); - ret = xattr->size; - if (buffer) { - if (size < xattr->size) - ret = -ERANGE; - else - memcpy(buffer, xattr->value, xattr->size); + if (xattrs->use_rhashtable) { + guard(rcu)(); + xattr = rhashtable_lookup(&xattrs->ht, name, + simple_xattr_params); + if (xattr) { + ret = xattr->size; + if (buffer) { + if (size < xattr->size) + ret = -ERANGE; + else + memcpy(buffer, xattr->value, + xattr->size); + } + } + } else { + struct rb_node *rbp; + + read_lock(&xattrs->lock); + rbp = rb_find(name, &xattrs->rb_root, + rbtree_simple_xattr_cmp); + if (rbp) { + xattr = rb_entry(rbp, struct simple_xattr, rb_node); + ret = xattr->size; + if (buffer) { + if (size < xattr->size) + ret = -ERANGE; + else + memcpy(buffer, xattr->value, + xattr->size); + } } + read_unlock(&xattrs->lock); } - read_unlock(&xattrs->lock); return ret; } @@ -1355,78 +1405,134 @@ struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags) { - struct simple_xattr *old_xattr = NULL, *new_xattr = NULL; - struct rb_node *parent = NULL, **rbp; - int err = 0, ret; + struct simple_xattr *old_xattr = NULL; + int err = 0; - /* value == NULL means remove */ - if (value) { - new_xattr = simple_xattr_alloc(value, size); - if (!new_xattr) - return ERR_PTR(-ENOMEM); + CLASS(simple_xattr, new_xattr)(value, size); + if (IS_ERR(new_xattr)) + return new_xattr; + if (new_xattr) { new_xattr->name = kstrdup(name, GFP_KERNEL_ACCOUNT); - if (!new_xattr->name) { - simple_xattr_free(new_xattr); + if (!new_xattr->name) return ERR_PTR(-ENOMEM); - } } - write_lock(&xattrs->lock); - rbp = &xattrs->rb_root.rb_node; - while (*rbp) { - parent = *rbp; - ret = rbtree_simple_xattr_cmp(name, *rbp); - if (ret < 0) - rbp = &(*rbp)->rb_left; - else if (ret > 0) - rbp = &(*rbp)->rb_right; - else - old_xattr = rb_entry(*rbp, struct simple_xattr, rb_node); - if (old_xattr) - break; - } + if (xattrs->use_rhashtable) { + /* + * Lookup is safe without RCU here since writes are + * serialized by the caller. + */ + old_xattr = rhashtable_lookup_fast(&xattrs->ht, name, + simple_xattr_params); + + if (old_xattr) { + /* Fail if XATTR_CREATE is requested and the xattr exists. */ + if (flags & XATTR_CREATE) + return ERR_PTR(-EEXIST); + + if (new_xattr) { + err = rhashtable_replace_fast(&xattrs->ht, + &old_xattr->hash_node, + &new_xattr->hash_node, + simple_xattr_params); + if (err) + return ERR_PTR(err); + } else { + err = rhashtable_remove_fast(&xattrs->ht, + &old_xattr->hash_node, + simple_xattr_params); + if (err) + return ERR_PTR(err); + } + } else { + /* Fail if XATTR_REPLACE is requested but no xattr is found. */ + if (flags & XATTR_REPLACE) + return ERR_PTR(-ENODATA); + + /* + * If XATTR_CREATE or no flags are specified together + * with a new value simply insert it. + */ + if (new_xattr) { + err = rhashtable_insert_fast(&xattrs->ht, + &new_xattr->hash_node, + simple_xattr_params); + if (err) + return ERR_PTR(err); + } - if (old_xattr) { - /* Fail if XATTR_CREATE is requested and the xattr exists. */ - if (flags & XATTR_CREATE) { - err = -EEXIST; - goto out_unlock; + /* + * If XATTR_CREATE or no flags are specified and + * neither an old or new xattr exist then we don't + * need to do anything. + */ } - - if (new_xattr) - rb_replace_node(&old_xattr->rb_node, - &new_xattr->rb_node, &xattrs->rb_root); - else - rb_erase(&old_xattr->rb_node, &xattrs->rb_root); } else { - /* Fail if XATTR_REPLACE is requested but no xattr is found. */ - if (flags & XATTR_REPLACE) { - err = -ENODATA; - goto out_unlock; - } + struct rb_node *parent = NULL, **rbp; + int ret; - /* - * If XATTR_CREATE or no flags are specified together with a - * new value simply insert it. - */ - if (new_xattr) { - rb_link_node(&new_xattr->rb_node, parent, rbp); - rb_insert_color(&new_xattr->rb_node, &xattrs->rb_root); + write_lock(&xattrs->lock); + rbp = &xattrs->rb_root.rb_node; + while (*rbp) { + parent = *rbp; + ret = rbtree_simple_xattr_cmp(name, *rbp); + if (ret < 0) + rbp = &(*rbp)->rb_left; + else if (ret > 0) + rbp = &(*rbp)->rb_right; + else + old_xattr = rb_entry(*rbp, struct simple_xattr, + rb_node); + if (old_xattr) + break; } - /* - * If XATTR_CREATE or no flags are specified and neither an - * old or new xattr exist then we don't need to do anything. - */ - } + if (old_xattr) { + /* Fail if XATTR_CREATE is requested and the xattr exists. */ + if (flags & XATTR_CREATE) { + err = -EEXIST; + goto out_unlock; + } + + if (new_xattr) + rb_replace_node(&old_xattr->rb_node, + &new_xattr->rb_node, + &xattrs->rb_root); + else + rb_erase(&old_xattr->rb_node, + &xattrs->rb_root); + } else { + /* Fail if XATTR_REPLACE is requested but no xattr is found. */ + if (flags & XATTR_REPLACE) { + err = -ENODATA; + goto out_unlock; + } + + /* + * If XATTR_CREATE or no flags are specified together + * with a new value simply insert it. + */ + if (new_xattr) { + rb_link_node(&new_xattr->rb_node, parent, rbp); + rb_insert_color(&new_xattr->rb_node, + &xattrs->rb_root); + } + + /* + * If XATTR_CREATE or no flags are specified and + * neither an old or new xattr exist then we don't + * need to do anything. + */ + } out_unlock: - write_unlock(&xattrs->lock); - if (!err) - return old_xattr; - simple_xattr_free(new_xattr); - return ERR_PTR(err); + write_unlock(&xattrs->lock); + if (err) + return ERR_PTR(err); + } + retain_and_null_ptr(new_xattr); + return old_xattr; } static bool xattr_is_trusted(const char *name) @@ -1467,7 +1573,6 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, { bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); struct simple_xattr *xattr; - struct rb_node *rbp; ssize_t remaining_size = size; int err = 0; @@ -1487,23 +1592,62 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, remaining_size -= err; err = 0; - read_lock(&xattrs->lock); - for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { - xattr = rb_entry(rbp, struct simple_xattr, rb_node); + if (!xattrs) + return size - remaining_size; - /* skip "trusted." attributes for unprivileged callers */ - if (!trusted && xattr_is_trusted(xattr->name)) - continue; + if (xattrs->use_rhashtable) { + struct rhashtable_iter iter; - /* skip MAC labels; these are provided by LSM above */ - if (xattr_is_maclabel(xattr->name)) - continue; + rhashtable_walk_enter(&xattrs->ht, &iter); + rhashtable_walk_start(&iter); - err = xattr_list_one(&buffer, &remaining_size, xattr->name); - if (err) - break; + while ((xattr = rhashtable_walk_next(&iter)) != NULL) { + if (IS_ERR(xattr)) { + if (PTR_ERR(xattr) == -EAGAIN) + continue; + err = PTR_ERR(xattr); + break; + } + + /* skip "trusted." attributes for unprivileged callers */ + if (!trusted && xattr_is_trusted(xattr->name)) + continue; + + /* skip MAC labels; these are provided by LSM above */ + if (xattr_is_maclabel(xattr->name)) + continue; + + err = xattr_list_one(&buffer, &remaining_size, + xattr->name); + if (err) + break; + } + + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); + } else { + struct rb_node *rbp; + + read_lock(&xattrs->lock); + for (rbp = rb_first(&xattrs->rb_root); rbp; + rbp = rb_next(rbp)) { + xattr = rb_entry(rbp, struct simple_xattr, rb_node); + + /* skip "trusted." attributes for unprivileged callers */ + if (!trusted && xattr_is_trusted(xattr->name)) + continue; + + /* skip MAC labels; these are provided by LSM above */ + if (xattr_is_maclabel(xattr->name)) + continue; + + err = xattr_list_one(&buffer, &remaining_size, + xattr->name); + if (err) + break; + } + read_unlock(&xattrs->lock); } - read_unlock(&xattrs->lock); return err ? err : size - remaining_size; } @@ -1536,9 +1680,16 @@ static bool rbtree_simple_xattr_less(struct rb_node *new_node, void simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr) { - write_lock(&xattrs->lock); - rb_add(&new_xattr->rb_node, &xattrs->rb_root, rbtree_simple_xattr_less); - write_unlock(&xattrs->lock); + if (xattrs->use_rhashtable) { + WARN_ON(rhashtable_insert_fast(&xattrs->ht, + &new_xattr->hash_node, + simple_xattr_params)); + } else { + write_lock(&xattrs->lock); + rb_add(&new_xattr->rb_node, &xattrs->rb_root, + rbtree_simple_xattr_less); + write_unlock(&xattrs->lock); + } } /** @@ -1549,10 +1700,80 @@ void simple_xattr_add(struct simple_xattrs *xattrs, */ void simple_xattrs_init(struct simple_xattrs *xattrs) { + xattrs->use_rhashtable = false; xattrs->rb_root = RB_ROOT; rwlock_init(&xattrs->lock); } +/** + * simple_xattrs_alloc - allocate and initialize a new xattr header + * + * Dynamically allocate a simple_xattrs header and initialize the + * underlying rhashtable. This is intended for consumers that want + * rhashtable-based xattr storage. + * + * Return: On success a new simple_xattrs is returned. On failure an + * ERR_PTR is returned. + */ +struct simple_xattrs *simple_xattrs_alloc(void) +{ + struct simple_xattrs *xattrs __free(kfree) = NULL; + + xattrs = kzalloc(sizeof(*xattrs), GFP_KERNEL); + if (!xattrs) + return ERR_PTR(-ENOMEM); + + xattrs->use_rhashtable = true; + if (rhashtable_init(&xattrs->ht, &simple_xattr_params)) + return ERR_PTR(-ENOMEM); + + return no_free_ptr(xattrs); +} + +/** + * simple_xattrs_lazy_alloc - get or allocate xattrs for a set operation + * @xattrsp: pointer to the xattrs pointer (may point to NULL) + * @value: value being set (NULL means remove) + * @flags: xattr set flags + * + * For lazily-allocated xattrs on the write path. If no xattrs exist yet + * and this is a remove operation, returns the appropriate result without + * allocating. Otherwise ensures xattrs is allocated and published with + * store-release semantics. + * + * Return: On success a valid pointer to the xattrs is returned. On + * failure or early-exit an ERR_PTR or NULL is returned. Callers should + * check with IS_ERR_OR_NULL() and propagate with PTR_ERR() which + * correctly returns 0 for the NULL no-op case. + */ +struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, + const void *value, int flags) +{ + struct simple_xattrs *xattrs; + + xattrs = READ_ONCE(*xattrsp); + if (xattrs) + return xattrs; + + if (!value) + return (flags & XATTR_REPLACE) ? ERR_PTR(-ENODATA) : NULL; + + xattrs = simple_xattrs_alloc(); + if (!IS_ERR(xattrs)) + smp_store_release(xattrsp, xattrs); + return xattrs; +} + +static void simple_xattr_ht_free(void *ptr, void *arg) +{ + struct simple_xattr *xattr = ptr; + size_t *freed_space = arg; + + if (freed_space) + *freed_space += simple_xattr_space(xattr->name, xattr->size); + simple_xattr_free(xattr); +} + /** * simple_xattrs_free - free xattrs * @xattrs: xattr header whose xattrs to destroy @@ -1563,22 +1784,28 @@ void simple_xattrs_init(struct simple_xattrs *xattrs) */ void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space) { - struct rb_node *rbp; - if (freed_space) *freed_space = 0; - rbp = rb_first(&xattrs->rb_root); - while (rbp) { - struct simple_xattr *xattr; - struct rb_node *rbp_next; - - rbp_next = rb_next(rbp); - xattr = rb_entry(rbp, struct simple_xattr, rb_node); - rb_erase(&xattr->rb_node, &xattrs->rb_root); - if (freed_space) - *freed_space += simple_xattr_space(xattr->name, - xattr->size); - simple_xattr_free(xattr); - rbp = rbp_next; + + if (xattrs->use_rhashtable) { + rhashtable_free_and_destroy(&xattrs->ht, + simple_xattr_ht_free, freed_space); + } else { + struct rb_node *rbp; + + rbp = rb_first(&xattrs->rb_root); + while (rbp) { + struct simple_xattr *xattr; + struct rb_node *rbp_next; + + rbp_next = rb_next(rbp); + xattr = rb_entry(rbp, struct simple_xattr, rb_node); + rb_erase(&xattr->rb_node, &xattrs->rb_root); + if (freed_space) + *freed_space += simple_xattr_space(xattr->name, + xattr->size); + simple_xattr_free(xattr); + rbp = rbp_next; + } } } diff --git a/include/linux/xattr.h b/include/linux/xattr.h index fdbd2095414a..832a44358661 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -107,8 +107,14 @@ static inline const char *xattr_prefix(const struct xattr_handler *handler) } struct simple_xattrs { - struct rb_root rb_root; - rwlock_t lock; + bool use_rhashtable; + union { + struct { + struct rb_root rb_root; + rwlock_t lock; + }; + struct rhashtable ht; + }; }; struct simple_xattr { @@ -121,6 +127,9 @@ struct simple_xattr { }; void simple_xattrs_init(struct simple_xattrs *xattrs); +struct simple_xattrs *simple_xattrs_alloc(void); +struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, + const void *value, int flags); void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space); size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); @@ -137,4 +146,16 @@ void simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); +DEFINE_CLASS(simple_xattr, + struct simple_xattr *, + if (!IS_ERR_OR_NULL(_T)) simple_xattr_free(_T), + simple_xattr_alloc(value, size), + const void *value, size_t size) + +DEFINE_CLASS(simple_xattrs, + struct simple_xattrs *, + if (!IS_ERR_OR_NULL(_T)) { simple_xattrs_free(_T, NULL); kfree(_T); }, + simple_xattrs_alloc(), + void) + #endif /* _LINUX_XATTR_H */ diff --git a/mm/shmem.c b/mm/shmem.c index b40f3cd48961..35c2f8748668 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4278,7 +4278,7 @@ static int shmem_initxattrs(struct inode *inode, for (xattr = xattr_array; xattr->name != NULL; xattr++) { new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); - if (!new_xattr) + if (IS_ERR(new_xattr)) break; len = strlen(xattr->name) + 1; -- cgit v1.2.3 From 52b364fed6e1578e551fee20c76fecb3fc0e10ed Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 16 Feb 2026 14:31:59 +0100 Subject: shmem: adapt to rhashtable-based simple_xattrs with lazy allocation Adapt tmpfs/shmem to use the rhashtable-based xattr path and switch from an embedded struct to pointer-based lazy allocation. Change shmem_inode_info.xattrs from embedded 'struct simple_xattrs' to a pointer 'struct simple_xattrs *', initialized to NULL. This avoids the rhashtable overhead for every tmpfs inode, which helps when a lot of inodes exist. The xattr store is allocated on first use: - shmem_initxattrs(): Allocates via simple_xattrs_alloc() when security modules set initial xattrs during inode creation. - shmem_xattr_handler_set(): Allocates on first setxattr, with a short-circuit for removal when no xattrs are stored yet. All read paths (shmem_xattr_handler_get, shmem_listxattr) check for NULL xattrs pointer and return -ENODATA or 0 respectively. Replaced xattr entries are freed via simple_xattr_free_rcu() to allow concurrent RCU readers to finish. shmem_evict_inode() conditionally frees the xattr store only when allocated. Also change simple_xattr_add() from void to int to propagate rhashtable insertion failures. shmem_initxattrs() is the only caller. Link: https://patch.msgid.link/20260216-work-xattr-socket-v1-3-c2efa4f74cb7@kernel.org Acked-by: Darrick J. Wong Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/xattr.c | 26 +++++++++++++------------- include/linux/shmem_fs.h | 2 +- include/linux/xattr.h | 4 ++-- mm/shmem.c | 44 +++++++++++++++++++++++++++++++------------- 4 files changed, 47 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/fs/xattr.c b/fs/xattr.c index 1d98ea459b7b..eb45ae0fd17f 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1677,19 +1677,19 @@ static bool rbtree_simple_xattr_less(struct rb_node *new_node, * of matching xattrs is wanted. Should only be called during inode * initialization when a few distinct initial xattrs are supposed to be set. */ -void simple_xattr_add(struct simple_xattrs *xattrs, - struct simple_xattr *new_xattr) -{ - if (xattrs->use_rhashtable) { - WARN_ON(rhashtable_insert_fast(&xattrs->ht, - &new_xattr->hash_node, - simple_xattr_params)); - } else { - write_lock(&xattrs->lock); - rb_add(&new_xattr->rb_node, &xattrs->rb_root, - rbtree_simple_xattr_less); - write_unlock(&xattrs->lock); - } +int simple_xattr_add(struct simple_xattrs *xattrs, + struct simple_xattr *new_xattr) +{ + if (xattrs->use_rhashtable) + return rhashtable_insert_fast(&xattrs->ht, + &new_xattr->hash_node, + simple_xattr_params); + + write_lock(&xattrs->lock); + rb_add(&new_xattr->rb_node, &xattrs->rb_root, + rbtree_simple_xattr_less); + write_unlock(&xattrs->lock); + return 0; } /** diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a8273b32e041..f6a2d3402d76 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -48,7 +48,7 @@ struct shmem_inode_info { }; struct timespec64 i_crtime; /* file creation time */ struct shared_policy policy; /* NUMA memory alloc policy */ - struct simple_xattrs xattrs; /* list of xattrs */ + struct simple_xattrs *xattrs; /* list of xattrs */ pgoff_t fallocend; /* highest fallocate endindex */ unsigned int fsflags; /* for FS_IOC_[SG]ETFLAGS */ atomic_t stop_eviction; /* hold when working on inode */ diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 832a44358661..6e619e185e90 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -142,8 +142,8 @@ struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, size_t size, int flags); ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size); -void simple_xattr_add(struct simple_xattrs *xattrs, - struct simple_xattr *new_xattr); +int simple_xattr_add(struct simple_xattrs *xattrs, + struct simple_xattr *new_xattr); int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name); DEFINE_CLASS(simple_xattr, diff --git a/mm/shmem.c b/mm/shmem.c index 35c2f8748668..0b0e577e880a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1425,7 +1425,10 @@ static void shmem_evict_inode(struct inode *inode) } } - simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL); + if (info->xattrs) { + simple_xattrs_free(info->xattrs, sbinfo->max_inodes ? &freed : NULL); + kfree(info->xattrs); + } shmem_free_inode(inode->i_sb, freed); WARN_ON(inode->i_blocks); clear_inode(inode); @@ -3101,7 +3104,6 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, shmem_set_inode_flags(inode, info->fsflags, NULL); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); - simple_xattrs_init(&info->xattrs); cache_no_acl(inode); if (sbinfo->noswap) mapping_set_unevictable(inode->i_mapping); @@ -4255,10 +4257,13 @@ static int shmem_initxattrs(struct inode *inode, struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); const struct xattr *xattr; - struct simple_xattr *new_xattr; size_t ispace = 0; size_t len; + CLASS(simple_xattrs, xattrs)(); + if (IS_ERR(xattrs)) + return PTR_ERR(xattrs); + if (sbinfo->max_inodes) { for (xattr = xattr_array; xattr->name != NULL; xattr++) { ispace += simple_xattr_space(xattr->name, @@ -4277,24 +4282,24 @@ static int shmem_initxattrs(struct inode *inode, } for (xattr = xattr_array; xattr->name != NULL; xattr++) { - new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); + CLASS(simple_xattr, new_xattr)(xattr->value, xattr->value_len); if (IS_ERR(new_xattr)) break; len = strlen(xattr->name) + 1; new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, GFP_KERNEL_ACCOUNT); - if (!new_xattr->name) { - kvfree(new_xattr); + if (!new_xattr->name) break; - } memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, xattr->name, len); - simple_xattr_add(&info->xattrs, new_xattr); + if (simple_xattr_add(xattrs, new_xattr)) + break; + retain_and_null_ptr(new_xattr); } if (xattr->name != NULL) { @@ -4303,10 +4308,10 @@ static int shmem_initxattrs(struct inode *inode, sbinfo->free_ispace += ispace; raw_spin_unlock(&sbinfo->stat_lock); } - simple_xattrs_free(&info->xattrs, NULL); return -ENOMEM; } + smp_store_release(&info->xattrs, no_free_ptr(xattrs)); return 0; } @@ -4315,9 +4320,14 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler, const char *name, void *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(inode); + struct simple_xattrs *xattrs; + + xattrs = READ_ONCE(info->xattrs); + if (!xattrs) + return -ENODATA; name = xattr_full_name(handler, name); - return simple_xattr_get(&info->xattrs, name, buffer, size); + return simple_xattr_get(xattrs, name, buffer, size); } static int shmem_xattr_handler_set(const struct xattr_handler *handler, @@ -4328,10 +4338,16 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler, { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct simple_xattrs *xattrs; struct simple_xattr *old_xattr; size_t ispace = 0; name = xattr_full_name(handler, name); + + xattrs = simple_xattrs_lazy_alloc(&info->xattrs, value, flags); + if (IS_ERR_OR_NULL(xattrs)) + return PTR_ERR(xattrs); + if (value && sbinfo->max_inodes) { ispace = simple_xattr_space(name, size); raw_spin_lock(&sbinfo->stat_lock); @@ -4344,13 +4360,13 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler, return -ENOSPC; } - old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags); + old_xattr = simple_xattr_set(xattrs, name, value, size, flags); if (!IS_ERR(old_xattr)) { ispace = 0; if (old_xattr && sbinfo->max_inodes) ispace = simple_xattr_space(old_xattr->name, old_xattr->size); - simple_xattr_free(old_xattr); + simple_xattr_free_rcu(old_xattr); old_xattr = NULL; inode_set_ctime_current(inode); inode_inc_iversion(inode); @@ -4391,7 +4407,9 @@ static const struct xattr_handler * const shmem_xattr_handlers[] = { static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); - return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); + + return simple_xattr_list(d_inode(dentry), READ_ONCE(info->xattrs), + buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ -- cgit v1.2.3 From f4cc3ab824d6772a48ca9d9c74ac623b3309985d Mon Sep 17 00:00:00 2001 From: Christian König Date: Tue, 7 Oct 2025 14:06:05 +0200 Subject: dma-buf: protected fence ops by RCU v8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fence ops of a dma_fence currently need to life as long as the dma_fence is alive. This means that the module which originally issued a dma_fence can't unload unless all fences are freed up. As first step to solve this issue protect the fence ops by RCU. While it is counter intuitive to protect a constant function pointer table by RCU it allows modules to wait for an RCU grace period before they unload, to make sure that nobody is executing their functions any more. This patch has not much functional change, but only adds the RCU handling for the static checker to test. v2: make one the now duplicated lockdep warnings a comment instead. v3: Add more documentation to ->wait and ->release callback. v4: fix typo in documentation v5: rebased on drm-tip v6: improve code comments v7: improve commit message and code comments v8: fix sparse rcu warnings Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/r/20260219160822.1529-2-christian.koenig@amd.com --- drivers/dma-buf/dma-fence.c | 71 +++++++++++++++++++++++---------- drivers/gpu/drm/drm_crtc.c | 2 +- drivers/gpu/drm/scheduler/sched_fence.c | 4 +- include/linux/dma-fence.h | 33 ++++++++++++--- 4 files changed, 80 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index 7e8db99186c2..076e6e6c75be 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -522,6 +522,7 @@ EXPORT_SYMBOL(dma_fence_signal); signed long dma_fence_wait_timeout(struct dma_fence *fence, bool intr, signed long timeout) { + const struct dma_fence_ops *ops; signed long ret; if (WARN_ON(timeout < 0)) @@ -533,15 +534,22 @@ dma_fence_wait_timeout(struct dma_fence *fence, bool intr, signed long timeout) dma_fence_enable_sw_signaling(fence); - if (trace_dma_fence_wait_start_enabled()) { - rcu_read_lock(); - trace_dma_fence_wait_start(fence); + rcu_read_lock(); + ops = rcu_dereference(fence->ops); + trace_dma_fence_wait_start(fence); + if (ops->wait) { + /* + * Implementing the wait ops is deprecated and not supported for + * issuers of fences who need their lifetime to be independent + * of their module after they signal, so it is ok to use the + * ops outside the RCU protected section. + */ + rcu_read_unlock(); + ret = ops->wait(fence, intr, timeout); + } else { rcu_read_unlock(); - } - if (fence->ops->wait) - ret = fence->ops->wait(fence, intr, timeout); - else ret = dma_fence_default_wait(fence, intr, timeout); + } if (trace_dma_fence_wait_end_enabled()) { rcu_read_lock(); trace_dma_fence_wait_end(fence); @@ -562,6 +570,7 @@ void dma_fence_release(struct kref *kref) { struct dma_fence *fence = container_of(kref, struct dma_fence, refcount); + const struct dma_fence_ops *ops; rcu_read_lock(); trace_dma_fence_destroy(fence); @@ -593,12 +602,12 @@ void dma_fence_release(struct kref *kref) spin_unlock_irqrestore(fence->lock, flags); } - rcu_read_unlock(); - - if (fence->ops->release) - fence->ops->release(fence); + ops = rcu_dereference(fence->ops); + if (ops->release) + ops->release(fence); else dma_fence_free(fence); + rcu_read_unlock(); } EXPORT_SYMBOL(dma_fence_release); @@ -617,6 +626,7 @@ EXPORT_SYMBOL(dma_fence_free); static bool __dma_fence_enable_signaling(struct dma_fence *fence) { + const struct dma_fence_ops *ops; bool was_set; lockdep_assert_held(fence->lock); @@ -627,14 +637,18 @@ static bool __dma_fence_enable_signaling(struct dma_fence *fence) if (dma_fence_test_signaled_flag(fence)) return false; - if (!was_set && fence->ops->enable_signaling) { + rcu_read_lock(); + ops = rcu_dereference(fence->ops); + if (!was_set && ops->enable_signaling) { trace_dma_fence_enable_signal(fence); - if (!fence->ops->enable_signaling(fence)) { + if (!ops->enable_signaling(fence)) { + rcu_read_unlock(); dma_fence_signal_locked(fence); return false; } } + rcu_read_unlock(); return true; } @@ -1007,8 +1021,13 @@ EXPORT_SYMBOL(dma_fence_wait_any_timeout); */ void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline) { - if (fence->ops->set_deadline && !dma_fence_is_signaled(fence)) - fence->ops->set_deadline(fence, deadline); + const struct dma_fence_ops *ops; + + rcu_read_lock(); + ops = rcu_dereference(fence->ops); + if (ops->set_deadline && !dma_fence_is_signaled(fence)) + ops->set_deadline(fence, deadline); + rcu_read_unlock(); } EXPORT_SYMBOL(dma_fence_set_deadline); @@ -1049,7 +1068,13 @@ __dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, BUG_ON(!ops || !ops->get_driver_name || !ops->get_timeline_name); kref_init(&fence->refcount); - fence->ops = ops; + /* + * While it is counter intuitive to protect a constant function pointer + * table by RCU it allows modules to wait for an RCU grace period + * before they unload, to make sure that nobody is executing their + * functions any more. + */ + RCU_INIT_POINTER(fence->ops, ops); INIT_LIST_HEAD(&fence->cb_list); fence->lock = lock; fence->context = context; @@ -1129,11 +1154,12 @@ EXPORT_SYMBOL(dma_fence_init64); */ const char __rcu *dma_fence_driver_name(struct dma_fence *fence) { - RCU_LOCKDEP_WARN(!rcu_read_lock_held(), - "RCU protection is required for safe access to returned string"); + const struct dma_fence_ops *ops; + /* RCU protection is required for safe access to returned string */ + ops = rcu_dereference(fence->ops); if (!dma_fence_test_signaled_flag(fence)) - return (const char __rcu *)fence->ops->get_driver_name(fence); + return (const char __rcu *)ops->get_driver_name(fence); else return (const char __rcu *)"detached-driver"; } @@ -1161,11 +1187,12 @@ EXPORT_SYMBOL(dma_fence_driver_name); */ const char __rcu *dma_fence_timeline_name(struct dma_fence *fence) { - RCU_LOCKDEP_WARN(!rcu_read_lock_held(), - "RCU protection is required for safe access to returned string"); + const struct dma_fence_ops *ops; + /* RCU protection is required for safe access to returned string */ + ops = rcu_dereference(fence->ops); if (!dma_fence_test_signaled_flag(fence)) - return (const char __rcu *)fence->ops->get_driver_name(fence); + return (const char __rcu *)ops->get_driver_name(fence); else return (const char __rcu *)"signaled-timeline"; } diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c index 90684f30a048..960fdc1cc6ba 100644 --- a/drivers/gpu/drm/drm_crtc.c +++ b/drivers/gpu/drm/drm_crtc.c @@ -158,7 +158,7 @@ static const struct dma_fence_ops drm_crtc_fence_ops; static struct drm_crtc *fence_to_crtc(struct dma_fence *fence) { - BUG_ON(fence->ops != &drm_crtc_fence_ops); + BUG_ON(rcu_access_pointer(fence->ops) != &drm_crtc_fence_ops); return container_of(fence->lock, struct drm_crtc, fence_lock); } diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c index 9391d6f0dc01..a27786cb86fb 100644 --- a/drivers/gpu/drm/scheduler/sched_fence.c +++ b/drivers/gpu/drm/scheduler/sched_fence.c @@ -195,10 +195,10 @@ static const struct dma_fence_ops drm_sched_fence_ops_finished = { struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f) { - if (f->ops == &drm_sched_fence_ops_scheduled) + if (rcu_access_pointer(f->ops) == &drm_sched_fence_ops_scheduled) return container_of(f, struct drm_sched_fence, scheduled); - if (f->ops == &drm_sched_fence_ops_finished) + if (rcu_access_pointer(f->ops) == &drm_sched_fence_ops_finished) return container_of(f, struct drm_sched_fence, finished); return NULL; diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 9c4d25289239..fa3cfe3e98ac 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -67,7 +67,7 @@ struct seq_file; */ struct dma_fence { spinlock_t *lock; - const struct dma_fence_ops *ops; + const struct dma_fence_ops __rcu *ops; /* * We clear the callback list on kref_put so that by the time we * release the fence it is unused. No one should be adding to the @@ -220,6 +220,10 @@ struct dma_fence_ops { * timed out. Can also return other error values on custom implementations, * which should be treated as if the fence is signaled. For example a hardware * lockup could be reported like that. + * + * Implementing this callback prevents the fence from detaching after + * signaling and so it is necessary for the module providing the + * dma_fence_ops to stay loaded as long as the dma_fence exists. */ signed long (*wait)(struct dma_fence *fence, bool intr, signed long timeout); @@ -231,6 +235,13 @@ struct dma_fence_ops { * Can be called from irq context. This callback is optional. If it is * NULL, then dma_fence_free() is instead called as the default * implementation. + * + * Implementing this callback prevents the fence from detaching after + * signaling and so it is necessary for the module providing the + * dma_fence_ops to stay loaded as long as the dma_fence exists. + * + * If the callback is implemented the memory backing the dma_fence + * object must be freed RCU safe. */ void (*release)(struct dma_fence *fence); @@ -454,13 +465,19 @@ dma_fence_test_signaled_flag(struct dma_fence *fence) static inline bool dma_fence_is_signaled_locked(struct dma_fence *fence) { + const struct dma_fence_ops *ops; + if (dma_fence_test_signaled_flag(fence)) return true; - if (fence->ops->signaled && fence->ops->signaled(fence)) { + rcu_read_lock(); + ops = rcu_dereference(fence->ops); + if (ops->signaled && ops->signaled(fence)) { + rcu_read_unlock(); dma_fence_signal_locked(fence); return true; } + rcu_read_unlock(); return false; } @@ -484,13 +501,19 @@ dma_fence_is_signaled_locked(struct dma_fence *fence) static inline bool dma_fence_is_signaled(struct dma_fence *fence) { + const struct dma_fence_ops *ops; + if (dma_fence_test_signaled_flag(fence)) return true; - if (fence->ops->signaled && fence->ops->signaled(fence)) { + rcu_read_lock(); + ops = rcu_dereference(fence->ops); + if (ops->signaled && ops->signaled(fence)) { + rcu_read_unlock(); dma_fence_signal(fence); return true; } + rcu_read_unlock(); return false; } @@ -695,7 +718,7 @@ extern const struct dma_fence_ops dma_fence_chain_ops; */ static inline bool dma_fence_is_array(struct dma_fence *fence) { - return fence->ops == &dma_fence_array_ops; + return rcu_access_pointer(fence->ops) == &dma_fence_array_ops; } /** @@ -706,7 +729,7 @@ static inline bool dma_fence_is_array(struct dma_fence *fence) */ static inline bool dma_fence_is_chain(struct dma_fence *fence) { - return fence->ops == &dma_fence_chain_ops; + return rcu_access_pointer(fence->ops) == &dma_fence_chain_ops; } /** -- cgit v1.2.3 From 541c8f2468b933acc5d129e84bd264923675a66e Mon Sep 17 00:00:00 2001 From: Christian König Date: Wed, 8 Oct 2025 18:12:46 +0200 Subject: dma-buf: detach fence ops on signal v3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When neither a release nor a wait backend ops is specified it is possible to let the dma_fence live on independently of the module who issued it. This makes it possible to unload drivers and only wait for all their fences to signal. v2: fix typo in comment v3: fix sparse rcu warnings Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Reviewed-by: Philipp Stanner Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/r/20260219160822.1529-3-christian.koenig@amd.com --- drivers/dma-buf/dma-fence.c | 18 ++++++++++++++---- include/linux/dma-fence.h | 4 ++-- 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index 076e6e6c75be..3279d82ffa98 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -362,6 +362,7 @@ void __dma_fence_might_wait(void) void dma_fence_signal_timestamp_locked(struct dma_fence *fence, ktime_t timestamp) { + const struct dma_fence_ops *ops; struct dma_fence_cb *cur, *tmp; struct list_head cb_list; @@ -371,6 +372,15 @@ void dma_fence_signal_timestamp_locked(struct dma_fence *fence, &fence->flags))) return; + /* + * When neither a release nor a wait operation is specified set the ops + * pointer to NULL to allow the fence structure to become independent + * from who originally issued it. + */ + ops = rcu_dereference_protected(fence->ops, true); + if (!ops->release && !ops->wait) + RCU_INIT_POINTER(fence->ops, NULL); + /* Stash the cb_list before replacing it with the timestamp */ list_replace(&fence->cb_list, &cb_list); @@ -537,7 +547,7 @@ dma_fence_wait_timeout(struct dma_fence *fence, bool intr, signed long timeout) rcu_read_lock(); ops = rcu_dereference(fence->ops); trace_dma_fence_wait_start(fence); - if (ops->wait) { + if (ops && ops->wait) { /* * Implementing the wait ops is deprecated and not supported for * issuers of fences who need their lifetime to be independent @@ -603,7 +613,7 @@ void dma_fence_release(struct kref *kref) } ops = rcu_dereference(fence->ops); - if (ops->release) + if (ops && ops->release) ops->release(fence); else dma_fence_free(fence); @@ -639,7 +649,7 @@ static bool __dma_fence_enable_signaling(struct dma_fence *fence) rcu_read_lock(); ops = rcu_dereference(fence->ops); - if (!was_set && ops->enable_signaling) { + if (!was_set && ops && ops->enable_signaling) { trace_dma_fence_enable_signal(fence); if (!ops->enable_signaling(fence)) { @@ -1025,7 +1035,7 @@ void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline) rcu_read_lock(); ops = rcu_dereference(fence->ops); - if (ops->set_deadline && !dma_fence_is_signaled(fence)) + if (ops && ops->set_deadline && !dma_fence_is_signaled(fence)) ops->set_deadline(fence, deadline); rcu_read_unlock(); } diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index fa3cfe3e98ac..9ff2c4a09cdc 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -472,7 +472,7 @@ dma_fence_is_signaled_locked(struct dma_fence *fence) rcu_read_lock(); ops = rcu_dereference(fence->ops); - if (ops->signaled && ops->signaled(fence)) { + if (ops && ops->signaled && ops->signaled(fence)) { rcu_read_unlock(); dma_fence_signal_locked(fence); return true; @@ -508,7 +508,7 @@ dma_fence_is_signaled(struct dma_fence *fence) rcu_read_lock(); ops = rcu_dereference(fence->ops); - if (ops->signaled && ops->signaled(fence)) { + if (ops && ops->signaled && ops->signaled(fence)) { rcu_read_unlock(); dma_fence_signal(fence); return true; -- cgit v1.2.3 From 3e5067931b5df667f5350fafe4410554e228e53e Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 9 Oct 2025 10:40:06 +0200 Subject: dma-buf: abstract fence locking v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add dma_fence_lock_irqsafe() and dma_fence_unlock_irqrestore() wrappers and mechanically apply them everywhere. Just a pre-requisite cleanup for a follow up patch. v2: add some missing i915 bits, add abstraction for lockdep assertion as well v3: one more suggestion by Tvrtko Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Link: https://lore.kernel.org/r/20260219160822.1529-4-christian.koenig@amd.com --- drivers/dma-buf/dma-fence.c | 48 +++++++++++++---------------- drivers/dma-buf/st-dma-fence.c | 6 ++-- drivers/dma-buf/sw_sync.c | 14 ++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 4 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 2 +- drivers/gpu/drm/i915/i915_active.c | 19 +++++++----- drivers/gpu/drm/nouveau/nouveau_drm.c | 5 +-- drivers/gpu/drm/scheduler/sched_fence.c | 6 ++-- drivers/gpu/drm/xe/xe_sched_job.c | 4 +-- include/linux/dma-fence.h | 38 +++++++++++++++++++++++ 12 files changed, 96 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index 3279d82ffa98..698260c49f52 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -366,7 +366,7 @@ void dma_fence_signal_timestamp_locked(struct dma_fence *fence, struct dma_fence_cb *cur, *tmp; struct list_head cb_list; - lockdep_assert_held(fence->lock); + dma_fence_assert_held(fence); if (unlikely(test_and_set_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))) @@ -414,9 +414,9 @@ void dma_fence_signal_timestamp(struct dma_fence *fence, ktime_t timestamp) if (WARN_ON(!fence)) return; - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); dma_fence_signal_timestamp_locked(fence, timestamp); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); } EXPORT_SYMBOL(dma_fence_signal_timestamp); @@ -475,9 +475,9 @@ bool dma_fence_check_and_signal(struct dma_fence *fence) unsigned long flags; bool ret; - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); ret = dma_fence_check_and_signal_locked(fence); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); return ret; } @@ -503,9 +503,9 @@ void dma_fence_signal(struct dma_fence *fence) tmp = dma_fence_begin_signalling(); - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); dma_fence_signal_timestamp_locked(fence, ktime_get()); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); dma_fence_end_signalling(tmp); } @@ -606,10 +606,10 @@ void dma_fence_release(struct kref *kref) * don't leave chains dangling. We set the error flag first * so that the callbacks know this signal is due to an error. */ - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); fence->error = -EDEADLK; dma_fence_signal_locked(fence); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); } ops = rcu_dereference(fence->ops); @@ -639,7 +639,7 @@ static bool __dma_fence_enable_signaling(struct dma_fence *fence) const struct dma_fence_ops *ops; bool was_set; - lockdep_assert_held(fence->lock); + dma_fence_assert_held(fence); was_set = test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &fence->flags); @@ -675,9 +675,9 @@ void dma_fence_enable_sw_signaling(struct dma_fence *fence) { unsigned long flags; - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); __dma_fence_enable_signaling(fence); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); } EXPORT_SYMBOL(dma_fence_enable_sw_signaling); @@ -717,8 +717,7 @@ int dma_fence_add_callback(struct dma_fence *fence, struct dma_fence_cb *cb, return -ENOENT; } - spin_lock_irqsave(fence->lock, flags); - + dma_fence_lock_irqsave(fence, flags); if (__dma_fence_enable_signaling(fence)) { cb->func = func; list_add_tail(&cb->node, &fence->cb_list); @@ -726,8 +725,7 @@ int dma_fence_add_callback(struct dma_fence *fence, struct dma_fence_cb *cb, INIT_LIST_HEAD(&cb->node); ret = -ENOENT; } - - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); return ret; } @@ -750,9 +748,9 @@ int dma_fence_get_status(struct dma_fence *fence) unsigned long flags; int status; - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); status = dma_fence_get_status_locked(fence); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); return status; } @@ -782,13 +780,11 @@ dma_fence_remove_callback(struct dma_fence *fence, struct dma_fence_cb *cb) unsigned long flags; bool ret; - spin_lock_irqsave(fence->lock, flags); - + dma_fence_lock_irqsave(fence, flags); ret = !list_empty(&cb->node); if (ret) list_del_init(&cb->node); - - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); return ret; } @@ -827,7 +823,7 @@ dma_fence_default_wait(struct dma_fence *fence, bool intr, signed long timeout) unsigned long flags; signed long ret = timeout ? timeout : 1; - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); if (dma_fence_test_signaled_flag(fence)) goto out; @@ -851,11 +847,11 @@ dma_fence_default_wait(struct dma_fence *fence, bool intr, signed long timeout) __set_current_state(TASK_INTERRUPTIBLE); else __set_current_state(TASK_UNINTERRUPTIBLE); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); ret = schedule_timeout(ret); - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); if (ret > 0 && intr && signal_pending(current)) ret = -ERESTARTSYS; } @@ -865,7 +861,7 @@ dma_fence_default_wait(struct dma_fence *fence, bool intr, signed long timeout) __set_current_state(TASK_RUNNING); out: - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); return ret; } EXPORT_SYMBOL(dma_fence_default_wait); diff --git a/drivers/dma-buf/st-dma-fence.c b/drivers/dma-buf/st-dma-fence.c index 73ed6fd48a13..5d0d9abc6e21 100644 --- a/drivers/dma-buf/st-dma-fence.c +++ b/drivers/dma-buf/st-dma-fence.c @@ -410,8 +410,10 @@ struct race_thread { static void __wait_for_callbacks(struct dma_fence *f) { - spin_lock_irq(f->lock); - spin_unlock_irq(f->lock); + unsigned long flags; + + dma_fence_lock_irqsave(f, flags); + dma_fence_unlock_irqrestore(f, flags); } static int thread_signal_callback(void *arg) diff --git a/drivers/dma-buf/sw_sync.c b/drivers/dma-buf/sw_sync.c index 963a72324d16..8df20b0218a9 100644 --- a/drivers/dma-buf/sw_sync.c +++ b/drivers/dma-buf/sw_sync.c @@ -156,12 +156,12 @@ static void timeline_fence_release(struct dma_fence *fence) struct sync_timeline *parent = dma_fence_parent(fence); unsigned long flags; - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); if (!list_empty(&pt->link)) { list_del(&pt->link); rb_erase(&pt->node, &parent->pt_tree); } - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); sync_timeline_put(parent); dma_fence_free(fence); @@ -179,7 +179,7 @@ static void timeline_fence_set_deadline(struct dma_fence *fence, ktime_t deadlin struct sync_pt *pt = dma_fence_to_sync_pt(fence); unsigned long flags; - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); if (test_bit(SW_SYNC_HAS_DEADLINE_BIT, &fence->flags)) { if (ktime_before(deadline, pt->deadline)) pt->deadline = deadline; @@ -187,7 +187,7 @@ static void timeline_fence_set_deadline(struct dma_fence *fence, ktime_t deadlin pt->deadline = deadline; __set_bit(SW_SYNC_HAS_DEADLINE_BIT, &fence->flags); } - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); } static const struct dma_fence_ops timeline_fence_ops = { @@ -431,13 +431,13 @@ static int sw_sync_ioctl_get_deadline(struct sync_timeline *obj, unsigned long a goto put_fence; } - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); if (!test_bit(SW_SYNC_HAS_DEADLINE_BIT, &fence->flags)) { ret = -ENOENT; goto unlock; } data.deadline_ns = ktime_to_ns(pt->deadline); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); dma_fence_put(fence); @@ -450,7 +450,7 @@ static int sw_sync_ioctl_get_deadline(struct sync_timeline *obj, unsigned long a return 0; unlock: - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); put_fence: dma_fence_put(fence); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 4638a686a84e..7c047f5a1549 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -479,10 +479,10 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid, if (amdgpu_sriov_vf(ring->adev) || !ring->funcs->soft_recovery || !fence) return false; - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); if (!dma_fence_is_signaled_locked(fence)) dma_fence_set_error(fence, -ENODATA); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); while (!dma_fence_is_signaled(fence) && ktime_to_ns(ktime_sub(deadline, ktime_get())) > 0) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index f2beb980e3c3..8b095087feb4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2785,8 +2785,8 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) dma_fence_put(vm->last_unlocked); dma_fence_wait(vm->last_tlb_flush, false); /* Make sure that all fence callbacks have completed */ - spin_lock_irqsave(vm->last_tlb_flush->lock, flags); - spin_unlock_irqrestore(vm->last_tlb_flush->lock, flags); + dma_fence_lock_irqsave(vm->last_tlb_flush, flags); + dma_fence_unlock_irqrestore(vm->last_tlb_flush, flags); dma_fence_put(vm->last_tlb_flush); list_for_each_entry_safe(mapping, tmp, &vm->freed, list) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 806d62ed61ef..a914ceec90aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -639,7 +639,7 @@ static inline uint64_t amdgpu_vm_tlb_seq(struct amdgpu_vm *vm) * sure that the dma_fence structure isn't freed up. */ rcu_read_lock(); - lock = vm->last_tlb_flush->lock; + lock = dma_fence_spinlock(vm->last_tlb_flush); rcu_read_unlock(); spin_lock_irqsave(lock, flags); diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c index a2b413982ce6..c10ac0ab3bfa 100644 --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c @@ -148,7 +148,7 @@ __dma_fence_signal__notify(struct dma_fence *fence, { struct dma_fence_cb *cur, *tmp; - lockdep_assert_held(fence->lock); + dma_fence_assert_held(fence); list_for_each_entry_safe(cur, tmp, list, node) { INIT_LIST_HEAD(&cur->node); diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c index 25c46d7b1ea7..cd44cbfb53b5 100644 --- a/drivers/gpu/drm/i915/i915_active.c +++ b/drivers/gpu/drm/i915/i915_active.c @@ -1045,9 +1045,10 @@ __i915_active_fence_set(struct i915_active_fence *active, * nesting rules for the fence->lock; the inner lock is always the * older lock. */ - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); if (prev) - spin_lock_nested(prev->lock, SINGLE_DEPTH_NESTING); + spin_lock_nested(dma_fence_spinlock(prev), + SINGLE_DEPTH_NESTING); /* * A does the cmpxchg first, and so it sees C or NULL, as before, or @@ -1061,17 +1062,18 @@ __i915_active_fence_set(struct i915_active_fence *active, */ while (cmpxchg(__active_fence_slot(active), prev, fence) != prev) { if (prev) { - spin_unlock(prev->lock); + spin_unlock(dma_fence_spinlock(prev)); dma_fence_put(prev); } - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); prev = i915_active_fence_get(active); GEM_BUG_ON(prev == fence); - spin_lock_irqsave(fence->lock, flags); + dma_fence_lock_irqsave(fence, flags); if (prev) - spin_lock_nested(prev->lock, SINGLE_DEPTH_NESTING); + spin_lock_nested(dma_fence_spinlock(prev), + SINGLE_DEPTH_NESTING); } /* @@ -1088,10 +1090,11 @@ __i915_active_fence_set(struct i915_active_fence *active, */ if (prev) { __list_del_entry(&active->cb.node); - spin_unlock(prev->lock); /* serialise with prev->cb_list */ + /* serialise with prev->cb_list */ + spin_unlock(dma_fence_spinlock(prev)); } list_add_tail(&active->cb.node, &fence->cb_list); - spin_unlock_irqrestore(fence->lock, flags); + dma_fence_unlock_irqrestore(fence, flags); return prev; } diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index cb22237ac17d..17c114645d9f 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -156,12 +156,13 @@ nouveau_name(struct drm_device *dev) static inline bool nouveau_cli_work_ready(struct dma_fence *fence) { + unsigned long flags; bool ret = true; - spin_lock_irq(fence->lock); + dma_fence_lock_irqsave(fence, flags); if (!dma_fence_is_signaled_locked(fence)) ret = false; - spin_unlock_irq(fence->lock); + dma_fence_unlock_irqrestore(fence, flags); if (ret == true) dma_fence_put(fence); diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c index a27786cb86fb..096fe28aa9c9 100644 --- a/drivers/gpu/drm/scheduler/sched_fence.c +++ b/drivers/gpu/drm/scheduler/sched_fence.c @@ -156,19 +156,19 @@ static void drm_sched_fence_set_deadline_finished(struct dma_fence *f, struct dma_fence *parent; unsigned long flags; - spin_lock_irqsave(&fence->lock, flags); + dma_fence_lock_irqsave(f, flags); /* If we already have an earlier deadline, keep it: */ if (test_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, &f->flags) && ktime_before(fence->deadline, deadline)) { - spin_unlock_irqrestore(&fence->lock, flags); + dma_fence_unlock_irqrestore(f, flags); return; } fence->deadline = deadline; set_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, &f->flags); - spin_unlock_irqrestore(&fence->lock, flags); + dma_fence_unlock_irqrestore(f, flags); /* * smp_load_aquire() to ensure that if we are racing another diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c index 3927666fe556..ae5b38b2a884 100644 --- a/drivers/gpu/drm/xe/xe_sched_job.c +++ b/drivers/gpu/drm/xe/xe_sched_job.c @@ -190,11 +190,11 @@ static bool xe_fence_set_error(struct dma_fence *fence, int error) unsigned long irq_flags; bool signaled; - spin_lock_irqsave(fence->lock, irq_flags); + dma_fence_lock_irqsave(fence, irq_flags); signaled = test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags); if (!signaled) dma_fence_set_error(fence, error); - spin_unlock_irqrestore(fence->lock, irq_flags); + dma_fence_unlock_irqrestore(fence, irq_flags); return signaled; } diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 9ff2c4a09cdc..85d6eac9fa85 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -377,6 +377,44 @@ dma_fence_get_rcu_safe(struct dma_fence __rcu **fencep) } while (1); } +/** + * dma_fence_spinlock - return pointer to the spinlock protecting the fence + * @fence: the fence to get the lock from + * + * Return the pointer to the extern lock. + */ +static inline spinlock_t *dma_fence_spinlock(struct dma_fence *fence) +{ + return fence->lock; +} + +/** + * dma_fence_lock_irqsave - irqsave lock the fence + * @fence: the fence to lock + * @flags: where to store the CPU flags. + * + * Lock the fence, preventing it from changing to the signaled state. + */ +#define dma_fence_lock_irqsave(fence, flags) \ + spin_lock_irqsave(fence->lock, flags) + +/** + * dma_fence_unlock_irqrestore - unlock the fence and irqrestore + * @fence: the fence to unlock + * @flags the CPU flags to restore + * + * Unlock the fence, allowing it to change it's state to signaled again. + */ +#define dma_fence_unlock_irqrestore(fence, flags) \ + spin_unlock_irqrestore(fence->lock, flags) + +/** + * dma_fence_assert_held - lockdep assertion that fence is locked + * @fence: the fence which should be locked + */ +#define dma_fence_assert_held(fence) \ + lockdep_assert_held(dma_fence_spinlock(fence)); + #ifdef CONFIG_LOCKDEP bool dma_fence_begin_signalling(void); void dma_fence_end_signalling(bool cookie); -- cgit v1.2.3 From 1f32f310a13c9fb67a9993ab67f596b3f960206f Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 9 Oct 2025 10:40:06 +0200 Subject: dma-buf: inline spinlock for fence protection v5 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement per-fence spinlocks, allowing implementations to not give an external spinlock to protect the fence internal state. Instead a spinlock embedded into the fence structure itself is used in this case. Shared spinlocks have the problem that implementations need to guarantee that the lock lives at least as long all fences referencing them. Using a per-fence spinlock allows completely decoupling spinlock producer and consumer life times, simplifying the handling in most use cases. v2: improve naming, coverage and function documentation v3: fix one additional locking in the selftests v4: separate out some changes to make the patch smaller, fix one amdgpu crash found by CI systems v5: improve comments Signed-off-by: Christian König Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/r/20260219160822.1529-5-christian.koenig@amd.com --- drivers/dma-buf/dma-fence.c | 21 ++++++++++++++++----- drivers/dma-buf/sync_debug.h | 2 +- drivers/gpu/drm/drm_crtc.c | 2 +- drivers/gpu/drm/drm_writeback.c | 2 +- drivers/gpu/drm/nouveau/nouveau_fence.c | 3 ++- drivers/gpu/drm/qxl/qxl_release.c | 3 ++- drivers/gpu/drm/vmwgfx/vmwgfx_fence.c | 3 ++- drivers/gpu/drm/xe/xe_hw_fence.c | 3 ++- include/linux/dma-fence.h | 19 +++++++++++++------ 9 files changed, 40 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index 698260c49f52..4ad863d2a52c 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -343,7 +343,6 @@ void __dma_fence_might_wait(void) } #endif - /** * dma_fence_signal_timestamp_locked - signal completion of a fence * @fence: the fence to signal @@ -1070,7 +1069,6 @@ static void __dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, spinlock_t *lock, u64 context, u64 seqno, unsigned long flags) { - BUG_ON(!lock); BUG_ON(!ops || !ops->get_driver_name || !ops->get_timeline_name); kref_init(&fence->refcount); @@ -1082,10 +1080,15 @@ __dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, */ RCU_INIT_POINTER(fence->ops, ops); INIT_LIST_HEAD(&fence->cb_list); - fence->lock = lock; fence->context = context; fence->seqno = seqno; fence->flags = flags | BIT(DMA_FENCE_FLAG_INITIALIZED_BIT); + if (lock) { + fence->extern_lock = lock; + } else { + spin_lock_init(&fence->inline_lock); + fence->flags |= BIT(DMA_FENCE_FLAG_INLINE_LOCK_BIT); + } fence->error = 0; trace_dma_fence_init(fence); @@ -1095,7 +1098,7 @@ __dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, * dma_fence_init - Initialize a custom fence. * @fence: the fence to initialize * @ops: the dma_fence_ops for operations on this fence - * @lock: the irqsafe spinlock to use for locking this fence + * @lock: optional irqsafe spinlock to use for locking this fence * @context: the execution context this fence is run on * @seqno: a linear increasing sequence number for this context * @@ -1105,6 +1108,10 @@ __dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, * * context and seqno are used for easy comparison between fences, allowing * to check which fence is later by simply using dma_fence_later(). + * + * It is strongly discouraged to provide an external lock because this couples + * lock and fence life time. This is only allowed for legacy use cases when + * multiple fences need to be prevented from signaling out of order. */ void dma_fence_init(struct dma_fence *fence, const struct dma_fence_ops *ops, @@ -1118,7 +1125,7 @@ EXPORT_SYMBOL(dma_fence_init); * dma_fence_init64 - Initialize a custom fence with 64-bit seqno support. * @fence: the fence to initialize * @ops: the dma_fence_ops for operations on this fence - * @lock: the irqsafe spinlock to use for locking this fence + * @lock: optional irqsafe spinlock to use for locking this fence * @context: the execution context this fence is run on * @seqno: a linear increasing sequence number for this context * @@ -1128,6 +1135,10 @@ EXPORT_SYMBOL(dma_fence_init); * * Context and seqno are used for easy comparison between fences, allowing * to check which fence is later by simply using dma_fence_later(). + * + * It is strongly discouraged to provide an external lock because this couples + * lock and fence life time. This is only allowed for legacy use cases when + * multiple fences need to be prevented from signaling out of order. */ void dma_fence_init64(struct dma_fence *fence, const struct dma_fence_ops *ops, diff --git a/drivers/dma-buf/sync_debug.h b/drivers/dma-buf/sync_debug.h index 02af347293d0..c49324505b20 100644 --- a/drivers/dma-buf/sync_debug.h +++ b/drivers/dma-buf/sync_debug.h @@ -47,7 +47,7 @@ struct sync_timeline { static inline struct sync_timeline *dma_fence_parent(struct dma_fence *fence) { - return container_of(fence->lock, struct sync_timeline, lock); + return container_of(fence->extern_lock, struct sync_timeline, lock); } /** diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c index 960fdc1cc6ba..8d6f721c2c9a 100644 --- a/drivers/gpu/drm/drm_crtc.c +++ b/drivers/gpu/drm/drm_crtc.c @@ -159,7 +159,7 @@ static const struct dma_fence_ops drm_crtc_fence_ops; static struct drm_crtc *fence_to_crtc(struct dma_fence *fence) { BUG_ON(rcu_access_pointer(fence->ops) != &drm_crtc_fence_ops); - return container_of(fence->lock, struct drm_crtc, fence_lock); + return container_of(fence->extern_lock, struct drm_crtc, fence_lock); } static const char *drm_crtc_fence_get_driver_name(struct dma_fence *fence) diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c index 09362cf4f22f..4da5d6094721 100644 --- a/drivers/gpu/drm/drm_writeback.c +++ b/drivers/gpu/drm/drm_writeback.c @@ -81,7 +81,7 @@ * From userspace, this property will always read as zero. */ -#define fence_to_wb_connector(x) container_of(x->lock, \ +#define fence_to_wb_connector(x) container_of(x->extern_lock, \ struct drm_writeback_connector, \ fence_lock) diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c index 903d326927ca..edbe9e08ba0f 100644 --- a/drivers/gpu/drm/nouveau/nouveau_fence.c +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c @@ -41,7 +41,8 @@ static const struct dma_fence_ops nouveau_fence_ops_legacy; static inline struct nouveau_fence_chan * nouveau_fctx(struct nouveau_fence *fence) { - return container_of(fence->base.lock, struct nouveau_fence_chan, lock); + return container_of(fence->base.extern_lock, struct nouveau_fence_chan, + lock); } static bool diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c index 720d6d57151c..06979d0e8a9f 100644 --- a/drivers/gpu/drm/qxl/qxl_release.c +++ b/drivers/gpu/drm/qxl/qxl_release.c @@ -62,7 +62,8 @@ static long qxl_fence_wait(struct dma_fence *fence, bool intr, struct qxl_device *qdev; unsigned long cur, end = jiffies + timeout; - qdev = container_of(fence->lock, struct qxl_device, release_lock); + qdev = container_of(fence->extern_lock, struct qxl_device, + release_lock); if (!wait_event_timeout(qdev->release_event, (dma_fence_is_signaled(fence) || diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c index 3469e2c9e706..4ef84ff9b638 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c @@ -47,7 +47,8 @@ struct vmw_event_fence_action { static struct vmw_fence_manager * fman_from_fence(struct vmw_fence_obj *fence) { - return container_of(fence->base.lock, struct vmw_fence_manager, lock); + return container_of(fence->base.extern_lock, struct vmw_fence_manager, + lock); } static void vmw_fence_obj_destroy(struct dma_fence *f) diff --git a/drivers/gpu/drm/xe/xe_hw_fence.c b/drivers/gpu/drm/xe/xe_hw_fence.c index ae8ed15b64c5..14720623ad00 100644 --- a/drivers/gpu/drm/xe/xe_hw_fence.c +++ b/drivers/gpu/drm/xe/xe_hw_fence.c @@ -124,7 +124,8 @@ static struct xe_hw_fence *to_xe_hw_fence(struct dma_fence *fence); static struct xe_hw_fence_irq *xe_hw_fence_irq(struct xe_hw_fence *fence) { - return container_of(fence->dma.lock, struct xe_hw_fence_irq, lock); + return container_of(fence->dma.extern_lock, struct xe_hw_fence_irq, + lock); } static const char *xe_hw_fence_get_driver_name(struct dma_fence *dma_fence) diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 85d6eac9fa85..3dc93f068bf6 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -34,7 +34,8 @@ struct seq_file; * @ops: dma_fence_ops associated with this fence * @rcu: used for releasing fence with kfree_rcu * @cb_list: list of all callbacks to call - * @lock: spin_lock_irqsave used for locking + * @extern_lock: external spin_lock_irqsave used for locking (deprecated) + * @inline_lock: alternative internal spin_lock_irqsave used for locking * @context: execution context this fence belongs to, returned by * dma_fence_context_alloc() * @seqno: the sequence number of this fence inside the execution context, @@ -49,6 +50,7 @@ struct seq_file; * of the time. * * DMA_FENCE_FLAG_INITIALIZED_BIT - fence was initialized + * DMA_FENCE_FLAG_INLINE_LOCK_BIT - use inline spinlock instead of external one * DMA_FENCE_FLAG_SIGNALED_BIT - fence is already signaled * DMA_FENCE_FLAG_TIMESTAMP_BIT - timestamp recorded for fence signaling * DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT - enable_signaling might have been called @@ -66,7 +68,10 @@ struct seq_file; * been completed, or never called at all. */ struct dma_fence { - spinlock_t *lock; + union { + spinlock_t *extern_lock; + spinlock_t inline_lock; + }; const struct dma_fence_ops __rcu *ops; /* * We clear the callback list on kref_put so that by the time we @@ -100,6 +105,7 @@ struct dma_fence { enum dma_fence_flag_bits { DMA_FENCE_FLAG_INITIALIZED_BIT, + DMA_FENCE_FLAG_INLINE_LOCK_BIT, DMA_FENCE_FLAG_SEQNO64_BIT, DMA_FENCE_FLAG_SIGNALED_BIT, DMA_FENCE_FLAG_TIMESTAMP_BIT, @@ -381,11 +387,12 @@ dma_fence_get_rcu_safe(struct dma_fence __rcu **fencep) * dma_fence_spinlock - return pointer to the spinlock protecting the fence * @fence: the fence to get the lock from * - * Return the pointer to the extern lock. + * Return either the pointer to the embedded or the external spin lock. */ static inline spinlock_t *dma_fence_spinlock(struct dma_fence *fence) { - return fence->lock; + return test_bit(DMA_FENCE_FLAG_INLINE_LOCK_BIT, &fence->flags) ? + &fence->inline_lock : fence->extern_lock; } /** @@ -396,7 +403,7 @@ static inline spinlock_t *dma_fence_spinlock(struct dma_fence *fence) * Lock the fence, preventing it from changing to the signaled state. */ #define dma_fence_lock_irqsave(fence, flags) \ - spin_lock_irqsave(fence->lock, flags) + spin_lock_irqsave(dma_fence_spinlock(fence), flags) /** * dma_fence_unlock_irqrestore - unlock the fence and irqrestore @@ -406,7 +413,7 @@ static inline spinlock_t *dma_fence_spinlock(struct dma_fence *fence) * Unlock the fence, allowing it to change it's state to signaled again. */ #define dma_fence_unlock_irqrestore(fence, flags) \ - spin_unlock_irqrestore(fence->lock, flags) + spin_unlock_irqrestore(dma_fence_spinlock(fence), flags) /** * dma_fence_assert_held - lockdep assertion that fence is locked -- cgit v1.2.3 From 5943243914b9fed8e26edcb9d45421721a5e3576 Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 9 Oct 2025 16:18:53 +0200 Subject: dma-buf: use inline lock for the dma-fence-array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using the inline lock is now the recommended way for dma_fence implementations. So use this approach for the framework's internal fences as well. Also saves about 4 bytes for the external spinlock. Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Reviewed-by: Philipp Stanner Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/r/20260219160822.1529-8-christian.koenig@amd.com --- drivers/dma-buf/dma-fence-array.c | 5 ++--- include/linux/dma-fence-array.h | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-fence-array.c b/drivers/dma-buf/dma-fence-array.c index 37e2c6179d77..cd970eceaefb 100644 --- a/drivers/dma-buf/dma-fence-array.c +++ b/drivers/dma-buf/dma-fence-array.c @@ -204,9 +204,8 @@ void dma_fence_array_init(struct dma_fence_array *array, array->num_fences = num_fences; - spin_lock_init(&array->lock); - dma_fence_init(&array->base, &dma_fence_array_ops, &array->lock, - context, seqno); + dma_fence_init(&array->base, &dma_fence_array_ops, NULL, context, + seqno); init_irq_work(&array->work, irq_dma_fence_array_work); atomic_set(&array->num_pending, signal_on_any ? 1 : num_fences); diff --git a/include/linux/dma-fence-array.h b/include/linux/dma-fence-array.h index 079b3dec0a16..370b3d2bba37 100644 --- a/include/linux/dma-fence-array.h +++ b/include/linux/dma-fence-array.h @@ -38,7 +38,6 @@ struct dma_fence_array_cb { struct dma_fence_array { struct dma_fence base; - spinlock_t lock; unsigned num_fences; atomic_t num_pending; struct dma_fence **fences; -- cgit v1.2.3 From a408c0ca0c411ca1ead995bdae3112a806c87556 Mon Sep 17 00:00:00 2001 From: Christian König Date: Thu, 9 Oct 2025 16:32:33 +0200 Subject: dma-buf: use inline lock for the dma-fence-chain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using the inline lock is now the recommended way for dma_fence implementations. So use this approach for the framework's internal fences as well. Also saves about 4 bytes for the external spinlock. Signed-off-by: Christian König Reviewed-by: Tvrtko Ursulin Reviewed-by: Philipp Stanner Reviewed-by: Boris Brezillon Link: https://lore.kernel.org/r/20260219160822.1529-9-christian.koenig@amd.com --- drivers/dma-buf/dma-fence-chain.c | 3 +-- include/linux/dma-fence-chain.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-fence-chain.c b/drivers/dma-buf/dma-fence-chain.c index a8a90acf4f34..a707792b6025 100644 --- a/drivers/dma-buf/dma-fence-chain.c +++ b/drivers/dma-buf/dma-fence-chain.c @@ -245,7 +245,6 @@ void dma_fence_chain_init(struct dma_fence_chain *chain, struct dma_fence_chain *prev_chain = to_dma_fence_chain(prev); uint64_t context; - spin_lock_init(&chain->lock); rcu_assign_pointer(chain->prev, prev); chain->fence = fence; chain->prev_seqno = 0; @@ -261,7 +260,7 @@ void dma_fence_chain_init(struct dma_fence_chain *chain, seqno = max(prev->seqno, seqno); } - dma_fence_init64(&chain->base, &dma_fence_chain_ops, &chain->lock, + dma_fence_init64(&chain->base, &dma_fence_chain_ops, NULL, context, seqno); /* diff --git a/include/linux/dma-fence-chain.h b/include/linux/dma-fence-chain.h index 5cd3ba53b4a1..df3beadf1515 100644 --- a/include/linux/dma-fence-chain.h +++ b/include/linux/dma-fence-chain.h @@ -46,7 +46,6 @@ struct dma_fence_chain { */ struct irq_work work; }; - spinlock_t lock; }; -- cgit v1.2.3 From 9fe89f022c05d99c052d6bc088b82d4ff83bf463 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 27 Jan 2026 16:17:48 +0100 Subject: sched/fair: More complex proportional newidle balance It turns out that a few workloads (easyWave, fio) have a fairly low success rate on newidle balance, but still benefit greatly from having it anyway. Luckliky these workloads have a faily low newidle rate, so the cost if doing the newidle is relatively low, even if unsuccessfull. Add a simple rate based part to the newidle ratio compute, such that low rate newidle will still have a high newidle ratio. This cures the easyWave and fio workloads while not affecting the schbench numbers either (which have a very high newidle rate). Reported-by: Mario Roy Reported-by: "Mohamed Abuelfotoh, Hazem" Signed-off-by: Peter Zijlstra (Intel) Tested-by: Mario Roy Tested-by: "Mohamed Abuelfotoh, Hazem" Link: https://patch.msgid.link/20260127151748.GA1079264@noisy.programming.kicks-ass.net --- include/linux/sched/topology.h | 1 + kernel/sched/fair.c | 27 +++++++++++++++++++++++++-- kernel/sched/features.h | 1 + kernel/sched/topology.c | 3 +++ 4 files changed, 30 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 45c0022b91ce..a1e1032426dc 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -95,6 +95,7 @@ struct sched_domain { unsigned int newidle_call; unsigned int newidle_success; unsigned int newidle_ratio; + u64 newidle_stamp; u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bf948db905ed..66afa0ac7396 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12289,7 +12289,30 @@ static inline void update_newidle_stats(struct sched_domain *sd, unsigned int su sd->newidle_success += success; if (sd->newidle_call >= 1024) { - sd->newidle_ratio = sd->newidle_success; + u64 now = sched_clock(); + s64 delta = now - sd->newidle_stamp; + sd->newidle_stamp = now; + int ratio = 0; + + if (delta < 0) + delta = 0; + + if (sched_feat(NI_RATE)) { + /* + * ratio delta freq + * + * 1024 - 4 s - 128 Hz + * 512 - 2 s - 256 Hz + * 256 - 1 s - 512 Hz + * 128 - .5 s - 1024 Hz + * 64 - .25 s - 2048 Hz + */ + ratio = delta >> 22; + } + + ratio += sd->newidle_success; + + sd->newidle_ratio = min(1024, ratio); sd->newidle_call /= 2; sd->newidle_success /= 2; } @@ -12996,7 +13019,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) if (sd->flags & SD_BALANCE_NEWIDLE) { unsigned int weight = 1; - if (sched_feat(NI_RANDOM)) { + if (sched_feat(NI_RANDOM) && sd->newidle_ratio < 1024) { /* * Throw a 1k sided dice; and only run * newidle_balance according to the success diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 136a6584be79..37d5928fa6dd 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -126,3 +126,4 @@ SCHED_FEAT(LATENCY_WARN, false) * Do newidle balancing proportional to its success rate using randomization. */ SCHED_FEAT(NI_RANDOM, true) +SCHED_FEAT(NI_RATE, true) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 32dcddaead82..061f8c85f555 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -4,6 +4,7 @@ */ #include +#include #include #include "sched.h" @@ -1642,6 +1643,7 @@ sd_init(struct sched_domain_topology_level *tl, struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); int sd_id, sd_weight, sd_flags = 0; struct cpumask *sd_span; + u64 now = sched_clock(); sd_weight = cpumask_weight(tl->mask(tl, cpu)); @@ -1679,6 +1681,7 @@ sd_init(struct sched_domain_topology_level *tl, .newidle_call = 512, .newidle_success = 256, .newidle_ratio = 512, + .newidle_stamp = now, .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, -- cgit v1.2.3 From be6d4c9e9d714ebbf358be41332726a0f94b9ffa Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 31 Jan 2026 07:34:16 +0200 Subject: dma-buf: Add dma_buf_attach_revocable() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some exporters need a flow to synchronously revoke access to the DMA-buf by importers. Once revoke is completed the importer is not permitted to touch the memory otherwise they may get IOMMU faults, AERs, or worse. DMA-buf today defines a revoke flow, for both pinned and dynamic importers, which is broadly: dma_resv_lock(dmabuf->resv, NULL); // Prevent new mappings from being established priv->revoked = true; // Tell all importers to eventually unmap dma_buf_invalidate_mappings(dmabuf); // Wait for any inprogress fences on the old mapping dma_resv_wait_timeout(dmabuf->resv, DMA_RESV_USAGE_BOOKKEEP, false, MAX_SCHEDULE_TIMEOUT); dma_resv_unlock(dmabuf->resv, NULL); // Wait for all importers to complete unmap wait_for_completion(&priv->unmapped_comp); This works well, and an importer that continues to access the DMA-buf after unmapping it is very buggy. However, the final wait for unmap is effectively unbounded. Several importers do not support invalidate_mappings() at all and won't unmap until userspace triggers it. This unbounded wait is not suitable for exporters like VFIO and RDMA tha need to issue revoke as part of their normal operations. Add dma_buf_attach_revocable() to allow exporters to determine the difference between importers that can complete the above in bounded time, and those that can't. It can be called inside the exporter's attach op to reject incompatible importers. Document these details about how dma_buf_invalidate_mappings() works and what the required sequence is to achieve a full revocation. Signed-off-by: Leon Romanovsky Reviewed-by: Christian König Signed-off-by: Christian König Link: https://lore.kernel.org/r/20260131-dmabuf-revoke-v7-6-463d956bd527@nvidia.com --- drivers/dma-buf/dma-buf.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++- include/linux/dma-buf.h | 9 +++------ 2 files changed, 50 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 3b32f15fbc18..a202a308c079 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1318,13 +1318,59 @@ void dma_buf_unmap_attachment_unlocked(struct dma_buf_attachment *attach, } EXPORT_SYMBOL_NS_GPL(dma_buf_unmap_attachment_unlocked, "DMA_BUF"); +/** + * dma_buf_attach_revocable - check if a DMA-buf importer implements + * revoke semantics. + * @attach: the DMA-buf attachment to check + * + * Returns true if the DMA-buf importer can support the revoke sequence + * explained in dma_buf_invalidate_mappings() within bounded time. Meaning the + * importer implements invalidate_mappings() and ensures that unmap is called as + * a result. + */ +bool dma_buf_attach_revocable(struct dma_buf_attachment *attach) +{ + return attach->importer_ops && + attach->importer_ops->invalidate_mappings; +} +EXPORT_SYMBOL_NS_GPL(dma_buf_attach_revocable, "DMA_BUF"); + /** * dma_buf_invalidate_mappings - notify attachments that DMA-buf is moving * * @dmabuf: [in] buffer which is moving * * Informs all attachments that they need to destroy and recreate all their - * mappings. + * mappings. If the attachment is dynamic then the dynamic importer is expected + * to invalidate any caches it has of the mapping result and perform a new + * mapping request before allowing HW to do any further DMA. + * + * If the attachment is pinned then this informs the pinned importer that the + * underlying mapping is no longer available. Pinned importers may take this is + * as a permanent revocation and never establish new mappings so exporters + * should not trigger it lightly. + * + * Upon return importers may continue to access the DMA-buf memory. The caller + * must do two additional waits to ensure that the memory is no longer being + * accessed: + * 1) Until dma_resv_wait_timeout() retires fences the importer is allowed to + * fully access the memory. + * 2) Until the importer calls unmap it is allowed to speculatively + * read-and-discard the memory. It must not write to the memory. + * + * A caller wishing to use dma_buf_invalidate_mappings() to fully stop access to + * the DMA-buf must wait for both. Dynamic callers can often use just the first. + * + * All importers providing a invalidate_mappings() op must ensure that unmap is + * called within bounded time after the op. + * + * Pinned importers that do not support a invalidate_mappings() op will + * eventually perform unmap when they are done with the buffer, which may be an + * ubounded time from calling this function. dma_buf_attach_revocable() can be + * used to prevent such importers from attaching. + * + * Importers are free to request a new mapping in parallel as this function + * returns. */ void dma_buf_invalidate_mappings(struct dma_buf *dmabuf) { diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index e744b8f9bfad..166933b82e27 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -456,12 +456,8 @@ struct dma_buf_attach_ops { * called with this lock held as well. This makes sure that no mapping * is created concurrently with an ongoing move operation. * - * Mappings stay valid and are not directly affected by this callback. - * But the DMA-buf can now be in a different physical location, so all - * mappings should be destroyed and re-created as soon as possible. - * - * New mappings can be created after this callback returns, and will - * point to the new location of the DMA-buf. + * See the kdoc for dma_buf_invalidate_mappings() for details on the + * required behavior. */ void (*invalidate_mappings)(struct dma_buf_attachment *attach); }; @@ -579,6 +575,7 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *, void dma_buf_unmap_attachment(struct dma_buf_attachment *, struct sg_table *, enum dma_data_direction); void dma_buf_invalidate_mappings(struct dma_buf *dma_buf); +bool dma_buf_attach_revocable(struct dma_buf_attachment *attach); int dma_buf_begin_cpu_access(struct dma_buf *dma_buf, enum dma_data_direction dir); int dma_buf_end_cpu_access(struct dma_buf *dma_buf, -- cgit v1.2.3 From ebf1ccff79c43f860cbd2f9d6cfab9a462d0cb2d Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 18 Feb 2026 09:32:17 +0100 Subject: sched_ext: Fix ops.dequeue() semantics Currently, ops.dequeue() is only invoked when the sched_ext core knows that a task resides in BPF-managed data structures, which causes it to miss scheduling property change events. In addition, ops.dequeue() callbacks are completely skipped when tasks are dispatched to non-local DSQs from ops.select_cpu(). As a result, BPF schedulers cannot reliably track task state. Fix this by guaranteeing that each task entering the BPF scheduler's custody triggers exactly one ops.dequeue() call when it leaves that custody, whether the exit is due to a dispatch (regular or via a core scheduling pick) or to a scheduling property change (e.g. sched_setaffinity(), sched_setscheduler(), set_user_nice(), NUMA balancing, etc.). BPF scheduler custody concept: a task is considered to be in the BPF scheduler's custody when the scheduler is responsible for managing its lifecycle. This includes tasks dispatched to user-created DSQs or stored in the BPF scheduler's internal data structures from ops.enqueue(). Custody ends when the task is dispatched to a terminal DSQ (such as the local DSQ or %SCX_DSQ_GLOBAL), selected by core scheduling, or removed due to a property change. Tasks directly dispatched to terminal DSQs bypass the BPF scheduler entirely and are never in its custody. Terminal DSQs include: - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON): per-CPU queues where tasks go directly to execution. - Global DSQ (%SCX_DSQ_GLOBAL): the built-in fallback queue where the BPF scheduler is considered "done" with the task. As a result, ops.dequeue() is not invoked for tasks directly dispatched to terminal DSQs. To identify dequeues triggered by scheduling property changes, introduce the new ops.dequeue() flag %SCX_DEQ_SCHED_CHANGE: when this flag is set, the dequeue was caused by a scheduling property change. New ops.dequeue() semantics: - ops.dequeue() is invoked exactly once when the task leaves the BPF scheduler's custody, in one of the following cases: a) regular dispatch: a task dispatched to a user DSQ or stored in internal BPF data structures is moved to a terminal DSQ (ops.dequeue() called without any special flags set), b) core scheduling dispatch: core-sched picks task before dispatch (ops.dequeue() called with %SCX_DEQ_CORE_SCHED_EXEC flag set), c) property change: task properties modified before dispatch, (ops.dequeue() called with %SCX_DEQ_SCHED_CHANGE flag set). This allows BPF schedulers to: - reliably track task ownership and lifecycle, - maintain accurate accounting of managed tasks, - update internal state when tasks change properties. Cc: Tejun Heo Cc: Emil Tsalapatis Cc: Kuba Piecuch Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- Documentation/scheduler/sched-ext.rst | 78 +++++++++++++++-- include/linux/sched/ext.h | 1 + kernel/sched/ext.c | 110 ++++++++++++++++++++++-- kernel/sched/ext_internal.h | 7 ++ tools/sched_ext/include/scx/enum_defs.autogen.h | 1 + tools/sched_ext/include/scx/enums.autogen.bpf.h | 2 + tools/sched_ext/include/scx/enums.autogen.h | 1 + 7 files changed, 184 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst index 9e2882d937b4..7cb77fd2e4d7 100644 --- a/Documentation/scheduler/sched-ext.rst +++ b/Documentation/scheduler/sched-ext.rst @@ -228,16 +228,23 @@ The following briefly shows how a waking task is scheduled and executed. scheduler can wake up any cpu using the ``scx_bpf_kick_cpu()`` helper, using ``ops.select_cpu()`` judiciously can be simpler and more efficient. - A task can be immediately inserted into a DSQ from ``ops.select_cpu()`` - by calling ``scx_bpf_dsq_insert()``. If the task is inserted into - ``SCX_DSQ_LOCAL`` from ``ops.select_cpu()``, it will be inserted into the - local DSQ of whichever CPU is returned from ``ops.select_cpu()``. - Additionally, inserting directly from ``ops.select_cpu()`` will cause the - ``ops.enqueue()`` callback to be skipped. - Note that the scheduler core will ignore an invalid CPU selection, for example, if it's outside the allowed cpumask of the task. + A task can be immediately inserted into a DSQ from ``ops.select_cpu()`` + by calling ``scx_bpf_dsq_insert()`` or ``scx_bpf_dsq_insert_vtime()``. + + If the task is inserted into ``SCX_DSQ_LOCAL`` from + ``ops.select_cpu()``, it will be added to the local DSQ of whichever CPU + is returned from ``ops.select_cpu()``. Additionally, inserting directly + from ``ops.select_cpu()`` will cause the ``ops.enqueue()`` callback to + be skipped. + + Any other attempt to store a task in BPF-internal data structures from + ``ops.select_cpu()`` does not prevent ``ops.enqueue()`` from being + invoked. This is discouraged, as it can introduce racy behavior or + inconsistent state. + 2. Once the target CPU is selected, ``ops.enqueue()`` is invoked (unless the task was inserted directly from ``ops.select_cpu()``). ``ops.enqueue()`` can make one of the following decisions: @@ -251,6 +258,61 @@ The following briefly shows how a waking task is scheduled and executed. * Queue the task on the BPF side. + **Task State Tracking and ops.dequeue() Semantics** + + A task is in the "BPF scheduler's custody" when the BPF scheduler is + responsible for managing its lifecycle. A task enters custody when it is + dispatched to a user DSQ or stored in the BPF scheduler's internal data + structures. Custody is entered only from ``ops.enqueue()`` for those + operations. The only exception is dispatching to a user DSQ from + ``ops.select_cpu()``: although the task is not yet technically in BPF + scheduler custody at that point, the dispatch has the same semantic + effect as dispatching from ``ops.enqueue()`` for custody-related + purposes. + + Once ``ops.enqueue()`` is called, the task may or may not enter custody + depending on what the scheduler does: + + * **Directly dispatched to terminal DSQs** (``SCX_DSQ_LOCAL``, + ``SCX_DSQ_LOCAL_ON | cpu``, or ``SCX_DSQ_GLOBAL``): the BPF scheduler + is done with the task - it either goes straight to a CPU's local run + queue or to the global DSQ as a fallback. The task never enters (or + exits) BPF custody, and ``ops.dequeue()`` will not be called. + + * **Dispatch to user-created DSQs** (custom DSQs): the task enters the + BPF scheduler's custody. When the task later leaves BPF custody + (dispatched to a terminal DSQ, picked by core-sched, or dequeued for + sleep/property changes), ``ops.dequeue()`` will be called exactly + once. + + * **Stored in BPF data structures** (e.g., internal BPF queues): the + task is in BPF custody. ``ops.dequeue()`` will be called when it + leaves (e.g., when ``ops.dispatch()`` moves it to a terminal DSQ, or + on property change / sleep). + + When a task leaves BPF scheduler custody, ``ops.dequeue()`` is invoked. + The dequeue can happen for different reasons, distinguished by flags: + + 1. **Regular dispatch**: when a task in BPF custody is dispatched to a + terminal DSQ from ``ops.dispatch()`` (leaving BPF custody for + execution), ``ops.dequeue()`` is triggered without any special flags. + + 2. **Core scheduling pick**: when ``CONFIG_SCHED_CORE`` is enabled and + core scheduling picks a task for execution while it's still in BPF + custody, ``ops.dequeue()`` is called with the + ``SCX_DEQ_CORE_SCHED_EXEC`` flag. + + 3. **Scheduling property change**: when a task property changes (via + operations like ``sched_setaffinity()``, ``sched_setscheduler()``, + priority changes, CPU migrations, etc.) while the task is still in + BPF custody, ``ops.dequeue()`` is called with the + ``SCX_DEQ_SCHED_CHANGE`` flag set in ``deq_flags``. + + **Important**: Once a task has left BPF custody (e.g., after being + dispatched to a terminal DSQ), property changes will not trigger + ``ops.dequeue()``, since the task is no longer managed by the BPF + scheduler. + 3. When a CPU is ready to schedule, it first looks at its local DSQ. If empty, it then looks at the global DSQ. If there still isn't a task to run, ``ops.dispatch()`` is invoked which can use the following two @@ -318,6 +380,8 @@ by a sched_ext scheduler: /* Any usable CPU becomes available */ ops.dispatch(); /* Task is moved to a local DSQ */ + + ops.dequeue(); /* Exiting BPF scheduler */ } ops.running(); /* Task starts running on its assigned CPU */ while (task->scx.slice > 0 && task is runnable) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index bcb962d5ee7d..4601e5ecb43c 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -84,6 +84,7 @@ struct scx_dispatch_q { /* scx_entity.flags */ enum scx_ent_flags { SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ + SCX_TASK_IN_CUSTODY = 1 << 1, /* in custody, needs ops.dequeue() when leaving */ SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 044bb2168dd0..d5e688b9acc0 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -986,12 +986,45 @@ static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); } +/* + * Return true if @p is moving due to an internal SCX migration, false + * otherwise. + */ +static inline bool task_scx_migrating(struct task_struct *p) +{ + /* + * We only need to check sticky_cpu: it is set to the destination + * CPU in move_remote_task_to_local_dsq() before deactivate_task() + * and cleared when the task is enqueued on the destination, so it + * is only non-negative during an internal SCX migration. + */ + return p->scx.sticky_cpu >= 0; +} + +/* + * Call ops.dequeue() if the task is in BPF custody and not migrating. + * Clears %SCX_TASK_IN_CUSTODY when the callback is invoked. + */ +static void call_task_dequeue(struct scx_sched *sch, struct rq *rq, + struct task_struct *p, u64 deq_flags) +{ + if (!(p->scx.flags & SCX_TASK_IN_CUSTODY) || task_scx_migrating(p)) + return; + + if (SCX_HAS_OP(sch, dequeue)) + SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq, p, deq_flags); + + p->scx.flags &= ~SCX_TASK_IN_CUSTODY; +} + static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags) { struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); bool preempt = false; + call_task_dequeue(scx_root, rq, p, 0); + /* * If @rq is in balance, the CPU is already vacant and looking for the * next task to run. No need to preempt or trigger resched after moving @@ -1115,17 +1148,34 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; p->scx.ddsp_enq_flags = 0; + /* + * Update custody and call ops.dequeue() before clearing ops_state: + * once ops_state is cleared, waiters in ops_dequeue() can proceed + * and dequeue_task_scx() will RMW p->scx.flags. If we clear + * ops_state first, both sides would modify p->scx.flags + * concurrently in a non-atomic way. + */ + if (is_local) { + local_dsq_post_enq(dsq, p, enq_flags); + } else { + /* + * Task on global/bypass DSQ: leave custody, task on + * non-terminal DSQ: enter custody. + */ + if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS) + call_task_dequeue(sch, rq, p, 0); + else + p->scx.flags |= SCX_TASK_IN_CUSTODY; + + raw_spin_unlock(&dsq->lock); + } + /* * We're transitioning out of QUEUEING or DISPATCHING. store_release to * match waiters' load_acquire. */ if (enq_flags & SCX_ENQ_CLEAR_OPSS) atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); - - if (is_local) - local_dsq_post_enq(dsq, p, enq_flags); - else - raw_spin_unlock(&dsq->lock); } static void task_unlink_from_dsq(struct task_struct *p, @@ -1405,6 +1455,12 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) goto direct; + /* + * Task is now in BPF scheduler's custody. Set %SCX_TASK_IN_CUSTODY + * so ops.dequeue() is called when it leaves custody. + */ + p->scx.flags |= SCX_TASK_IN_CUSTODY; + /* * If not directly dispatched, QUEUEING isn't clear yet and dispatch or * dequeue may be waiting. The store_release matches their load_acquire. @@ -1522,6 +1578,14 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) { struct scx_sched *sch = scx_root; unsigned long opss; + u64 op_deq_flags = deq_flags; + + /* + * Set %SCX_DEQ_SCHED_CHANGE when the dequeue is due to a property + * change (not sleep or core-sched pick). + */ + if (!(op_deq_flags & (DEQUEUE_SLEEP | SCX_DEQ_CORE_SCHED_EXEC))) + op_deq_flags |= SCX_DEQ_SCHED_CHANGE; /* dequeue is always temporary, don't reset runnable_at */ clr_task_runnable(p, false); @@ -1539,10 +1603,8 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) */ BUG(); case SCX_OPSS_QUEUED: - if (SCX_HAS_OP(sch, dequeue)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq, - p, deq_flags); - + /* A queued task must always be in BPF scheduler's custody */ + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_IN_CUSTODY)); if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, SCX_OPSS_NONE)) break; @@ -1565,6 +1627,22 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); break; } + + /* + * Call ops.dequeue() if the task is still in BPF custody. + * + * The code that clears ops_state to %SCX_OPSS_NONE does not always + * clear %SCX_TASK_IN_CUSTODY: in dispatch_to_local_dsq(), when + * we're moving a task that was in %SCX_OPSS_DISPATCHING to a + * remote CPU's local DSQ, we only set ops_state to %SCX_OPSS_NONE + * so that a concurrent dequeue can proceed, but we clear + * %SCX_TASK_IN_CUSTODY only when we later enqueue or move the + * task. So we can see NONE + IN_CUSTODY here and we must handle + * it. Similarly, after waiting on %SCX_OPSS_DISPATCHING we see + * NONE but the task may still have %SCX_TASK_IN_CUSTODY set until + * it is enqueued on the destination. + */ + call_task_dequeue(sch, rq, p, op_deq_flags); } static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) @@ -2935,6 +3013,13 @@ static void scx_enable_task(struct task_struct *p) lockdep_assert_rq_held(rq); + /* + * Verify the task is not in BPF scheduler's custody. If flag + * transitions are consistent, the flag should always be clear + * here. + */ + WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); + /* * Set the weight before calling ops.enable() so that the scheduler * doesn't see a stale value if they inspect the task struct. @@ -2966,6 +3051,13 @@ static void scx_disable_task(struct task_struct *p) if (SCX_HAS_OP(sch, disable)) SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p); scx_set_task_state(p, SCX_TASK_READY); + + /* + * Verify the task is not in BPF scheduler's custody. If flag + * transitions are consistent, the flag should always be clear + * here. + */ + WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); } static void scx_exit_task(struct task_struct *p) diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index 386c677e4c9a..befa9a5d6e53 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -982,6 +982,13 @@ enum scx_deq_flags { * it hasn't been dispatched yet. Dequeue from the BPF side. */ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, + + /* + * The task is being dequeued due to a property change (e.g., + * sched_setaffinity(), sched_setscheduler(), set_user_nice(), + * etc.). + */ + SCX_DEQ_SCHED_CHANGE = 1LLU << 33, }; enum scx_pick_idle_cpu_flags { diff --git a/tools/sched_ext/include/scx/enum_defs.autogen.h b/tools/sched_ext/include/scx/enum_defs.autogen.h index c2c33df9292c..dcc945304760 100644 --- a/tools/sched_ext/include/scx/enum_defs.autogen.h +++ b/tools/sched_ext/include/scx/enum_defs.autogen.h @@ -21,6 +21,7 @@ #define HAVE_SCX_CPU_PREEMPT_UNKNOWN #define HAVE_SCX_DEQ_SLEEP #define HAVE_SCX_DEQ_CORE_SCHED_EXEC +#define HAVE_SCX_DEQ_SCHED_CHANGE #define HAVE_SCX_DSQ_FLAG_BUILTIN #define HAVE_SCX_DSQ_FLAG_LOCAL_ON #define HAVE_SCX_DSQ_INVALID diff --git a/tools/sched_ext/include/scx/enums.autogen.bpf.h b/tools/sched_ext/include/scx/enums.autogen.bpf.h index 2f8002bcc19a..5da50f937684 100644 --- a/tools/sched_ext/include/scx/enums.autogen.bpf.h +++ b/tools/sched_ext/include/scx/enums.autogen.bpf.h @@ -127,3 +127,5 @@ const volatile u64 __SCX_ENQ_CLEAR_OPSS __weak; const volatile u64 __SCX_ENQ_DSQ_PRIQ __weak; #define SCX_ENQ_DSQ_PRIQ __SCX_ENQ_DSQ_PRIQ +const volatile u64 __SCX_DEQ_SCHED_CHANGE __weak; +#define SCX_DEQ_SCHED_CHANGE __SCX_DEQ_SCHED_CHANGE diff --git a/tools/sched_ext/include/scx/enums.autogen.h b/tools/sched_ext/include/scx/enums.autogen.h index fedec938584b..fc9a7a4d9dea 100644 --- a/tools/sched_ext/include/scx/enums.autogen.h +++ b/tools/sched_ext/include/scx/enums.autogen.h @@ -46,4 +46,5 @@ SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_LAST); \ SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_CLEAR_OPSS); \ SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_DSQ_PRIQ); \ + SCX_ENUM_SET(skel, scx_deq_flags, SCX_DEQ_SCHED_CHANGE); \ } while (0) -- cgit v1.2.3 From baff45179e90276a14acb9dffce17ff517708453 Mon Sep 17 00:00:00 2001 From: Jishnu Prakash Date: Fri, 30 Jan 2026 17:24:20 +0530 Subject: iio: adc: Add support for QCOM PMIC5 Gen3 ADC The ADC architecture on PMIC5 Gen3 is similar to that on PMIC5 Gen2, with all SW communication to ADC going through PMK8550 which communicates with other PMICs through PBS. One major difference is that the register interface used here is that of an SDAM (Shared Direct Access Memory) peripheral present on PMK8550. There may be more than one SDAM used for ADC5 Gen3 and each has eight channels, which may be used for either immediate reads (same functionality as previous PMIC5 and PMIC5 Gen2 ADC peripherals) or recurring measurements (same as ADC_TM functionality). By convention, we reserve the first channel of the first SDAM for all immediate reads and use the remaining channels across all SDAMs for ADC_TM monitoring functionality. Add support for PMIC5 Gen3 ADC driver for immediate read functionality. ADC_TM is implemented as an auxiliary thermal driver under this ADC driver. Signed-off-by: Jishnu Prakash Signed-off-by: Jonathan Cameron --- drivers/iio/adc/Kconfig | 26 + drivers/iio/adc/Makefile | 1 + drivers/iio/adc/qcom-spmi-adc5-gen3.c | 860 ++++++++++++++++++++++++++ include/linux/iio/adc/qcom-adc5-gen3-common.h | 211 +++++++ 4 files changed, 1098 insertions(+) create mode 100644 drivers/iio/adc/qcom-spmi-adc5-gen3.c create mode 100644 include/linux/iio/adc/qcom-adc5-gen3-common.h (limited to 'include/linux') diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig index 60038ae8dfc4..1f5915dd192d 100644 --- a/drivers/iio/adc/Kconfig +++ b/drivers/iio/adc/Kconfig @@ -1366,6 +1366,32 @@ config QCOM_SPMI_ADC5 To compile this driver as a module, choose M here: the module will be called qcom-spmi-adc5. +config QCOM_SPMI_ADC5_GEN3 + tristate "Qualcomm Technologies Inc. SPMI PMIC5 GEN3 ADC" + depends on SPMI && THERMAL + select REGMAP_SPMI + select QCOM_VADC_COMMON + select AUXILIARY_BUS + help + IIO Voltage PMIC5 Gen3 ADC driver for Qualcomm Technologies Inc. + + The driver supports reading multiple channels. The ADC is a 16-bit + sigma-delta ADC. The hardware supports calibrated results for + conversion requests and clients include reading phone power supply + voltage, on board system thermistors connected to the PMIC ADC, + PMIC die temperature, charger temperature, battery current, USB + voltage input and voltage signals connected to supported PMIC GPIO + pins. The hardware supports internal pull-up for thermistors and can + choose between a 30k, 100k or 400k ohm pull up using the ADC channels. + + In addition, the same driver supports ADC thermal monitoring devices + too. They appear as thermal zones with multiple trip points. A thermal + client sets threshold temperature for both warm and cool trips and + gets updated when a threshold is reached. + + To compile this driver as a module, choose M here: the module will + be called qcom-spmi-adc5-gen3. + config RCAR_GYRO_ADC tristate "Renesas R-Car GyroADC driver" depends on ARCH_RCAR_GEN2 || COMPILE_TEST diff --git a/drivers/iio/adc/Makefile b/drivers/iio/adc/Makefile index c76550415ff1..097357d146ba 100644 --- a/drivers/iio/adc/Makefile +++ b/drivers/iio/adc/Makefile @@ -116,6 +116,7 @@ obj-$(CONFIG_PAC1934) += pac1934.o obj-$(CONFIG_PALMAS_GPADC) += palmas_gpadc.o obj-$(CONFIG_QCOM_PM8XXX_XOADC) += qcom-pm8xxx-xoadc.o obj-$(CONFIG_QCOM_SPMI_ADC5) += qcom-spmi-adc5.o +obj-$(CONFIG_QCOM_SPMI_ADC5_GEN3) += qcom-spmi-adc5-gen3.o obj-$(CONFIG_QCOM_SPMI_IADC) += qcom-spmi-iadc.o obj-$(CONFIG_QCOM_SPMI_RRADC) += qcom-spmi-rradc.o obj-$(CONFIG_QCOM_SPMI_VADC) += qcom-spmi-vadc.o diff --git a/drivers/iio/adc/qcom-spmi-adc5-gen3.c b/drivers/iio/adc/qcom-spmi-adc5-gen3.c new file mode 100644 index 000000000000..f8168a14b907 --- /dev/null +++ b/drivers/iio/adc/qcom-spmi-adc5-gen3.c @@ -0,0 +1,860 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ADC5_GEN3_VADC_SDAM 0x0 + +struct adc5_chip; + +/** + * struct adc5_channel_prop - ADC channel structure + * @common_props: structure with ADC channel properties (common to TM usage). + * @adc_tm: indicates TM type if the channel is used for TM measurements. + * @chip: pointer to top-level ADC device structure. + */ +struct adc5_channel_prop { + struct adc5_channel_common_prop common_props; + int adc_tm; + struct adc5_chip *chip; +}; + +/** + * struct adc5_chip - ADC private structure. + * @dev: SPMI ADC5 Gen3 device. + * @dev_data: Top-level ADC device data. + * @nchannels: number of ADC channels. + * @chan_props: array of ADC channel properties. + * @iio_chans: array of IIO channels specification. + * @complete: ADC result notification after interrupt is received. + * @lock: ADC lock for access to the peripheral, to prevent concurrent + * requests from multiple clients. + * @data: software configuration data. + * @n_tm_channels: number of ADC channels used for TM measurements. + * @handler: TM callback to be called for threshold violation interrupt + * on first SDAM. + * @tm_aux: pointer to auxiliary TM device. + */ +struct adc5_chip { + struct device *dev; + struct adc5_device_data dev_data; + unsigned int nchannels; + struct adc5_channel_prop *chan_props; + struct iio_chan_spec *iio_chans; + struct completion complete; + struct mutex lock; + const struct adc5_data *data; + unsigned int n_tm_channels; + void (*handler)(struct auxiliary_device *tm_aux); + struct auxiliary_device *tm_aux; +}; + +int adc5_gen3_read(struct adc5_device_data *adc, unsigned int sdam_index, + u16 offset, u8 *data, int len) +{ + return regmap_bulk_read(adc->regmap, + adc->base[sdam_index].base_addr + offset, + data, len); +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_read, "QCOM_SPMI_ADC5_GEN3"); + +int adc5_gen3_write(struct adc5_device_data *adc, unsigned int sdam_index, + u16 offset, u8 *data, int len) +{ + return regmap_bulk_write(adc->regmap, + adc->base[sdam_index].base_addr + offset, + data, len); +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_write, "QCOM_SPMI_ADC5_GEN3"); + +static int adc5_gen3_read_voltage_data(struct adc5_chip *adc, u16 *data) +{ + u8 rslt[2]; + int ret; + + ret = adc5_gen3_read(&adc->dev_data, ADC5_GEN3_VADC_SDAM, + ADC5_GEN3_CH_DATA0(0), rslt, sizeof(rslt)); + if (ret) + return ret; + + *data = get_unaligned_le16(rslt); + + if (*data == ADC5_USR_DATA_CHECK) { + dev_err(adc->dev, "Invalid data:%#x\n", *data); + return -EINVAL; + } + + dev_dbg(adc->dev, "voltage raw code:%#x\n", *data); + + return 0; +} + +void adc5_gen3_update_dig_param(struct adc5_channel_common_prop *prop, u8 *data) +{ + /* Update calibration select and decimation ratio select */ + *data &= ~(ADC5_GEN3_DIG_PARAM_CAL_SEL_MASK | ADC5_GEN3_DIG_PARAM_DEC_RATIO_SEL_MASK); + *data |= FIELD_PREP(ADC5_GEN3_DIG_PARAM_CAL_SEL_MASK, prop->cal_method); + *data |= FIELD_PREP(ADC5_GEN3_DIG_PARAM_DEC_RATIO_SEL_MASK, prop->decimation); +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_update_dig_param, "QCOM_SPMI_ADC5_GEN3"); + +#define ADC5_GEN3_READ_CONFIG_REGS 7 + +static int adc5_gen3_configure(struct adc5_chip *adc, + struct adc5_channel_common_prop *prop) +{ + u8 buf[ADC5_GEN3_READ_CONFIG_REGS]; + u8 conv_req = 0; + int ret; + + ret = adc5_gen3_read(&adc->dev_data, ADC5_GEN3_VADC_SDAM, ADC5_GEN3_SID, + buf, sizeof(buf)); + if (ret) + return ret; + + /* Write SID */ + buf[0] = FIELD_PREP(ADC5_GEN3_SID_MASK, prop->sid); + + /* + * Use channel 0 by default for immediate conversion and to indicate + * there is an actual conversion request + */ + buf[1] = ADC5_GEN3_CHAN_CONV_REQ | 0; + + buf[2] = ADC5_GEN3_TIME_IMMEDIATE; + + /* Digital param selection */ + adc5_gen3_update_dig_param(prop, &buf[3]); + + /* Update fast average sample value */ + buf[4] = FIELD_PREP(ADC5_GEN3_FAST_AVG_CTL_SAMPLES_MASK, + prop->avg_samples) | ADC5_GEN3_FAST_AVG_CTL_EN; + + /* Select ADC channel */ + buf[5] = prop->channel; + + /* Select HW settle delay for channel */ + buf[6] = FIELD_PREP(ADC5_GEN3_HW_SETTLE_DELAY_MASK, + prop->hw_settle_time_us); + + reinit_completion(&adc->complete); + + ret = adc5_gen3_write(&adc->dev_data, ADC5_GEN3_VADC_SDAM, ADC5_GEN3_SID, + buf, sizeof(buf)); + if (ret) + return ret; + + conv_req = ADC5_GEN3_CONV_REQ_REQ; + return adc5_gen3_write(&adc->dev_data, ADC5_GEN3_VADC_SDAM, + ADC5_GEN3_CONV_REQ, &conv_req, sizeof(conv_req)); +} + +/* + * Worst case delay from PBS in readying handshake bit can be up to 15ms, when + * PBS is busy running other simultaneous transactions, while in the best case, + * it is already ready at this point. Assigning polling delay and retry count + * accordingly. + */ + +#define ADC5_GEN3_HS_DELAY_US 100 +#define ADC5_GEN3_HS_RETRY_COUNT 150 + +int adc5_gen3_poll_wait_hs(struct adc5_device_data *adc, + unsigned int sdam_index) +{ + u8 conv_req = ADC5_GEN3_CONV_REQ_REQ; + int ret, count; + u8 status = 0; + + for (count = 0; count < ADC5_GEN3_HS_RETRY_COUNT; count++) { + ret = adc5_gen3_read(adc, sdam_index, ADC5_GEN3_HS, &status, sizeof(status)); + if (ret) + return ret; + + if (status == ADC5_GEN3_HS_READY) { + ret = adc5_gen3_read(adc, sdam_index, ADC5_GEN3_CONV_REQ, + &conv_req, sizeof(conv_req)); + if (ret) + return ret; + + if (!conv_req) + return 0; + } + + fsleep(ADC5_GEN3_HS_DELAY_US); + } + + pr_err("Setting HS ready bit timed out, sdam_index:%d, status:%#x\n", + sdam_index, status); + return -ETIMEDOUT; +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_poll_wait_hs, "QCOM_SPMI_ADC5_GEN3"); + +int adc5_gen3_status_clear(struct adc5_device_data *adc, + int sdam_index, u16 offset, u8 *val, int len) +{ + u8 value; + int ret; + + ret = adc5_gen3_write(adc, sdam_index, offset, val, len); + if (ret) + return ret; + + /* To indicate conversion request is only to clear a status */ + value = 0; + ret = adc5_gen3_write(adc, sdam_index, ADC5_GEN3_PERPH_CH, &value, + sizeof(value)); + if (ret) + return ret; + + value = ADC5_GEN3_CONV_REQ_REQ; + return adc5_gen3_write(adc, sdam_index, ADC5_GEN3_CONV_REQ, &value, + sizeof(value)); +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_status_clear, "QCOM_SPMI_ADC5_GEN3"); + +/* + * Worst case delay from PBS for conversion time can be up to 500ms, when PBS + * has timed out twice, once for the initial attempt and once for a retry of + * the same transaction. + */ + +#define ADC5_GEN3_CONV_TIMEOUT_MS 501 + +static int adc5_gen3_do_conversion(struct adc5_chip *adc, + struct adc5_channel_common_prop *prop, + u16 *data_volt) +{ + unsigned long rc; + int ret; + u8 val; + + guard(mutex)(&adc->lock); + ret = adc5_gen3_poll_wait_hs(&adc->dev_data, ADC5_GEN3_VADC_SDAM); + if (ret) + return ret; + + ret = adc5_gen3_configure(adc, prop); + if (ret) { + dev_err(adc->dev, "ADC configure failed with %d\n", ret); + return ret; + } + + /* No support for polling mode at present */ + rc = wait_for_completion_timeout(&adc->complete, + msecs_to_jiffies(ADC5_GEN3_CONV_TIMEOUT_MS)); + if (!rc) { + dev_err(adc->dev, "Reading ADC channel %s timed out\n", + prop->label); + return -ETIMEDOUT; + } + + ret = adc5_gen3_read_voltage_data(adc, data_volt); + if (ret) + return ret; + + val = BIT(0); + return adc5_gen3_status_clear(&adc->dev_data, ADC5_GEN3_VADC_SDAM, + ADC5_GEN3_EOC_CLR, &val, 1); +} + +static irqreturn_t adc5_gen3_isr(int irq, void *dev_id) +{ + struct adc5_chip *adc = dev_id; + struct device *dev = adc->dev; + struct auxiliary_device *adev; + u8 status, eoc_status, val; + u8 tm_status[2]; + int ret; + + ret = adc5_gen3_read(&adc->dev_data, ADC5_GEN3_VADC_SDAM, + ADC5_GEN3_STATUS1, &status, sizeof(status)); + if (ret) { + dev_err(dev, "adc read status1 failed with %d\n", ret); + return IRQ_HANDLED; + } + + ret = adc5_gen3_read(&adc->dev_data, ADC5_GEN3_VADC_SDAM, + ADC5_GEN3_EOC_STS, &eoc_status, sizeof(eoc_status)); + if (ret) { + dev_err(dev, "adc read eoc status failed with %d\n", ret); + return IRQ_HANDLED; + } + + if (status & ADC5_GEN3_STATUS1_CONV_FAULT) { + dev_err_ratelimited(dev, + "Unexpected conversion fault, status:%#x, eoc_status:%#x\n", + status, eoc_status); + val = ADC5_GEN3_CONV_ERR_CLR_REQ; + adc5_gen3_status_clear(&adc->dev_data, ADC5_GEN3_VADC_SDAM, + ADC5_GEN3_CONV_ERR_CLR, &val, 1); + return IRQ_HANDLED; + } + + /* CHAN0 is the preconfigured channel for immediate conversion */ + if (eoc_status & ADC5_GEN3_EOC_CHAN_0) + complete(&adc->complete); + + ret = adc5_gen3_read(&adc->dev_data, ADC5_GEN3_VADC_SDAM, + ADC5_GEN3_TM_HIGH_STS, tm_status, sizeof(tm_status)); + if (ret) { + dev_err(dev, "adc read TM status failed with %d\n", ret); + return IRQ_HANDLED; + } + + dev_dbg(dev, "Interrupt status:%#x, EOC status:%#x, high:%#x, low:%#x\n", + status, eoc_status, tm_status[0], tm_status[1]); + + if (tm_status[0] || tm_status[1]) { + adev = adc->tm_aux; + if (!adev || !adev->dev.driver) { + dev_err(dev, "adc_tm auxiliary device not initialized\n"); + return IRQ_HANDLED; + } + + adc->handler(adev); + } + + return IRQ_HANDLED; +} + +static int adc5_gen3_fwnode_xlate(struct iio_dev *indio_dev, + const struct fwnode_reference_args *iiospec) +{ + struct adc5_chip *adc = iio_priv(indio_dev); + int i, v_channel; + + for (i = 0; i < adc->nchannels; i++) { + v_channel = ADC5_GEN3_V_CHAN(adc->chan_props[i].common_props); + if (v_channel == iiospec->args[0]) + return i; + } + + return -ENOENT; +} + +static int adc5_gen3_read_raw(struct iio_dev *indio_dev, + struct iio_chan_spec const *chan, int *val, + int *val2, long mask) +{ + struct adc5_chip *adc = iio_priv(indio_dev); + struct adc5_channel_common_prop *prop; + u16 adc_code_volt; + int ret; + + prop = &adc->chan_props[chan->address].common_props; + + switch (mask) { + case IIO_CHAN_INFO_PROCESSED: + ret = adc5_gen3_do_conversion(adc, prop, &adc_code_volt); + if (ret) + return ret; + + ret = qcom_adc5_hw_scale(prop->scale_fn_type, prop->prescale, + adc->data, adc_code_volt, val); + if (ret) + return ret; + + return IIO_VAL_INT; + default: + return -EINVAL; + } +} + +static int adc5_gen3_read_label(struct iio_dev *indio_dev, + const struct iio_chan_spec *chan, char *label) +{ + struct adc5_chip *adc = iio_priv(indio_dev); + struct adc5_channel_prop *prop; + + prop = &adc->chan_props[chan->address]; + return sprintf(label, "%s\n", prop->common_props.label); +} + +static const struct iio_info adc5_gen3_info = { + .read_raw = adc5_gen3_read_raw, + .read_label = adc5_gen3_read_label, + .fwnode_xlate = adc5_gen3_fwnode_xlate, +}; + +struct adc5_channels { + unsigned int prescale_index; + enum iio_chan_type type; + long info_mask; + enum vadc_scale_fn_type scale_fn_type; +}; + +/* In these definitions, _pre refers to an index into adc5_prescale_ratios. */ +#define ADC5_CHAN(_type, _mask, _pre, _scale) \ + { \ + .prescale_index = _pre, \ + .type = _type, \ + .info_mask = _mask, \ + .scale_fn_type = _scale, \ + }, \ + +#define ADC5_CHAN_TEMP(_pre, _scale) \ + ADC5_CHAN(IIO_TEMP, BIT(IIO_CHAN_INFO_PROCESSED), _pre, _scale) \ + +#define ADC5_CHAN_VOLT(_pre, _scale) \ + ADC5_CHAN(IIO_VOLTAGE, BIT(IIO_CHAN_INFO_PROCESSED), _pre, _scale) \ + +#define ADC5_CHAN_CUR(_pre, _scale) \ + ADC5_CHAN(IIO_CURRENT, BIT(IIO_CHAN_INFO_PROCESSED), _pre, _scale) \ + +static const struct adc5_channels adc5_gen3_chans_pmic[ADC5_MAX_CHANNEL] = { + [ADC5_GEN3_REF_GND] = ADC5_CHAN_VOLT(0, SCALE_HW_CALIB_DEFAULT) + [ADC5_GEN3_1P25VREF] = ADC5_CHAN_VOLT(0, SCALE_HW_CALIB_DEFAULT) + [ADC5_GEN3_VPH_PWR] = ADC5_CHAN_VOLT(1, SCALE_HW_CALIB_DEFAULT) + [ADC5_GEN3_VBAT_SNS_QBG] = ADC5_CHAN_VOLT(1, SCALE_HW_CALIB_DEFAULT) + [ADC5_GEN3_USB_SNS_V_16] = ADC5_CHAN_TEMP(8, SCALE_HW_CALIB_DEFAULT) + [ADC5_GEN3_VIN_DIV16_MUX] = ADC5_CHAN_TEMP(8, SCALE_HW_CALIB_DEFAULT) + [ADC5_GEN3_DIE_TEMP] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_PMIC_THERM_PM7) + [ADC5_GEN3_AMUX1_THM_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) + [ADC5_GEN3_AMUX2_THM_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) + [ADC5_GEN3_AMUX3_THM_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) + [ADC5_GEN3_AMUX4_THM_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) + [ADC5_GEN3_AMUX5_THM_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) + [ADC5_GEN3_AMUX6_THM_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) + [ADC5_GEN3_AMUX1_GPIO_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) + [ADC5_GEN3_AMUX2_GPIO_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) + [ADC5_GEN3_AMUX3_GPIO_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) + [ADC5_GEN3_AMUX4_GPIO_100K_PU] = ADC5_CHAN_TEMP(0, + SCALE_HW_CALIB_THERM_100K_PU_PM7) +}; + +static int adc5_gen3_get_fw_channel_data(struct adc5_chip *adc, + struct adc5_channel_prop *prop, + struct fwnode_handle *fwnode) +{ + const char *name = fwnode_get_name(fwnode); + const struct adc5_data *data = adc->data; + struct device *dev = adc->dev; + const char *channel_name; + u32 chan, value, sid; + u32 varr[2]; + int ret; + + ret = fwnode_property_read_u32(fwnode, "reg", &chan); + if (ret < 0) + return dev_err_probe(dev, ret, "invalid channel number %s\n", + name); + + /* + * Value read from "reg" is virtual channel number + * virtual channel number = sid << 8 | channel number + */ + sid = FIELD_GET(ADC5_GEN3_VIRTUAL_SID_MASK, chan); + chan = FIELD_GET(ADC5_GEN3_CHANNEL_MASK, chan); + + if (chan > ADC5_MAX_CHANNEL) + return dev_err_probe(dev, -EINVAL, + "%s invalid channel number %d\n", + name, chan); + + prop->common_props.channel = chan; + prop->common_props.sid = sid; + + if (!adc->data->adc_chans[chan].info_mask) + return dev_err_probe(dev, -EINVAL, "Channel %#x not supported\n", chan); + + channel_name = name; + fwnode_property_read_string(fwnode, "label", &channel_name); + prop->common_props.label = channel_name; + + value = data->decimation[ADC5_DECIMATION_DEFAULT]; + fwnode_property_read_u32(fwnode, "qcom,decimation", &value); + ret = qcom_adc5_decimation_from_dt(value, data->decimation); + if (ret < 0) + return dev_err_probe(dev, ret, "%#x invalid decimation %d\n", + chan, value); + prop->common_props.decimation = ret; + + prop->common_props.prescale = adc->data->adc_chans[chan].prescale_index; + ret = fwnode_property_read_u32_array(fwnode, "qcom,pre-scaling", varr, 2); + if (!ret) { + ret = qcom_adc5_prescaling_from_dt(varr[0], varr[1]); + if (ret < 0) + return dev_err_probe(dev, ret, + "%#x invalid pre-scaling <%d %d>\n", + chan, varr[0], varr[1]); + prop->common_props.prescale = ret; + } + + value = data->hw_settle_1[VADC_DEF_HW_SETTLE_TIME]; + fwnode_property_read_u32(fwnode, "qcom,hw-settle-time", &value); + ret = qcom_adc5_hw_settle_time_from_dt(value, data->hw_settle_1); + if (ret < 0) + return dev_err_probe(dev, ret, + "%#x invalid hw-settle-time %d us\n", + chan, value); + prop->common_props.hw_settle_time_us = ret; + + value = BIT(VADC_DEF_AVG_SAMPLES); + fwnode_property_read_u32(fwnode, "qcom,avg-samples", &value); + ret = qcom_adc5_avg_samples_from_dt(value); + if (ret < 0) + return dev_err_probe(dev, ret, "%#x invalid avg-samples %d\n", + chan, value); + prop->common_props.avg_samples = ret; + + if (fwnode_property_read_bool(fwnode, "qcom,ratiometric")) + prop->common_props.cal_method = ADC5_RATIOMETRIC_CAL; + else + prop->common_props.cal_method = ADC5_ABSOLUTE_CAL; + + prop->adc_tm = fwnode_property_read_bool(fwnode, "qcom,adc-tm"); + if (prop->adc_tm) { + adc->n_tm_channels++; + if (adc->n_tm_channels > (adc->dev_data.num_sdams * 8 - 1)) + return dev_err_probe(dev, -EINVAL, + "Number of TM nodes %u greater than channels supported:%u\n", + adc->n_tm_channels, + adc->dev_data.num_sdams * 8 - 1); + } + + return 0; +} + +static const struct adc5_data adc5_gen3_data_pmic = { + .full_scale_code_volt = 0x70e4, + .adc_chans = adc5_gen3_chans_pmic, + .info = &adc5_gen3_info, + .decimation = (unsigned int [ADC5_DECIMATION_SAMPLES_MAX]) + { 85, 340, 1360 }, + .hw_settle_1 = (unsigned int [VADC_HW_SETTLE_SAMPLES_MAX]) + { 15, 100, 200, 300, + 400, 500, 600, 700, + 1000, 2000, 4000, 8000, + 16000, 32000, 64000, 128000 }, +}; + +static const struct of_device_id adc5_match_table[] = { + { + .compatible = "qcom,spmi-adc5-gen3", + .data = &adc5_gen3_data_pmic, + }, + { } +}; +MODULE_DEVICE_TABLE(of, adc5_match_table); + +static int adc5_get_fw_data(struct adc5_chip *adc) +{ + const struct adc5_channels *adc_chan; + struct adc5_channel_prop *chan_props; + struct iio_chan_spec *iio_chan; + struct device *dev = adc->dev; + unsigned int index = 0; + int ret; + + adc->nchannels = device_get_child_node_count(dev); + if (!adc->nchannels) + return dev_err_probe(dev, -EINVAL, "No ADC channels found\n"); + + adc->iio_chans = devm_kcalloc(dev, adc->nchannels, + sizeof(*adc->iio_chans), GFP_KERNEL); + if (!adc->iio_chans) + return -ENOMEM; + + adc->chan_props = devm_kcalloc(dev, adc->nchannels, + sizeof(*adc->chan_props), GFP_KERNEL); + if (!adc->chan_props) + return -ENOMEM; + + chan_props = adc->chan_props; + adc->n_tm_channels = 0; + iio_chan = adc->iio_chans; + adc->data = device_get_match_data(dev); + + device_for_each_child_node_scoped(dev, child) { + ret = adc5_gen3_get_fw_channel_data(adc, chan_props, child); + if (ret) + return ret; + + chan_props->chip = adc; + adc_chan = &adc->data->adc_chans[chan_props->common_props.channel]; + chan_props->common_props.scale_fn_type = adc_chan->scale_fn_type; + + iio_chan->channel = ADC5_GEN3_V_CHAN(chan_props->common_props); + iio_chan->info_mask_separate = adc_chan->info_mask; + iio_chan->type = adc_chan->type; + iio_chan->address = index; + iio_chan->indexed = 1; + iio_chan++; + chan_props++; + index++; + } + + return 0; +} + +static void adc5_gen3_uninit_aux(void *data) +{ + auxiliary_device_uninit(data); +} + +static void adc5_gen3_delete_aux(void *data) +{ + auxiliary_device_delete(data); +} + +static void adc5_gen3_aux_device_release(struct device *dev) {} + +static int adc5_gen3_add_aux_tm_device(struct adc5_chip *adc) +{ + struct tm5_aux_dev_wrapper *aux_device; + int i, ret, i_tm = 0; + + aux_device = devm_kzalloc(adc->dev, sizeof(*aux_device), GFP_KERNEL); + if (!aux_device) + return -ENOMEM; + + aux_device->aux_dev.name = "adc5_tm_gen3"; + aux_device->aux_dev.dev.parent = adc->dev; + aux_device->aux_dev.dev.release = adc5_gen3_aux_device_release; + + aux_device->tm_props = devm_kcalloc(adc->dev, adc->n_tm_channels, + sizeof(*aux_device->tm_props), + GFP_KERNEL); + if (!aux_device->tm_props) + return -ENOMEM; + + aux_device->dev_data = &adc->dev_data; + + for (i = 0; i < adc->nchannels; i++) { + if (!adc->chan_props[i].adc_tm) + continue; + aux_device->tm_props[i_tm] = adc->chan_props[i].common_props; + i_tm++; + } + + device_set_of_node_from_dev(&aux_device->aux_dev.dev, adc->dev); + + aux_device->n_tm_channels = adc->n_tm_channels; + + ret = auxiliary_device_init(&aux_device->aux_dev); + if (ret) + return ret; + + ret = devm_add_action_or_reset(adc->dev, adc5_gen3_uninit_aux, + &aux_device->aux_dev); + if (ret) + return ret; + + ret = auxiliary_device_add(&aux_device->aux_dev); + if (ret) + return ret; + ret = devm_add_action_or_reset(adc->dev, adc5_gen3_delete_aux, + &aux_device->aux_dev); + if (ret) + return ret; + + adc->tm_aux = &aux_device->aux_dev; + + return 0; +} + +void adc5_gen3_mutex_lock(struct device *dev) + __acquires(&adc->lock) +{ + struct iio_dev *indio_dev = dev_get_drvdata(dev->parent); + struct adc5_chip *adc = iio_priv(indio_dev); + + mutex_lock(&adc->lock); +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_mutex_lock, "QCOM_SPMI_ADC5_GEN3"); + +void adc5_gen3_mutex_unlock(struct device *dev) + __releases(&adc->lock) +{ + struct iio_dev *indio_dev = dev_get_drvdata(dev->parent); + struct adc5_chip *adc = iio_priv(indio_dev); + + mutex_unlock(&adc->lock); +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_mutex_unlock, "QCOM_SPMI_ADC5_GEN3"); + +int adc5_gen3_get_scaled_reading(struct device *dev, + struct adc5_channel_common_prop *common_props, + int *val) +{ + struct iio_dev *indio_dev = dev_get_drvdata(dev->parent); + struct adc5_chip *adc = iio_priv(indio_dev); + u16 adc_code_volt; + int ret; + + ret = adc5_gen3_do_conversion(adc, common_props, &adc_code_volt); + if (ret) + return ret; + + return qcom_adc5_hw_scale(common_props->scale_fn_type, + common_props->prescale, + adc->data, adc_code_volt, val); +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_get_scaled_reading, "QCOM_SPMI_ADC5_GEN3"); + +int adc5_gen3_therm_code_to_temp(struct device *dev, + struct adc5_channel_common_prop *common_props, + u16 code, int *val) +{ + struct iio_dev *indio_dev = dev_get_drvdata(dev->parent); + struct adc5_chip *adc = iio_priv(indio_dev); + + return qcom_adc5_hw_scale(common_props->scale_fn_type, + common_props->prescale, + adc->data, code, val); +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_therm_code_to_temp, "QCOM_SPMI_ADC5_GEN3"); + +void adc5_gen3_register_tm_event_notifier(struct device *dev, + void (*handler)(struct auxiliary_device *)) +{ + struct iio_dev *indio_dev = dev_get_drvdata(dev->parent); + struct adc5_chip *adc = iio_priv(indio_dev); + + adc->handler = handler; +} +EXPORT_SYMBOL_NS_GPL(adc5_gen3_register_tm_event_notifier, "QCOM_SPMI_ADC5_GEN3"); + +static int adc5_gen3_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct iio_dev *indio_dev; + struct adc5_chip *adc; + struct regmap *regmap; + int ret, i; + u32 *reg; + + regmap = dev_get_regmap(dev->parent, NULL); + if (!regmap) + return -ENODEV; + + indio_dev = devm_iio_device_alloc(dev, sizeof(*adc)); + if (!indio_dev) + return -ENOMEM; + + adc = iio_priv(indio_dev); + adc->dev_data.regmap = regmap; + adc->dev = dev; + + ret = device_property_count_u32(dev, "reg"); + if (ret < 0) + return ret; + + adc->dev_data.num_sdams = ret; + + reg = devm_kcalloc(dev, adc->dev_data.num_sdams, sizeof(u32), + GFP_KERNEL); + if (!reg) + return -ENOMEM; + + ret = device_property_read_u32_array(dev, "reg", reg, + adc->dev_data.num_sdams); + if (ret) + return dev_err_probe(dev, ret, + "Failed to read reg property\n"); + + adc->dev_data.base = devm_kcalloc(dev, adc->dev_data.num_sdams, + sizeof(*adc->dev_data.base), + GFP_KERNEL); + if (!adc->dev_data.base) + return -ENOMEM; + + platform_set_drvdata(pdev, indio_dev); + init_completion(&adc->complete); + ret = devm_mutex_init(dev, &adc->lock); + if (ret) + return ret; + + for (i = 0; i < adc->dev_data.num_sdams; i++) { + adc->dev_data.base[i].base_addr = reg[i]; + + ret = platform_get_irq(pdev, i); + if (ret < 0) + return dev_err_probe(dev, ret, + "Getting IRQ %d failed\n", i); + + adc->dev_data.base[i].irq = ret; + + adc->dev_data.base[i].irq_name = devm_kasprintf(dev, GFP_KERNEL, + "sdam%d", i); + if (!adc->dev_data.base[i].irq_name) + return -ENOMEM; + } + + ret = devm_request_irq(dev, adc->dev_data.base[ADC5_GEN3_VADC_SDAM].irq, + adc5_gen3_isr, 0, + adc->dev_data.base[ADC5_GEN3_VADC_SDAM].irq_name, + adc); + if (ret) + return dev_err_probe(dev, ret, + "Failed to request SDAM%d irq\n", + ADC5_GEN3_VADC_SDAM); + + ret = adc5_get_fw_data(adc); + if (ret) + return ret; + + if (adc->n_tm_channels > 0) { + ret = adc5_gen3_add_aux_tm_device(adc); + if (ret) + dev_err_probe(dev, ret, + "Failed to add auxiliary TM device\n"); + } + + indio_dev->name = "spmi-adc5-gen3"; + indio_dev->modes = INDIO_DIRECT_MODE; + indio_dev->info = &adc5_gen3_info; + indio_dev->channels = adc->iio_chans; + indio_dev->num_channels = adc->nchannels; + + return devm_iio_device_register(dev, indio_dev); +} + +static struct platform_driver adc5_gen3_driver = { + .driver = { + .name = "qcom-spmi-adc5-gen3", + .of_match_table = adc5_match_table, + }, + .probe = adc5_gen3_probe, +}; +module_platform_driver(adc5_gen3_driver); + +MODULE_DESCRIPTION("Qualcomm Technologies Inc. PMIC5 Gen3 ADC driver"); +MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS("QCOM_SPMI_ADC5_GEN3"); diff --git a/include/linux/iio/adc/qcom-adc5-gen3-common.h b/include/linux/iio/adc/qcom-adc5-gen3-common.h new file mode 100644 index 000000000000..6303eaa6640b --- /dev/null +++ b/include/linux/iio/adc/qcom-adc5-gen3-common.h @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + * + * Code used in the main and auxiliary Qualcomm PMIC voltage ADCs + * of type ADC5 Gen3. + */ + +#ifndef QCOM_ADC5_GEN3_COMMON_H +#define QCOM_ADC5_GEN3_COMMON_H + +#include +#include +#include +#include +#include +#include +#include + +#define ADC5_GEN3_HS 0x45 +#define ADC5_GEN3_HS_BUSY BIT(7) +#define ADC5_GEN3_HS_READY BIT(0) + +#define ADC5_GEN3_STATUS1 0x46 +#define ADC5_GEN3_STATUS1_CONV_FAULT BIT(7) +#define ADC5_GEN3_STATUS1_THR_CROSS BIT(6) +#define ADC5_GEN3_STATUS1_EOC BIT(0) + +#define ADC5_GEN3_TM_EN_STS 0x47 +#define ADC5_GEN3_TM_HIGH_STS 0x48 +#define ADC5_GEN3_TM_LOW_STS 0x49 + +#define ADC5_GEN3_EOC_STS 0x4a +#define ADC5_GEN3_EOC_CHAN_0 BIT(0) + +#define ADC5_GEN3_EOC_CLR 0x4b +#define ADC5_GEN3_TM_HIGH_STS_CLR 0x4c +#define ADC5_GEN3_TM_LOW_STS_CLR 0x4d +#define ADC5_GEN3_CONV_ERR_CLR 0x4e +#define ADC5_GEN3_CONV_ERR_CLR_REQ BIT(0) + +#define ADC5_GEN3_SID 0x4f +#define ADC5_GEN3_SID_MASK GENMASK(3, 0) + +#define ADC5_GEN3_PERPH_CH 0x50 +#define ADC5_GEN3_CHAN_CONV_REQ BIT(7) + +#define ADC5_GEN3_TIMER_SEL 0x51 +#define ADC5_GEN3_TIME_IMMEDIATE 0x1 + +#define ADC5_GEN3_DIG_PARAM 0x52 +#define ADC5_GEN3_DIG_PARAM_CAL_SEL_MASK GENMASK(5, 4) +#define ADC5_GEN3_DIG_PARAM_DEC_RATIO_SEL_MASK GENMASK(3, 2) + +#define ADC5_GEN3_FAST_AVG 0x53 +#define ADC5_GEN3_FAST_AVG_CTL_EN BIT(7) +#define ADC5_GEN3_FAST_AVG_CTL_SAMPLES_MASK GENMASK(2, 0) + +#define ADC5_GEN3_ADC_CH_SEL_CTL 0x54 +#define ADC5_GEN3_DELAY_CTL 0x55 +#define ADC5_GEN3_HW_SETTLE_DELAY_MASK GENMASK(3, 0) + +#define ADC5_GEN3_CH_EN 0x56 +#define ADC5_GEN3_HIGH_THR_INT_EN BIT(1) +#define ADC5_GEN3_LOW_THR_INT_EN BIT(0) + +#define ADC5_GEN3_LOW_THR0 0x57 +#define ADC5_GEN3_LOW_THR1 0x58 +#define ADC5_GEN3_HIGH_THR0 0x59 +#define ADC5_GEN3_HIGH_THR1 0x5a + +#define ADC5_GEN3_CH_DATA0(channel) (0x5c + (channel) * 2) +#define ADC5_GEN3_CH_DATA1(channel) (0x5d + (channel) * 2) + +#define ADC5_GEN3_CONV_REQ 0xe5 +#define ADC5_GEN3_CONV_REQ_REQ BIT(0) + +#define ADC5_GEN3_VIRTUAL_SID_MASK GENMASK(15, 8) +#define ADC5_GEN3_CHANNEL_MASK GENMASK(7, 0) +#define ADC5_GEN3_V_CHAN(x) \ + (FIELD_PREP(ADC5_GEN3_VIRTUAL_SID_MASK, (x).sid) | (x).channel) + +/* ADC channels for PMIC5 Gen3 */ +#define ADC5_GEN3_REF_GND 0x00 +#define ADC5_GEN3_1P25VREF 0x01 +#define ADC5_GEN3_DIE_TEMP 0x03 +#define ADC5_GEN3_USB_SNS_V_16 0x11 +#define ADC5_GEN3_VIN_DIV16_MUX 0x12 +#define ADC5_GEN3_VPH_PWR 0x8e +#define ADC5_GEN3_VBAT_SNS_QBG 0x8f +/* 100k pull-up channels */ +#define ADC5_GEN3_AMUX1_THM_100K_PU 0x44 +#define ADC5_GEN3_AMUX2_THM_100K_PU 0x45 +#define ADC5_GEN3_AMUX3_THM_100K_PU 0x46 +#define ADC5_GEN3_AMUX4_THM_100K_PU 0x47 +#define ADC5_GEN3_AMUX5_THM_100K_PU 0x48 +#define ADC5_GEN3_AMUX6_THM_100K_PU 0x49 +#define ADC5_GEN3_AMUX1_GPIO_100K_PU 0x4a +#define ADC5_GEN3_AMUX2_GPIO_100K_PU 0x4b +#define ADC5_GEN3_AMUX3_GPIO_100K_PU 0x4c +#define ADC5_GEN3_AMUX4_GPIO_100K_PU 0x4d + +#define ADC5_MAX_CHANNEL 0xc0 + +enum adc5_cal_method { + ADC5_NO_CAL = 0, + ADC5_RATIOMETRIC_CAL, + ADC5_ABSOLUTE_CAL, +}; + +enum adc5_time_select { + MEAS_INT_DISABLE = 0, + MEAS_INT_IMMEDIATE, + MEAS_INT_50MS, + MEAS_INT_100MS, + MEAS_INT_1S, + MEAS_INT_NONE, +}; + +/** + * struct adc5_sdam_data - data per SDAM allocated for adc usage + * @base_addr: base address for the ADC SDAM peripheral. + * @irq_name: ADC IRQ name. + * @irq: ADC IRQ number. + */ +struct adc5_sdam_data { + u16 base_addr; + const char *irq_name; + int irq; +}; + +/** + * struct adc5_device_data - Top-level ADC device data + * @regmap: ADC peripheral register map field. + * @base: array of SDAM data. + * @num_sdams: number of ADC SDAM peripherals. + */ +struct adc5_device_data { + struct regmap *regmap; + struct adc5_sdam_data *base; + int num_sdams; +}; + +/** + * struct adc5_channel_common_prop - ADC channel properties (common to ADC and TM). + * @channel: channel number, refer to the channel list. + * @cal_method: calibration method. + * @decimation: sampling rate supported for the channel. + * @sid: ID of PMIC owning the channel. + * @label: Channel name used in device tree. + * @prescale: channel scaling performed on the input signal. + * @hw_settle_time_us: the time between AMUX being configured and the + * start of conversion in uS. + * @avg_samples: ability to provide single result from the ADC + * that is an average of multiple measurements. + * @scale_fn_type: Represents the scaling function to convert voltage + * physical units desired by the client for the channel. + */ +struct adc5_channel_common_prop { + unsigned int channel; + enum adc5_cal_method cal_method; + unsigned int decimation; + unsigned int sid; + const char *label; + unsigned int prescale; + unsigned int hw_settle_time_us; + unsigned int avg_samples; + enum vadc_scale_fn_type scale_fn_type; +}; + +/** + * struct tm5_aux_dev_wrapper - wrapper structure around TM auxiliary device + * @aux_dev: TM auxiliary device structure. + * @dev_data: Top-level ADC device data. + * @tm_props: Array of common ADC channel properties for TM channels. + * @n_tm_channels: number of TM channels. + */ +struct tm5_aux_dev_wrapper { + struct auxiliary_device aux_dev; + struct adc5_device_data *dev_data; + struct adc5_channel_common_prop *tm_props; + unsigned int n_tm_channels; +}; + +int adc5_gen3_read(struct adc5_device_data *adc, unsigned int sdam_index, + u16 offset, u8 *data, int len); + +int adc5_gen3_write(struct adc5_device_data *adc, unsigned int sdam_index, + u16 offset, u8 *data, int len); + +int adc5_gen3_poll_wait_hs(struct adc5_device_data *adc, + unsigned int sdam_index); + +void adc5_gen3_update_dig_param(struct adc5_channel_common_prop *prop, + u8 *data); + +int adc5_gen3_status_clear(struct adc5_device_data *adc, + int sdam_index, u16 offset, u8 *val, int len); + +void adc5_gen3_mutex_lock(struct device *dev); +void adc5_gen3_mutex_unlock(struct device *dev); +int adc5_gen3_get_scaled_reading(struct device *dev, + struct adc5_channel_common_prop *common_props, + int *val); +int adc5_gen3_therm_code_to_temp(struct device *dev, + struct adc5_channel_common_prop *common_props, + u16 code, int *val); +void adc5_gen3_register_tm_event_notifier(struct device *dev, + void (*handler)(struct auxiliary_device *)); + +#endif /* QCOM_ADC5_GEN3_COMMON_H */ -- cgit v1.2.3 From 3d35d41169d000f4fbf3c23999b8443e1173efce Mon Sep 17 00:00:00 2001 From: Fabio Baltieri Date: Mon, 23 Feb 2026 13:25:55 -0800 Subject: Input: export input_default_setkeycode Export input_default_setkeycode so that a driver can set a custom setkeycode handler to take some driver specific action but still call the default handler at some point. Signed-off-by: Fabio Baltieri Reviewed-by: Tzung-Bi Shih Link: https://patch.msgid.link/20260222003717.471977-1-dmitry.torokhov@gmail.com Signed-off-by: Dmitry Torokhov --- drivers/input/input.c | 23 ++++++++++++++++++++--- include/linux/input.h | 4 ++++ 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/input.c b/drivers/input/input.c index a500e1e276c2..c227eaa6271a 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -800,14 +800,30 @@ static int input_default_getkeycode(struct input_dev *dev, return 0; } -static int input_default_setkeycode(struct input_dev *dev, - const struct input_keymap_entry *ke, - unsigned int *old_keycode) +/** + * input_default_setkeycode - default setkeycode method + * @dev: input device which keymap is being updated. + * @ke: new keymap entry. + * @old_keycode: pointer to the location where old keycode should be stored. + * + * This function is the default implementation of &input_dev.setkeycode() + * method. It is typically used when a driver does not provide its own + * implementation, but it is also exported so drivers can extend it. + * + * The function must be called with &input_dev.event_lock held. + * + * Return: 0 on success, or a negative error code on failure. + */ +int input_default_setkeycode(struct input_dev *dev, + const struct input_keymap_entry *ke, + unsigned int *old_keycode) { unsigned int index; int error; int i; + lockdep_assert_held(&dev->event_lock); + if (!dev->keycodesize) return -EINVAL; @@ -861,6 +877,7 @@ static int input_default_setkeycode(struct input_dev *dev, __set_bit(ke->keycode, dev->keybit); return 0; } +EXPORT_SYMBOL(input_default_setkeycode); /** * input_get_keycode - retrieve keycode currently mapped to a given scancode diff --git a/include/linux/input.h b/include/linux/input.h index 7d7cb0593a63..06ca62328db1 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -517,6 +517,10 @@ INPUT_GENERATE_ABS_ACCESSORS(res, resolution) int input_scancode_to_scalar(const struct input_keymap_entry *ke, unsigned int *scancode); +int input_default_setkeycode(struct input_dev *dev, + const struct input_keymap_entry *ke, + unsigned int *old_keycode); + int input_get_keycode(struct input_dev *dev, struct input_keymap_entry *ke); int input_set_keycode(struct input_dev *dev, const struct input_keymap_entry *ke); -- cgit v1.2.3 From fa4f81a8c15d4018eb2053b093bf1584777e80d4 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 19 Feb 2026 10:47:03 +0900 Subject: ata: libata-scsi: make ata_scsi_simulate() static ata_scsi_simulate() is called only from libata-scsi.c. Move this function definition as a static function before its call in __ata_scsi_queuecmd() and remove its declaration from include/linux/libata.h. No functional changes. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke --- drivers/ata/libata-scsi.c | 147 +++++++++++++++++++++++----------------------- include/linux/libata.h | 1 - 2 files changed, 73 insertions(+), 75 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 41918e21d0f8..ad628b398fc3 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -4420,6 +4420,79 @@ static inline ata_xlat_func_t ata_get_xlat_func(struct ata_device *dev, u8 cmd) return NULL; } +/** + * ata_scsi_simulate - simulate SCSI command on ATA device + * @dev: the target device + * @cmd: SCSI command being sent to device. + * + * Interprets and directly executes a select list of SCSI commands + * that can be handled internally. + * + * LOCKING: + * spin_lock_irqsave(host lock) + */ +static void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd) +{ + const u8 *scsicmd = cmd->cmnd; + u8 tmp8; + + switch (scsicmd[0]) { + case INQUIRY: + ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_inquiry); + break; + + case MODE_SENSE: + case MODE_SENSE_10: + ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_mode_sense); + break; + + case READ_CAPACITY: + case SERVICE_ACTION_IN_16: + ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_read_cap); + break; + + case REPORT_LUNS: + ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_report_luns); + break; + + case REQUEST_SENSE: + ata_scsi_set_sense(dev, cmd, 0, 0, 0); + break; + + /* if we reach this, then writeback caching is disabled, + * turning this into a no-op. + */ + case SYNCHRONIZE_CACHE: + case SYNCHRONIZE_CACHE_16: + fallthrough; + + /* no-op's, complete with success */ + case REZERO_UNIT: + case SEEK_6: + case SEEK_10: + case TEST_UNIT_READY: + break; + + case SEND_DIAGNOSTIC: + tmp8 = scsicmd[1] & ~(1 << 3); + if (tmp8 != 0x4 || scsicmd[3] || scsicmd[4]) + ata_scsi_set_invalid_field(dev, cmd, 1, 0xff); + break; + + case MAINTENANCE_IN: + ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_maint_in); + break; + + /* all other commands */ + default: + ata_scsi_set_sense(dev, cmd, ILLEGAL_REQUEST, 0x20, 0x0); + /* "Invalid command operation code" */ + break; + } + + scsi_done(cmd); +} + enum scsi_qc_status __ata_scsi_queuecmd(struct scsi_cmnd *scmd, struct ata_device *dev) { @@ -4522,80 +4595,6 @@ enum scsi_qc_status ata_scsi_queuecmd(struct Scsi_Host *shost, } EXPORT_SYMBOL_GPL(ata_scsi_queuecmd); -/** - * ata_scsi_simulate - simulate SCSI command on ATA device - * @dev: the target device - * @cmd: SCSI command being sent to device. - * - * Interprets and directly executes a select list of SCSI commands - * that can be handled internally. - * - * LOCKING: - * spin_lock_irqsave(host lock) - */ - -void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd) -{ - const u8 *scsicmd = cmd->cmnd; - u8 tmp8; - - switch(scsicmd[0]) { - case INQUIRY: - ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_inquiry); - break; - - case MODE_SENSE: - case MODE_SENSE_10: - ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_mode_sense); - break; - - case READ_CAPACITY: - case SERVICE_ACTION_IN_16: - ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_read_cap); - break; - - case REPORT_LUNS: - ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_report_luns); - break; - - case REQUEST_SENSE: - ata_scsi_set_sense(dev, cmd, 0, 0, 0); - break; - - /* if we reach this, then writeback caching is disabled, - * turning this into a no-op. - */ - case SYNCHRONIZE_CACHE: - case SYNCHRONIZE_CACHE_16: - fallthrough; - - /* no-op's, complete with success */ - case REZERO_UNIT: - case SEEK_6: - case SEEK_10: - case TEST_UNIT_READY: - break; - - case SEND_DIAGNOSTIC: - tmp8 = scsicmd[1] & ~(1 << 3); - if (tmp8 != 0x4 || scsicmd[3] || scsicmd[4]) - ata_scsi_set_invalid_field(dev, cmd, 1, 0xff); - break; - - case MAINTENANCE_IN: - ata_scsi_rbuf_fill(dev, cmd, ata_scsiop_maint_in); - break; - - /* all other commands */ - default: - ata_scsi_set_sense(dev, cmd, ILLEGAL_REQUEST, 0x20, 0x0); - /* "Invalid command operation code" */ - break; - } - - scsi_done(cmd); -} - int ata_scsi_add_hosts(struct ata_host *host, const struct scsi_host_template *sht) { int i, rc; diff --git a/include/linux/libata.h b/include/linux/libata.h index 00346ce3af5e..db87c99e4189 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1205,7 +1205,6 @@ extern unsigned int ata_do_dev_read_id(struct ata_device *dev, struct ata_taskfile *tf, __le16 *id); extern void ata_qc_complete(struct ata_queued_cmd *qc); extern u64 ata_qc_get_active(struct ata_port *ap); -extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd); extern int ata_std_bios_param(struct scsi_device *sdev, struct gendisk *unused, sector_t capacity, int geom[]); -- cgit v1.2.3 From ecfa23b486b22844855844202424bc1966cebb33 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Tue, 17 Feb 2026 12:26:15 +0100 Subject: jiffies: Remove unused __jiffy_arch_data The __jiffy_arch_data definition was added in 2017 by commit 60b0a8c3d248 ("frv: declare jiffies to be located in the .data section") for the needs of the frv port. The frv support was removed in 2018 by commit fd8773f9f544 ("arch: remove frv port") and no other architecture has required __jiffy_arch_data. Therefore, remove this unused definition. Signed-off-by: Petr Pavlu Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260217112638.1525094-1-petr.pavlu@suse.com --- include/linux/jiffies.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index fdef2c155c27..1a393d160420 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -67,10 +67,6 @@ extern void register_refined_jiffies(long clock_tick_rate); /* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define USER_TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) -#ifndef __jiffy_arch_data -#define __jiffy_arch_data -#endif - /* * The 64-bit value is not atomic on 32-bit systems - you MUST NOT read it * without sampling the sequence number in jiffies_lock. @@ -83,7 +79,7 @@ extern void register_refined_jiffies(long clock_tick_rate); * See arch/ARCH/kernel/vmlinux.lds.S */ extern u64 __cacheline_aligned_in_smp jiffies_64; -extern unsigned long volatile __cacheline_aligned_in_smp __jiffy_arch_data jiffies; +extern unsigned long volatile __cacheline_aligned_in_smp jiffies; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); -- cgit v1.2.3 From 38ab6557234d8629407a824be90e82514d6129a0 Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Fri, 20 Feb 2026 17:01:11 +0100 Subject: regmap: sort header includes Sort the included headers to make spotting duplicates easier and avoid discussions on where to add new includes. Signed-off-by: Sander Vanheule Link: https://patch.msgid.link/20260220160112.543391-1-sander@svanheule.net Signed-off-by: Mark Brown --- include/linux/regmap.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index caff2240bdab..c8a6a05bdba1 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -10,15 +10,15 @@ * Author: Mark Brown */ -#include -#include -#include +#include #include #include -#include -#include -#include #include +#include +#include +#include +#include +#include struct module; struct clk; -- cgit v1.2.3 From 37983fad7f3ef296fa0504c8e945987459dc5487 Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Fri, 20 Feb 2026 17:01:12 +0100 Subject: regmap: define cleanup helper for regmap_field For temporary field allocation, the user has to perform manual cleanup, or rely on devm_regmap_field_alloc() to (eventually) clean up the allocated resources when an error occurs. Add a cleanup helper that takes care of freeing the allocated regmap_field whenever it goes out of scope. This can simplify this example: struct regmap_field *field = regmap_field_alloc(...); if (IS_ERR(field)) return PTR_ERR(field); int err = regmap_field_read(...); if (err) goto out; /* some logic that may also error */ err = regmap_field_write(...); out: regmap_field_free(field); return err; into the shorter: struct regmap_field *field __free(regmap_field) = regmap_field_alloc(...); if (IS_ERR(field)) return PTR_ERR(field); int err = regmap_field_read(...); if (err) return err; /* some logic that may also error */ return regmap_field_write(...); Signed-off-by: Sander Vanheule Link: https://patch.msgid.link/20260220160112.543391-2-sander@svanheule.net Signed-off-by: Mark Brown --- include/linux/regmap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index c8a6a05bdba1..f1c5cb63c171 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -11,6 +11,7 @@ */ #include +#include #include #include #include @@ -1460,6 +1461,8 @@ struct regmap_field *regmap_field_alloc(struct regmap *regmap, struct reg_field reg_field); void regmap_field_free(struct regmap_field *field); +DEFINE_FREE(regmap_field, struct regmap_field *, if (_T) regmap_field_free(_T)) + struct regmap_field *devm_regmap_field_alloc(struct device *dev, struct regmap *regmap, struct reg_field reg_field); void devm_regmap_field_free(struct device *dev, struct regmap_field *field); -- cgit v1.2.3 From aa8671af0c380c15989b88325a2d5a6c5341771d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 24 Feb 2026 12:10:43 +0100 Subject: PCI/PTM: Drop pci_enable_ptm() granularity parameter No pci_enable_ptm() callers supply the "granularity" pointer where the clock granularity would be returned. Drop the unused pci_enable_ptm() parameter. Signed-off-by: Mika Westerberg [bhelgaas: commit log] Signed-off-by: Bjorn Helgaas Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20260224111044.3487873-5-mika.westerberg@linux.intel.com --- drivers/net/ethernet/intel/ice/ice_main.c | 2 +- drivers/net/ethernet/intel/idpf/idpf_main.c | 2 +- drivers/net/ethernet/intel/igc/igc_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +- drivers/pci/pcie/ptm.c | 11 +++-------- include/linux/pci.h | 4 ++-- 6 files changed, 9 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index ebf48feffb30..b35c4e4ecd2a 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -5028,7 +5028,7 @@ static int ice_init(struct ice_pf *pf) } if (pf->hw.mac_type == ICE_MAC_E830) { - err = pci_enable_ptm(pf->pdev, NULL); + err = pci_enable_ptm(pf->pdev); if (err) dev_dbg(dev, "PCIe PTM not supported by PCIe bus/controller\n"); } diff --git a/drivers/net/ethernet/intel/idpf/idpf_main.c b/drivers/net/ethernet/intel/idpf/idpf_main.c index 0dd741dcfcdb..ab3c409e587b 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_main.c +++ b/drivers/net/ethernet/intel/idpf/idpf_main.c @@ -257,7 +257,7 @@ static int idpf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_free; } - err = pci_enable_ptm(pdev, NULL); + err = pci_enable_ptm(pdev); if (err) pci_dbg(pdev, "PCIe PTM is not supported by PCIe bus/controller\n"); diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 27e5c2109138..b030acf94ac4 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7123,7 +7123,7 @@ static int igc_probe(struct pci_dev *pdev, if (err) goto err_pci_reg; - err = pci_enable_ptm(pdev, NULL); + err = pci_enable_ptm(pdev); if (err < 0) dev_info(&pdev->dev, "PCIe PTM not supported by PCIe bus/controller\n"); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index fdc3ba20912e..0b94d4ed0ef6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -960,7 +960,7 @@ static int mlx5_pci_init(struct mlx5_core_dev *dev, struct pci_dev *pdev, mlx5_pci_vsc_init(dev); - pci_enable_ptm(pdev, NULL); + pci_enable_ptm(pdev); return 0; diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c index 91a598ed534c..2c848ae4f15f 100644 --- a/drivers/pci/pcie/ptm.c +++ b/drivers/pci/pcie/ptm.c @@ -88,7 +88,7 @@ void pci_ptm_init(struct pci_dev *dev) if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || pci_pcie_type(dev) == PCI_EXP_TYPE_UPSTREAM) - pci_enable_ptm(dev, NULL); + pci_enable_ptm(dev); } void pci_save_ptm_state(struct pci_dev *dev) @@ -182,15 +182,13 @@ static int __pci_enable_ptm(struct pci_dev *dev) /** * pci_enable_ptm() - Enable Precision Time Measurement * @dev: PCI device - * @granularity: pointer to return granularity * - * Enable Precision Time Measurement for @dev. If successful and - * @granularity is non-NULL, return the Effective Granularity. + * Enable Precision Time Measurement for @dev. * * Return: zero if successful, or -EINVAL if @dev lacks a PTM Capability or * is not a PTM Root and lacks an upstream path of PTM-enabled devices. */ -int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) +int pci_enable_ptm(struct pci_dev *dev) { int rc; char clock_desc[8]; @@ -201,9 +199,6 @@ int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) dev->ptm_enabled = 1; - if (granularity) - *granularity = dev->ptm_granularity; - switch (dev->ptm_granularity) { case 0: snprintf(clock_desc, sizeof(clock_desc), "unknown"); diff --git a/include/linux/pci.h b/include/linux/pci.h index 1c270f1d5123..8aaa72dcb164 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1975,11 +1975,11 @@ struct pci_ptm_debugfs { }; #ifdef CONFIG_PCIE_PTM -int pci_enable_ptm(struct pci_dev *dev, u8 *granularity); +int pci_enable_ptm(struct pci_dev *dev); void pci_disable_ptm(struct pci_dev *dev); bool pcie_ptm_enabled(struct pci_dev *dev); #else -static inline int pci_enable_ptm(struct pci_dev *dev, u8 *granularity) +static inline int pci_enable_ptm(struct pci_dev *dev) { return -EINVAL; } static inline void pci_disable_ptm(struct pci_dev *dev) { } static inline bool pcie_ptm_enabled(struct pci_dev *dev) -- cgit v1.2.3 From e02902dd493bf9c9b05353c761737ac514ad7a5c Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Mon, 23 Feb 2026 18:21:01 +0200 Subject: spi: add devm_spi_new_ancillary_device() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a devres-managed version of spi_new_ancillary_device() that automatically unregisters the ancillary SPI device when the parent device is removed. This follows the same devm_add_action_or_reset() pattern used by the other managed SPI functions (devm_spi_optimize_message, devm_spi_register_controller, etc.) and eliminates the need for drivers to open-code their own devm cleanup callbacks for ancillary devices. Acked-by: Nuno Sá Signed-off-by: Antoniu Miclaus Link: https://patch.msgid.link/20260223162110.156746-3-antoniu.miclaus@analog.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 40 ++++++++++++++++++++++++++++++++++++++++ include/linux/spi/spi.h | 1 + 2 files changed, 41 insertions(+) (limited to 'include/linux') diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 8fbed4754de4..26cc10aa7533 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2747,6 +2747,46 @@ err_out: } EXPORT_SYMBOL_GPL(spi_new_ancillary_device); +static void devm_spi_unregister_device(void *spi) +{ + spi_unregister_device(spi); +} + +/** + * devm_spi_new_ancillary_device() - Register managed ancillary SPI device + * @spi: Pointer to the main SPI device registering the ancillary device + * @chip_select: Chip Select of the ancillary device + * + * Register an ancillary SPI device; for example some chips have a chip-select + * for normal device usage and another one for setup/firmware upload. + * + * This is the managed version of spi_new_ancillary_device(). The ancillary + * device will be unregistered automatically when the parent SPI device is + * unregistered. + * + * This may only be called from main SPI device's probe routine. + * + * Return: Pointer to new ancillary device on success; ERR_PTR on failure + */ +struct spi_device *devm_spi_new_ancillary_device(struct spi_device *spi, + u8 chip_select) +{ + struct spi_device *ancillary; + int ret; + + ancillary = spi_new_ancillary_device(spi, chip_select); + if (IS_ERR(ancillary)) + return ancillary; + + ret = devm_add_action_or_reset(&spi->dev, devm_spi_unregister_device, + ancillary); + if (ret) + return ERR_PTR(ret); + + return ancillary; +} +EXPORT_SYMBOL_GPL(devm_spi_new_ancillary_device); + #ifdef CONFIG_ACPI struct acpi_spi_lookup { struct spi_controller *ctlr; diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index af7cfee7b8f6..1c9aab627583 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -387,6 +387,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) } extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 chip_select); +extern struct spi_device *devm_spi_new_ancillary_device(struct spi_device *spi, u8 chip_select); /* Use a define to avoid include chaining to get THIS_MODULE */ #define spi_register_driver(driver) \ -- cgit v1.2.3 From 477174ac35c510d0ed3043f5bd4fba25546a21ce Mon Sep 17 00:00:00 2001 From: David Carlier Date: Tue, 24 Feb 2026 05:56:37 +0000 Subject: sched_ext: Optimize sched_ext_entity layout for cache locality Reorder struct sched_ext_entity to place ops_state, ddsp_dsq_id, and ddsp_enq_flags immediately after dsq. These fields are accessed together in the do_enqueue_task() and finish_dispatch() hot paths but were previously spread across three different cache lines. Grouping them on the same cache line reduces cache misses on every enqueue and dispatch operation. Signed-off-by: David Carlier Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 4601e5ecb43c..0150b3fe6230 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -163,6 +163,9 @@ struct scx_dsq_list_node { */ struct sched_ext_entity { struct scx_dispatch_q *dsq; + atomic_long_t ops_state; + u64 ddsp_dsq_id; + u64 ddsp_enq_flags; struct scx_dsq_list_node dsq_list; /* dispatch order */ struct rb_node dsq_priq; /* p->scx.dsq_vtime order */ u32 dsq_seq; @@ -174,7 +177,6 @@ struct sched_ext_entity { s32 selected_cpu; u32 kf_mask; /* see scx_kf_mask above */ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ - atomic_long_t ops_state; struct list_head runnable_node; /* rq->scx.runnable_list */ unsigned long runnable_at; @@ -182,8 +184,6 @@ struct sched_ext_entity { #ifdef CONFIG_SCHED_CORE u64 core_sched_at; /* see scx_prio_less() */ #endif - u64 ddsp_dsq_id; - u64 ddsp_enq_flags; /* BPF scheduler modifiable fields */ -- cgit v1.2.3 From cd0aa651535092afd5d776bfe94e4fdf750f89c3 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 23 Feb 2026 09:34:41 +0000 Subject: net: stmmac: pass interface mode into fix_mac_speed() method Pass the current interface mode reported by phylink into the fix_mac_speed() method. This will be used by qcom-ethqos for its "SGMII" configuration. Reviewed-by: Maxime Chevallier Tested-by: Mohd Ayaan Anwar Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vuSKv-0000000AScG-1zv6@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c | 3 ++- drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c | 11 +++++++---- drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c | 3 ++- drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 3 ++- drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 3 ++- drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c | 11 +++++++---- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 ++- include/linux/stmmac.h | 3 ++- 8 files changed, 26 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index d043bad4a862..0495437d3a6e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -112,7 +112,8 @@ static int dwc_qos_probe(struct platform_device *pdev, #define AUTO_CAL_STATUS 0x880c #define AUTO_CAL_STATUS_ACTIVE BIT(31) -static void tegra_eqos_fix_speed(void *bsp_priv, int speed, unsigned int mode) +static void tegra_eqos_fix_speed(void *bsp_priv, phy_interface_t interface, + int speed, unsigned int mode) { struct tegra_eqos *eqos = bsp_priv; bool needs_calibration = false; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c index c4e85197629d..9f5a15b81f8a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c @@ -48,7 +48,8 @@ struct imx_dwmac_ops { int (*fix_soc_reset)(struct stmmac_priv *priv); int (*set_intf_mode)(struct imx_priv_data *dwmac, u8 phy_intf_sel); - void (*fix_mac_speed)(void *priv, int speed, unsigned int mode); + void (*fix_mac_speed)(void *priv, phy_interface_t interface, + int speed, unsigned int mode); }; struct imx_priv_data { @@ -160,7 +161,8 @@ static int imx_dwmac_set_clk_tx_rate(void *bsp_priv, struct clk *clk_tx_i, return stmmac_set_clk_tx_rate(bsp_priv, clk_tx_i, interface, speed); } -static void imx_dwmac_fix_speed(void *priv, int speed, unsigned int mode) +static void imx_dwmac_fix_speed(void *priv, phy_interface_t interface, + int speed, unsigned int mode) { struct plat_stmmacenet_data *plat_dat; struct imx_priv_data *dwmac = priv; @@ -185,13 +187,14 @@ static void imx_dwmac_fix_speed(void *priv, int speed, unsigned int mode) dev_err(dwmac->dev, "failed to set tx rate %lu\n", rate); } -static void imx93_dwmac_fix_speed(void *priv, int speed, unsigned int mode) +static void imx93_dwmac_fix_speed(void *priv, phy_interface_t interface, + int speed, unsigned int mode) { struct imx_priv_data *dwmac = priv; unsigned int iface; int ctrl, old_ctrl; - imx_dwmac_fix_speed(priv, speed, mode); + imx_dwmac_fix_speed(priv, interface, speed, mode); if (!dwmac || mode != MLO_AN_FIXED) return; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index 815213223583..9c51c96223ad 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -143,7 +143,8 @@ static struct stmmac_pci_info loongson_gmac_pci_info = { .setup = loongson_gmac_data, }; -static void loongson_gnet_fix_speed(void *priv, int speed, unsigned int mode) +static void loongson_gnet_fix_speed(void *priv, phy_interface_t interface, + int speed, unsigned int mode) { struct loongson_data *ld = (struct loongson_data *)priv; struct net_device *ndev = dev_get_drvdata(ld->dev); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index bd5d3bf90400..9b29516a5a7c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -643,7 +643,8 @@ static void ethqos_configure(struct qcom_ethqos *ethqos, int speed) return ethqos->configure_func(ethqos, speed); } -static void ethqos_fix_mac_speed(void *priv, int speed, unsigned int mode) +static void ethqos_fix_mac_speed(void *priv, phy_interface_t interface, + int speed, unsigned int mode) { struct qcom_ethqos *ethqos = priv; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index 4c8991f3b38d..c6b99814d391 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -72,7 +72,8 @@ struct socfpga_dwmac { const struct socfpga_dwmac_ops *ops; }; -static void socfpga_dwmac_fix_mac_speed(void *bsp_priv, int speed, +static void socfpga_dwmac_fix_mac_speed(void *bsp_priv, + phy_interface_t interface, int speed, unsigned int mode) { struct socfpga_dwmac *dwmac = (struct socfpga_dwmac *)bsp_priv; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c index f50547b67fbc..6ebbf95d158f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sti.c @@ -91,11 +91,13 @@ struct sti_dwmac { struct regmap *regmap; bool gmac_en; int speed; - void (*fix_retime_src)(void *priv, int speed, unsigned int mode); + void (*fix_retime_src)(void *priv, phy_interface_t interface, + int speed, unsigned int mode); }; struct sti_dwmac_of_data { - void (*fix_retime_src)(void *priv, int speed, unsigned int mode); + void (*fix_retime_src)(void *priv, phy_interface_t interface, + int speed, unsigned int mode); }; enum { @@ -114,7 +116,8 @@ static u32 stih4xx_tx_retime_val[] = { | STIH4XX_ETH_SEL_INTERNAL_NOTEXT_PHYCLK, }; -static void stih4xx_fix_retime_src(void *priv, int spd, unsigned int mode) +static void stih4xx_fix_retime_src(void *priv, phy_interface_t interface, + int spd, unsigned int mode) { struct sti_dwmac *dwmac = priv; u32 src = dwmac->tx_retime_src; @@ -170,7 +173,7 @@ static int sti_set_phy_intf_sel(void *bsp_priv, u8 phy_intf_sel) val = (dwmac->interface == PHY_INTERFACE_MODE_REVMII) ? 0 : ENMII; regmap_update_bits(regmap, reg, ENMII_MASK, val); - dwmac->fix_retime_src(dwmac, dwmac->speed, 0); + dwmac->fix_retime_src(dwmac, dwmac->interface, dwmac->speed, 0); return 0; } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 82375d34ad57..d7c730179a7f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1071,7 +1071,8 @@ static void stmmac_mac_link_up(struct phylink_config *config, } if (priv->plat->fix_mac_speed) - priv->plat->fix_mac_speed(priv->plat->bsp_priv, speed, mode); + priv->plat->fix_mac_speed(priv->plat->bsp_priv, interface, + speed, mode); if (!duplex) ctrl &= ~priv->hw->link.duplex; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 32352a216567..b96ae9dadfab 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -256,7 +256,8 @@ struct plat_stmmacenet_data { int (*set_phy_intf_sel)(void *priv, u8 phy_intf_sel); int (*set_clk_tx_rate)(void *priv, struct clk *clk_tx_i, phy_interface_t interface, int speed); - void (*fix_mac_speed)(void *priv, int speed, unsigned int mode); + void (*fix_mac_speed)(void *priv, phy_interface_t interface, + int speed, unsigned int mode); int (*fix_soc_reset)(struct stmmac_priv *priv); int (*serdes_powerup)(struct net_device *ndev, void *priv); void (*serdes_powerdown)(struct net_device *ndev, void *priv); -- cgit v1.2.3 From 46a9d97069cab311738c950d0fcef85a459c7b8f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 24 Feb 2026 11:06:38 +0900 Subject: ata: libata-eh: avoid unnecessary calls to ata_scsi_port_error_handler() When handling SCSI command timeouts, if we had no actual command timeouts (either because the command was a deferred qc or the completion path won the race with ata_scsi_cmd_error_handler()), we do not need to go through a port error handling, as there was in fact no errors at all. Modify ata_scsi_cmd_error_handler() to return the number of commands that timed out and use this return value in ata_scsi_error() to call ata_scsi_port_error_handler() only if we had command timeouts, or if the port EH has already been scheduled due to failed commands. Otherwise, simply call scsi_eh_flush_done_q() to finish the completed commands without running the full port error handling. Signed-off-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Niklas Cassel Signed-off-by: Niklas Cassel --- drivers/ata/libata-eh.c | 28 +++++++++++++++++++--------- include/linux/libata.h | 3 ++- 2 files changed, 21 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 72a22b6c9682..12c6740398fa 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -560,21 +560,27 @@ void ata_scsi_error(struct Scsi_Host *host) { struct ata_port *ap = ata_shost_to_port(host); unsigned long flags; + int nr_timedout; LIST_HEAD(eh_work_q); spin_lock_irqsave(host->host_lock, flags); list_splice_init(&host->eh_cmd_q, &eh_work_q); spin_unlock_irqrestore(host->host_lock, flags); - ata_scsi_cmd_error_handler(host, ap, &eh_work_q); - - /* If we timed raced normal completion and there is nothing to - recover nr_timedout == 0 why exactly are we doing error recovery ? */ - ata_scsi_port_error_handler(host, ap); + /* + * First check what errors we got with ata_scsi_cmd_error_handler(). + * If we had no command timeouts and EH is not scheduled for this port, + * meaning that we do not have any failed command, then there is no + * need to go through the full port error handling. We only need to + * flush the completed commands we have. + */ + nr_timedout = ata_scsi_cmd_error_handler(host, ap, &eh_work_q); + if (nr_timedout || ata_port_eh_scheduled(ap)) + ata_scsi_port_error_handler(host, ap); + else + scsi_eh_flush_done_q(&ap->eh_done_q); - /* finish or retry handled scmd's and clean up */ WARN_ON(!list_empty(&eh_work_q)); - } /** @@ -586,9 +592,11 @@ void ata_scsi_error(struct Scsi_Host *host) * process the given list of commands and return those finished to the * ap->eh_done_q. This function is the first part of the libata error * handler which processes a given list of failed commands. + * + * Return the number of commands that timed out. */ -void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, - struct list_head *eh_work_q) +int ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, + struct list_head *eh_work_q) { int i; unsigned long flags; @@ -678,6 +686,8 @@ void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, ap->eh_tries = ATA_EH_MAX_TRIES; spin_unlock_irqrestore(ap->lock, flags); + + return nr_timedout; } EXPORT_SYMBOL(ata_scsi_cmd_error_handler); diff --git a/include/linux/libata.h b/include/linux/libata.h index db87c99e4189..5c085ef4eda7 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1225,7 +1225,8 @@ extern int ata_ncq_prio_enable(struct ata_port *ap, struct scsi_device *sdev, extern struct ata_device *ata_dev_pair(struct ata_device *adev); int ata_set_mode(struct ata_link *link, struct ata_device **r_failed_dev); extern void ata_scsi_port_error_handler(struct Scsi_Host *host, struct ata_port *ap); -extern void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, struct list_head *eh_q); +int ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, + struct list_head *eh_q); /* * SATA specific code - drivers/ata/libata-sata.c -- cgit v1.2.3 From d9d5e1bdd18074ea27985c777ddc3a8a0b007468 Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Mon, 16 Feb 2026 00:22:16 +0900 Subject: dmaengine: dw-edma: Add virtual IRQ for interrupt-emulation doorbells Interrupt emulation can assert the dw-edma IRQ line without updating the DONE/ABORT bits. With the shared read/write/common IRQ handlers, the driver cannot reliably distinguish such an emulated interrupt from a real one and leaving a level IRQ asserted may wedge the line. Allocate a dedicated, requestable Linux virtual IRQ (db_irq) for interrupt emulation and attach an irq_chip whose .irq_ack runs the core-specific deassert sequence (.ack_emulated_irq()). The physical dw-edma interrupt handlers raise this virtual IRQ via generic_handle_irq(), ensuring emulated IRQs are always deasserted. Export the virtual IRQ number (db_irq) and the doorbell register offset (db_offset) via struct dw_edma_chip so platform users can expose interrupt emulation as a doorbell. Without this, a single interrupt-emulation write can leave the level IRQ line asserted and cause the generic IRQ layer to disable it. Signed-off-by: Koichiro Den Reviewed-by: Frank Li Link: https://patch.msgid.link/20260215152216.3393561-3-den@valinux.co.jp Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-core.c | 127 +++++++++++++++++++++++++++++++++++-- include/linux/dma/edma.h | 6 ++ 2 files changed, 128 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c index e7d698b352d3..cd34a3ea602d 100644 --- a/drivers/dma/dw-edma/dw-edma-core.c +++ b/drivers/dma/dw-edma/dw-edma-core.c @@ -663,7 +663,96 @@ static void dw_edma_abort_interrupt(struct dw_edma_chan *chan) chan->status = EDMA_ST_IDLE; } -static inline irqreturn_t dw_edma_interrupt_write(int irq, void *data) +static void dw_edma_emul_irq_ack(struct irq_data *d) +{ + struct dw_edma *dw = irq_data_get_irq_chip_data(d); + + dw_edma_core_ack_emulated_irq(dw); +} + +/* + * irq_chip implementation for interrupt-emulation doorbells. + * + * The emulated source has no mask/unmask mechanism. With handle_level_irq(), + * the flow is therefore: + * 1) .irq_ack() deasserts the source + * 2) registered handlers (if any) are dispatched + * Since deassertion is already done in .irq_ack(), handlers do not need to take + * care of it, hence IRQCHIP_ONESHOT_SAFE. + */ +static struct irq_chip dw_edma_emul_irqchip = { + .name = "dw-edma-emul", + .irq_ack = dw_edma_emul_irq_ack, + .flags = IRQCHIP_ONESHOT_SAFE | IRQCHIP_SKIP_SET_WAKE, +}; + +static int dw_edma_emul_irq_alloc(struct dw_edma *dw) +{ + struct dw_edma_chip *chip = dw->chip; + int virq; + + chip->db_irq = 0; + chip->db_offset = ~0; + + /* + * Only meaningful when the core provides the deassert sequence + * for interrupt emulation. + */ + if (!dw->core->ack_emulated_irq) + return 0; + + /* + * Allocate a single, requestable Linux virtual IRQ number. + * Use >= 1 so that 0 can remain a "not available" sentinel. + */ + virq = irq_alloc_desc(NUMA_NO_NODE); + if (virq < 0) + return virq; + + irq_set_chip_and_handler(virq, &dw_edma_emul_irqchip, handle_level_irq); + irq_set_chip_data(virq, dw); + irq_set_noprobe(virq); + + chip->db_irq = virq; + chip->db_offset = dw_edma_core_db_offset(dw); + + return 0; +} + +static void dw_edma_emul_irq_free(struct dw_edma *dw) +{ + struct dw_edma_chip *chip = dw->chip; + + if (!chip) + return; + if (chip->db_irq <= 0) + return; + + irq_free_descs(chip->db_irq, 1); + chip->db_irq = 0; + chip->db_offset = ~0; +} + +static inline irqreturn_t dw_edma_interrupt_emulated(void *data) +{ + struct dw_edma_irq *dw_irq = data; + struct dw_edma *dw = dw_irq->dw; + int db_irq = dw->chip->db_irq; + + if (db_irq > 0) { + /* + * Interrupt emulation may assert the IRQ line without updating the + * normal DONE/ABORT status bits. With a shared IRQ handler we + * cannot reliably detect such events by status registers alone, so + * always perform the core-specific deassert sequence. + */ + generic_handle_irq(db_irq); + return IRQ_HANDLED; + } + return IRQ_NONE; +} + +static inline irqreturn_t dw_edma_interrupt_write_inner(int irq, void *data) { struct dw_edma_irq *dw_irq = data; @@ -672,7 +761,7 @@ static inline irqreturn_t dw_edma_interrupt_write(int irq, void *data) dw_edma_abort_interrupt); } -static inline irqreturn_t dw_edma_interrupt_read(int irq, void *data) +static inline irqreturn_t dw_edma_interrupt_read_inner(int irq, void *data) { struct dw_edma_irq *dw_irq = data; @@ -681,12 +770,33 @@ static inline irqreturn_t dw_edma_interrupt_read(int irq, void *data) dw_edma_abort_interrupt); } -static irqreturn_t dw_edma_interrupt_common(int irq, void *data) +static inline irqreturn_t dw_edma_interrupt_write(int irq, void *data) +{ + irqreturn_t ret = IRQ_NONE; + + ret |= dw_edma_interrupt_write_inner(irq, data); + ret |= dw_edma_interrupt_emulated(data); + + return ret; +} + +static inline irqreturn_t dw_edma_interrupt_read(int irq, void *data) { irqreturn_t ret = IRQ_NONE; - ret |= dw_edma_interrupt_write(irq, data); - ret |= dw_edma_interrupt_read(irq, data); + ret |= dw_edma_interrupt_read_inner(irq, data); + ret |= dw_edma_interrupt_emulated(data); + + return ret; +} + +static inline irqreturn_t dw_edma_interrupt_common(int irq, void *data) +{ + irqreturn_t ret = IRQ_NONE; + + ret |= dw_edma_interrupt_write_inner(irq, data); + ret |= dw_edma_interrupt_read_inner(irq, data); + ret |= dw_edma_interrupt_emulated(data); return ret; } @@ -973,6 +1083,11 @@ int dw_edma_probe(struct dw_edma_chip *chip) if (err) return err; + /* Allocate a dedicated virtual IRQ for interrupt-emulation doorbells */ + err = dw_edma_emul_irq_alloc(dw); + if (err) + dev_warn(dev, "Failed to allocate emulation IRQ: %d\n", err); + /* Setup write/read channels */ err = dw_edma_channel_setup(dw, wr_alloc, rd_alloc); if (err) @@ -988,6 +1103,7 @@ int dw_edma_probe(struct dw_edma_chip *chip) err_irq_free: for (i = (dw->nr_irqs - 1); i >= 0; i--) free_irq(chip->ops->irq_vector(dev, i), &dw->irq[i]); + dw_edma_emul_irq_free(dw); return err; } @@ -1010,6 +1126,7 @@ int dw_edma_remove(struct dw_edma_chip *chip) /* Free irqs */ for (i = (dw->nr_irqs - 1); i >= 0; i--) free_irq(chip->ops->irq_vector(dev, i), &dw->irq[i]); + dw_edma_emul_irq_free(dw); /* Deregister eDMA device */ dma_async_device_unregister(&dw->dma); diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h index 270b5458aecf..9da53c75e49b 100644 --- a/include/linux/dma/edma.h +++ b/include/linux/dma/edma.h @@ -73,6 +73,8 @@ enum dw_edma_chip_flags { * @ll_region_rd: DMA descriptor link list memory for read channel * @dt_region_wr: DMA data memory for write channel * @dt_region_rd: DMA data memory for read channel + * @db_irq: Virtual IRQ dedicated to interrupt emulation + * @db_offset: Offset from DMA register base * @mf: DMA register map format * @dw: struct dw_edma that is filled by dw_edma_probe() */ @@ -94,6 +96,10 @@ struct dw_edma_chip { struct dw_edma_region dt_region_wr[EDMA_MAX_WR_CH]; struct dw_edma_region dt_region_rd[EDMA_MAX_RD_CH]; + /* interrupt emulation */ + int db_irq; + resource_size_t db_offset; + enum dw_edma_map_format mf; struct dw_edma *dw; -- cgit v1.2.3 From 0289ada4a31661016a0611a41a4886bb958e9985 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Mon, 9 Feb 2026 12:44:33 +0000 Subject: coresight: Fix memory leak in coresight_alloc_device_name() The memory leak detector reports: echo clear > /sys/kernel/debug/kmemleak modprobe coresight_funnel rmmod coresight_funnel # Scan memory leak and report it echo scan > /sys/kernel/debug/kmemleak cat /sys/kernel/debug/kmemleak unreferenced object 0xffff0008020c7200 (size 64): comm "modprobe", pid 410, jiffies 4295333721 hex dump (first 32 bytes): d8 da fe 7e 09 00 ff ff e8 2e ff 7e 09 00 ff ff ...~.......~.... b0 6c ff 7e 09 00 ff ff 30 83 00 7f 09 00 ff ff .l.~....0....... backtrace (crc 4116a690): kmemleak_alloc+0xd8/0xf8 __kmalloc_node_track_caller_noprof+0x2c8/0x6f0 krealloc_node_align_noprof+0x13c/0x2c8 coresight_alloc_device_name+0xe4/0x158 [coresight] 0xffffd327ecef8394 0xffffd327ecef85ec amba_probe+0x118/0x1c8 really_probe+0xc8/0x3f0 __driver_probe_device+0x88/0x190 driver_probe_device+0x44/0x120 __driver_attach+0x100/0x238 bus_for_each_dev+0x84/0xf0 driver_attach+0x2c/0x40 bus_add_driver+0x128/0x258 driver_register+0x64/0x138 __amba_driver_register+0x2c/0x48 The memory leak is caused by not freeing the device list that maintains device indices. This device list preserves stable device indices across unbind and rebind device operations, so it does not share the same lifetime as a device instances and must only be freed when the module is unloaded. Some modules do not implement a module exit callback because they are registered using module_platform_driver(). As a result, the device list cannot be released during module exit for those modules. Fix this by moving the device list into the core layer. As a general solution, instead of maintaining a static list in each driver, drivers now allocate device lists via coresight_allocate_device_list() and device indices via coresight_allocate_device_idx(). The list is released only when the core module is unloaded by calling coresight_release_device_list(), avoiding the leak. Fixes: 0f5f9b6ba9e1 ("coresight: Use platform agnostic names") Reviewed-by: James Clark Signed-off-by: Leo Yan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20260209-arm_coresight_refactor_dev_register-v4-1-62d6042f76f7@arm.com --- drivers/hwtracing/coresight/coresight-catu.c | 4 +- drivers/hwtracing/coresight/coresight-core.c | 127 +++++++++++++++------ drivers/hwtracing/coresight/coresight-ctcu-core.c | 4 +- drivers/hwtracing/coresight/coresight-cti-core.c | 19 ++- drivers/hwtracing/coresight/coresight-dummy.c | 7 +- drivers/hwtracing/coresight/coresight-etb10.c | 4 +- drivers/hwtracing/coresight/coresight-funnel.c | 4 +- drivers/hwtracing/coresight/coresight-replicator.c | 4 +- drivers/hwtracing/coresight/coresight-stm.c | 4 +- drivers/hwtracing/coresight/coresight-tmc-core.c | 12 +- drivers/hwtracing/coresight/coresight-tnoc.c | 4 +- drivers/hwtracing/coresight/coresight-tpda.c | 4 +- drivers/hwtracing/coresight/coresight-tpdm.c | 4 +- drivers/hwtracing/coresight/coresight-tpiu.c | 4 +- drivers/hwtracing/coresight/ultrasoc-smb.c | 4 +- include/linux/coresight.h | 14 +-- 16 files changed, 120 insertions(+), 103 deletions(-) (limited to 'include/linux') diff --git a/drivers/hwtracing/coresight/coresight-catu.c b/drivers/hwtracing/coresight/coresight-catu.c index dfd035852b12..ce71dcddfca2 100644 --- a/drivers/hwtracing/coresight/coresight-catu.c +++ b/drivers/hwtracing/coresight/coresight-catu.c @@ -30,8 +30,6 @@ #define catu_dbg(x, ...) do {} while (0) #endif -DEFINE_CORESIGHT_DEVLIST(catu_devs, "catu"); - struct catu_etr_buf { struct tmc_sg_table *catu_table; dma_addr_t sladdr; @@ -530,7 +528,7 @@ static int __catu_probe(struct device *dev, struct resource *res) if (ret) return ret; - catu_desc.name = coresight_alloc_device_name(&catu_devs, dev); + catu_desc.name = coresight_alloc_device_name("catu", dev); if (!catu_desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c index 80e26396ad0a..6881fdc5da92 100644 --- a/drivers/hwtracing/coresight/coresight-core.c +++ b/drivers/hwtracing/coresight/coresight-core.c @@ -53,6 +53,9 @@ struct coresight_node { const u32 coresight_barrier_pkt[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; EXPORT_SYMBOL_GPL(coresight_barrier_pkt); +/* List maintains the device index */ +static LIST_HEAD(coresight_dev_idx_list); + static const struct cti_assoc_op *cti_assoc_ops; void coresight_set_cti_ops(const struct cti_assoc_op *cti_op) @@ -1438,22 +1441,55 @@ void coresight_unregister(struct coresight_device *csdev) } EXPORT_SYMBOL_GPL(coresight_unregister); +static struct coresight_dev_list * +coresight_allocate_device_list(const char *prefix) +{ + struct coresight_dev_list *list; -/* - * coresight_search_device_idx - Search the fwnode handle of a device - * in the given dev_idx list. Must be called with the coresight_mutex held. - * - * Returns the index of the entry, when found. Otherwise, -ENOENT. - */ -static int coresight_search_device_idx(struct coresight_dev_list *dict, - struct fwnode_handle *fwnode) + /* Check if have already allocated */ + list_for_each_entry(list, &coresight_dev_idx_list, node) { + if (!strcmp(list->pfx, prefix)) + return list; + } + + list = kzalloc(sizeof(*list), GFP_KERNEL); + if (!list) + return NULL; + + list->pfx = kstrdup(prefix, GFP_KERNEL); + if (!list->pfx) { + kfree(list); + return NULL; + } + + list_add(&list->node, &coresight_dev_idx_list); + return list; +} + +static int coresight_allocate_device_idx(struct coresight_dev_list *list, + struct device *dev) { - int i; + struct fwnode_handle **fwnode_list; + struct fwnode_handle *fwnode = dev_fwnode(dev); + int idx; + + for (idx = 0; idx < list->nr_idx; idx++) + if (list->fwnode_list[idx] == fwnode) + return idx; + + /* Make space for the new entry */ + idx = list->nr_idx; + fwnode_list = krealloc_array(list->fwnode_list, + idx + 1, sizeof(*list->fwnode_list), + GFP_KERNEL); + if (!fwnode_list) + return -ENOMEM; - for (i = 0; i < dict->nr_idx; i++) - if (dict->fwnode_list[i] == fwnode) - return i; - return -ENOENT; + fwnode_list[idx] = fwnode; + list->fwnode_list = fwnode_list; + list->nr_idx = idx + 1; + + return idx; } static bool coresight_compare_type(enum coresight_dev_type type_a, @@ -1527,45 +1563,63 @@ bool coresight_loses_context_with_cpu(struct device *dev) EXPORT_SYMBOL_GPL(coresight_loses_context_with_cpu); /* - * coresight_alloc_device_name - Get an index for a given device in the - * device index list specific to a driver. An index is allocated for a - * device and is tracked with the fwnode_handle to prevent allocating + * coresight_alloc_device_name - Get an index for a given device in the list + * specific to a driver (presented by the prefix string). An index is allocated + * for a device and is tracked with the fwnode_handle to prevent allocating * duplicate indices for the same device (e.g, if we defer probing of * a device due to dependencies), in case the index is requested again. */ -char *coresight_alloc_device_name(struct coresight_dev_list *dict, - struct device *dev) +char *coresight_alloc_device_name(const char *prefix, struct device *dev) { - int idx; + struct coresight_dev_list *list; char *name = NULL; - struct fwnode_handle **list; + int idx; mutex_lock(&coresight_mutex); - idx = coresight_search_device_idx(dict, dev_fwnode(dev)); - if (idx < 0) { - /* Make space for the new entry */ - idx = dict->nr_idx; - list = krealloc_array(dict->fwnode_list, - idx + 1, sizeof(*dict->fwnode_list), - GFP_KERNEL); - if (ZERO_OR_NULL_PTR(list)) { - idx = -ENOMEM; - goto done; - } + list = coresight_allocate_device_list(prefix); + if (!list) + goto done; - list[idx] = dev_fwnode(dev); - dict->fwnode_list = list; - dict->nr_idx = idx + 1; - } + idx = coresight_allocate_device_idx(list, dev); - name = devm_kasprintf(dev, GFP_KERNEL, "%s%d", dict->pfx, idx); + /* + * If index allocation fails, the device list is not released here; + * it is instead freed later by coresight_release_device_list() when + * the coresight_core module is unloaded. + */ + if (idx < 0) + goto done; + + name = devm_kasprintf(dev, GFP_KERNEL, "%s%d", list->pfx, idx); done: mutex_unlock(&coresight_mutex); return name; } EXPORT_SYMBOL_GPL(coresight_alloc_device_name); +static void coresight_release_device_list(void) +{ + struct coresight_dev_list *list, *next; + int i; + + /* + * Here is no need to take coresight_mutex; this is during core module + * unloading, no race condition with other modules. + */ + + list_for_each_entry_safe(list, next, &coresight_dev_idx_list, node) { + for (i = 0; i < list->nr_idx; i++) + list->fwnode_list[i] = NULL; + list->nr_idx = 0; + list_del(&list->node); + + kfree(list->pfx); + kfree(list->fwnode_list); + kfree(list); + } +} + const struct bus_type coresight_bustype = { .name = "coresight", }; @@ -1639,6 +1693,7 @@ static void __exit coresight_exit(void) &coresight_notifier); etm_perf_exit(); bus_unregister(&coresight_bustype); + coresight_release_device_list(); } module_init(coresight_init); diff --git a/drivers/hwtracing/coresight/coresight-ctcu-core.c b/drivers/hwtracing/coresight/coresight-ctcu-core.c index abed15eb72b4..6813ae6e929b 100644 --- a/drivers/hwtracing/coresight/coresight-ctcu-core.c +++ b/drivers/hwtracing/coresight/coresight-ctcu-core.c @@ -19,8 +19,6 @@ #include "coresight-ctcu.h" #include "coresight-priv.h" -DEFINE_CORESIGHT_DEVLIST(ctcu_devs, "ctcu"); - #define ctcu_writel(drvdata, val, offset) __raw_writel((val), drvdata->base + offset) #define ctcu_readl(drvdata, offset) __raw_readl(drvdata->base + offset) @@ -187,7 +185,7 @@ static int ctcu_probe(struct platform_device *pdev) void __iomem *base; int i, ret; - desc.name = coresight_alloc_device_name(&ctcu_devs, dev); + desc.name = coresight_alloc_device_name("ctcu", dev); if (!desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/coresight-cti-core.c b/drivers/hwtracing/coresight/coresight-cti-core.c index 7a8f1ef6b94e..fddc8f31b91d 100644 --- a/drivers/hwtracing/coresight/coresight-cti-core.c +++ b/drivers/hwtracing/coresight/coresight-cti-core.c @@ -48,15 +48,6 @@ static int nr_cti_cpu; /* quick lookup list for CPU bound CTIs when power handling */ static struct cti_drvdata *cti_cpu_drvdata[NR_CPUS]; -/* - * CTI naming. CTI bound to cores will have the name cti_cpu where - * N is the CPU ID. System CTIs will have the name cti_sys where I - * is an index allocated by order of discovery. - * - * CTI device name list - for CTI not bound to cores. - */ -DEFINE_CORESIGHT_DEVLIST(cti_sys_devs, "cti_sys"); - /* write set of regs to hardware - call with spinlock claimed */ void cti_write_all_hw_regs(struct cti_drvdata *drvdata) { @@ -889,12 +880,18 @@ static int cti_probe(struct amba_device *adev, const struct amba_id *id) /* default to powered - could change on PM notifications */ drvdata->config.hw_powered = true; - /* set up device name - will depend if cpu bound or otherwise */ + /* + * Set up device name - will depend if cpu bound or otherwise. + * + * CTI bound to cores will have the name cti_cpu where N is th + * eCPU ID. System CTIs will have the name cti_sys where I is an + * index allocated by order of discovery. + */ if (drvdata->ctidev.cpu >= 0) cti_desc.name = devm_kasprintf(dev, GFP_KERNEL, "cti_cpu%d", drvdata->ctidev.cpu); else - cti_desc.name = coresight_alloc_device_name(&cti_sys_devs, dev); + cti_desc.name = coresight_alloc_device_name("cti_sys", dev); if (!cti_desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/coresight-dummy.c b/drivers/hwtracing/coresight/coresight-dummy.c index 14322c99e29d..c176a2f57300 100644 --- a/drivers/hwtracing/coresight/coresight-dummy.c +++ b/drivers/hwtracing/coresight/coresight-dummy.c @@ -19,9 +19,6 @@ struct dummy_drvdata { u8 traceid; }; -DEFINE_CORESIGHT_DEVLIST(source_devs, "dummy_source"); -DEFINE_CORESIGHT_DEVLIST(sink_devs, "dummy_sink"); - static int dummy_source_enable(struct coresight_device *csdev, struct perf_event *event, enum cs_mode mode, __maybe_unused struct coresight_path *path) @@ -126,7 +123,7 @@ static int dummy_probe(struct platform_device *pdev) if (of_device_is_compatible(node, "arm,coresight-dummy-source")) { - desc.name = coresight_alloc_device_name(&source_devs, dev); + desc.name = coresight_alloc_device_name("dummy_source", dev); if (!desc.name) return -ENOMEM; @@ -155,7 +152,7 @@ static int dummy_probe(struct platform_device *pdev) drvdata->traceid = (u8)trace_id; } else if (of_device_is_compatible(node, "arm,coresight-dummy-sink")) { - desc.name = coresight_alloc_device_name(&sink_devs, dev); + desc.name = coresight_alloc_device_name("dummy_sink", dev); if (!desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c index 6657602d8f2e..b952a1d47f12 100644 --- a/drivers/hwtracing/coresight/coresight-etb10.c +++ b/drivers/hwtracing/coresight/coresight-etb10.c @@ -63,8 +63,6 @@ #define ETB_FFSR_BIT 1 #define ETB_FRAME_SIZE_WORDS 4 -DEFINE_CORESIGHT_DEVLIST(etb_devs, "etb"); - /** * struct etb_drvdata - specifics associated to an ETB component * @base: memory mapped base address for this component. @@ -722,7 +720,7 @@ static int etb_probe(struct amba_device *adev, const struct amba_id *id) struct resource *res = &adev->res; struct coresight_desc desc = { 0 }; - desc.name = coresight_alloc_device_name(&etb_devs, dev); + desc.name = coresight_alloc_device_name("etb", dev); if (!desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/coresight-funnel.c b/drivers/hwtracing/coresight/coresight-funnel.c index 3b248e54471a..3f56ceccd8c9 100644 --- a/drivers/hwtracing/coresight/coresight-funnel.c +++ b/drivers/hwtracing/coresight/coresight-funnel.c @@ -30,8 +30,6 @@ #define FUNNEL_HOLDTIME (0x7 << FUNNEL_HOLDTIME_SHFT) #define FUNNEL_ENSx_MASK 0xff -DEFINE_CORESIGHT_DEVLIST(funnel_devs, "funnel"); - /** * struct funnel_drvdata - specifics associated to a funnel component * @base: memory mapped base address for this component. @@ -223,7 +221,7 @@ static int funnel_probe(struct device *dev, struct resource *res) of_device_is_compatible(dev->of_node, "arm,coresight-funnel")) dev_warn_once(dev, "Uses OBSOLETE CoreSight funnel binding\n"); - desc.name = coresight_alloc_device_name(&funnel_devs, dev); + desc.name = coresight_alloc_device_name("funnel", dev); if (!desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/coresight-replicator.c b/drivers/hwtracing/coresight/coresight-replicator.c index e6472658235d..07fc04f53b88 100644 --- a/drivers/hwtracing/coresight/coresight-replicator.c +++ b/drivers/hwtracing/coresight/coresight-replicator.c @@ -24,8 +24,6 @@ #define REPLICATOR_IDFILTER0 0x000 #define REPLICATOR_IDFILTER1 0x004 -DEFINE_CORESIGHT_DEVLIST(replicator_devs, "replicator"); - /** * struct replicator_drvdata - specifics associated to a replicator component * @base: memory mapped base address for this component. Also indicates @@ -230,7 +228,7 @@ static int replicator_probe(struct device *dev, struct resource *res) dev_warn_once(dev, "Uses OBSOLETE CoreSight replicator binding\n"); - desc.name = coresight_alloc_device_name(&replicator_devs, dev); + desc.name = coresight_alloc_device_name("replicator", dev); if (!desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/coresight-stm.c b/drivers/hwtracing/coresight/coresight-stm.c index e68529bf89c9..aca6cec7885a 100644 --- a/drivers/hwtracing/coresight/coresight-stm.c +++ b/drivers/hwtracing/coresight/coresight-stm.c @@ -110,8 +110,6 @@ struct channel_space { unsigned long *guaranteed; }; -DEFINE_CORESIGHT_DEVLIST(stm_devs, "stm"); - /** * struct stm_drvdata - specifics associated to an STM component * @base: memory mapped base address for this component. @@ -834,7 +832,7 @@ static int __stm_probe(struct device *dev, struct resource *res) struct resource ch_res; struct coresight_desc desc = { 0 }; - desc.name = coresight_alloc_device_name(&stm_devs, dev); + desc.name = coresight_alloc_device_name("stm", dev); if (!desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/coresight-tmc-core.c b/drivers/hwtracing/coresight/coresight-tmc-core.c index 36599c431be6..58b469ee73b4 100644 --- a/drivers/hwtracing/coresight/coresight-tmc-core.c +++ b/drivers/hwtracing/coresight/coresight-tmc-core.c @@ -32,10 +32,6 @@ #include "coresight-priv.h" #include "coresight-tmc.h" -DEFINE_CORESIGHT_DEVLIST(etb_devs, "tmc_etb"); -DEFINE_CORESIGHT_DEVLIST(etf_devs, "tmc_etf"); -DEFINE_CORESIGHT_DEVLIST(etr_devs, "tmc_etr"); - int tmc_wait_for_tmcready(struct tmc_drvdata *drvdata) { struct coresight_device *csdev = drvdata->csdev; @@ -777,7 +773,7 @@ static int __tmc_probe(struct device *dev, struct resource *res) struct coresight_platform_data *pdata = NULL; struct tmc_drvdata *drvdata; struct coresight_desc desc = { 0 }; - struct coresight_dev_list *dev_list = NULL; + const char *dev_list = NULL; drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL); if (!drvdata) @@ -827,7 +823,7 @@ static int __tmc_probe(struct device *dev, struct resource *res) desc.type = CORESIGHT_DEV_TYPE_SINK; desc.subtype.sink_subtype = CORESIGHT_DEV_SUBTYPE_SINK_BUFFER; desc.ops = &tmc_etb_cs_ops; - dev_list = &etb_devs; + dev_list = "tmc_etb"; break; case TMC_CONFIG_TYPE_ETR: desc.groups = coresight_etr_groups; @@ -839,7 +835,7 @@ static int __tmc_probe(struct device *dev, struct resource *res) goto out; idr_init(&drvdata->idr); mutex_init(&drvdata->idr_mutex); - dev_list = &etr_devs; + dev_list = "tmc_etr"; break; case TMC_CONFIG_TYPE_ETF: desc.groups = coresight_etf_groups; @@ -847,7 +843,7 @@ static int __tmc_probe(struct device *dev, struct resource *res) desc.subtype.sink_subtype = CORESIGHT_DEV_SUBTYPE_SINK_BUFFER; desc.subtype.link_subtype = CORESIGHT_DEV_SUBTYPE_LINK_FIFO; desc.ops = &tmc_etf_cs_ops; - dev_list = &etf_devs; + dev_list = "tmc_etf"; break; default: pr_err("%s: Unsupported TMC config\n", desc.name); diff --git a/drivers/hwtracing/coresight/coresight-tnoc.c b/drivers/hwtracing/coresight/coresight-tnoc.c index 1128612e70a7..96a25877b824 100644 --- a/drivers/hwtracing/coresight/coresight-tnoc.c +++ b/drivers/hwtracing/coresight/coresight-tnoc.c @@ -47,8 +47,6 @@ struct trace_noc_drvdata { int atid; }; -DEFINE_CORESIGHT_DEVLIST(trace_noc_devs, "traceNoc"); - static void trace_noc_enable_hw(struct trace_noc_drvdata *drvdata) { u32 val; @@ -191,7 +189,7 @@ static int _tnoc_probe(struct device *dev, struct resource *res) struct coresight_desc desc = { 0 }; int ret; - desc.name = coresight_alloc_device_name(&trace_noc_devs, dev); + desc.name = coresight_alloc_device_name("traceNoc", dev); if (!desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/coresight-tpda.c b/drivers/hwtracing/coresight/coresight-tpda.c index 7055f8f13427..89c8f71f0aff 100644 --- a/drivers/hwtracing/coresight/coresight-tpda.c +++ b/drivers/hwtracing/coresight/coresight-tpda.c @@ -20,8 +20,6 @@ #include "coresight-trace-id.h" #include "coresight-tpdm.h" -DEFINE_CORESIGHT_DEVLIST(tpda_devs, "tpda"); - static void tpda_clear_element_size(struct coresight_device *csdev) { struct tpda_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent); @@ -585,7 +583,7 @@ static int tpda_probe(struct amba_device *adev, const struct amba_id *id) if (ret) return ret; - desc.name = coresight_alloc_device_name(&tpda_devs, dev); + desc.name = coresight_alloc_device_name("tpda", dev); if (!desc.name) return -ENOMEM; desc.type = CORESIGHT_DEV_TYPE_LINK; diff --git a/drivers/hwtracing/coresight/coresight-tpdm.c b/drivers/hwtracing/coresight/coresight-tpdm.c index 06e0a905a67d..da77bdaad0a4 100644 --- a/drivers/hwtracing/coresight/coresight-tpdm.c +++ b/drivers/hwtracing/coresight/coresight-tpdm.c @@ -19,8 +19,6 @@ #include "coresight-priv.h" #include "coresight-tpdm.h" -DEFINE_CORESIGHT_DEVLIST(tpdm_devs, "tpdm"); - static bool tpdm_has_dsb_dataset(struct tpdm_drvdata *drvdata) { return (drvdata->datasets & TPDM_PIDR0_DS_DSB); @@ -1416,7 +1414,7 @@ static int tpdm_probe(struct device *dev, struct resource *res) } /* Set up coresight component description */ - desc.name = coresight_alloc_device_name(&tpdm_devs, dev); + desc.name = coresight_alloc_device_name("tpdm", dev); if (!desc.name) return -ENOMEM; desc.type = CORESIGHT_DEV_TYPE_SOURCE; diff --git a/drivers/hwtracing/coresight/coresight-tpiu.c b/drivers/hwtracing/coresight/coresight-tpiu.c index aaa44bc521c3..b8560b140e0f 100644 --- a/drivers/hwtracing/coresight/coresight-tpiu.c +++ b/drivers/hwtracing/coresight/coresight-tpiu.c @@ -49,8 +49,6 @@ #define FFCR_FON_MAN BIT(6) #define FFCR_STOP_FI BIT(12) -DEFINE_CORESIGHT_DEVLIST(tpiu_devs, "tpiu"); - /* * @base: memory mapped base address for this component. * @atclk: optional clock for the core parts of the TPIU. @@ -134,7 +132,7 @@ static int __tpiu_probe(struct device *dev, struct resource *res) struct coresight_desc desc = { 0 }; int ret; - desc.name = coresight_alloc_device_name(&tpiu_devs, dev); + desc.name = coresight_alloc_device_name("tpiu", dev); if (!desc.name) return -ENOMEM; diff --git a/drivers/hwtracing/coresight/ultrasoc-smb.c b/drivers/hwtracing/coresight/ultrasoc-smb.c index 8f7922a5e534..5776f63468fa 100644 --- a/drivers/hwtracing/coresight/ultrasoc-smb.c +++ b/drivers/hwtracing/coresight/ultrasoc-smb.c @@ -17,8 +17,6 @@ #include "coresight-priv.h" #include "ultrasoc-smb.h" -DEFINE_CORESIGHT_DEVLIST(sink_devs, "ultra_smb"); - #define ULTRASOC_SMB_DSM_UUID "82ae1283-7f6a-4cbe-aa06-53e8fb24db18" static bool smb_buffer_not_empty(struct smb_drv_data *drvdata) @@ -478,7 +476,7 @@ static int smb_register_sink(struct platform_device *pdev, desc.pdata = pdata; desc.dev = &pdev->dev; desc.groups = smb_sink_groups; - desc.name = coresight_alloc_device_name(&sink_devs, &pdev->dev); + desc.name = coresight_alloc_device_name("ultra_smb", &pdev->dev); if (!desc.name) { dev_err(&pdev->dev, "Failed to alloc coresight device name"); return -ENOMEM; diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 2b48be97fcd0..2131febebee9 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -306,24 +306,19 @@ struct coresight_device { * coresight_dev_list - Mapping for devices to "name" index for device * names. * + * @node: Node on the global device index list. * @nr_idx: Number of entries already allocated. * @pfx: Prefix pattern for device name. * @fwnode_list: Array of fwnode_handles associated with each allocated * index, upto nr_idx entries. */ struct coresight_dev_list { + struct list_head node; int nr_idx; - const char *pfx; + char *pfx; struct fwnode_handle **fwnode_list; }; -#define DEFINE_CORESIGHT_DEVLIST(var, dev_pfx) \ -static struct coresight_dev_list (var) = { \ - .pfx = dev_pfx, \ - .nr_idx = 0, \ - .fwnode_list = NULL, \ -} - #define to_coresight_device(d) container_of(d, struct coresight_device, dev) /** @@ -663,8 +658,7 @@ void coresight_clear_self_claim_tag(struct csdev_access *csa); void coresight_clear_self_claim_tag_unlocked(struct csdev_access *csa); void coresight_disclaim_device(struct coresight_device *csdev); void coresight_disclaim_device_unlocked(struct coresight_device *csdev); -char *coresight_alloc_device_name(struct coresight_dev_list *devs, - struct device *dev); +char *coresight_alloc_device_name(const char *prefix, struct device *dev); bool coresight_loses_context_with_cpu(struct device *dev); -- cgit v1.2.3 From 59509da0cb51dc48e4edc57d7d3ef1d424c58fc9 Mon Sep 17 00:00:00 2001 From: Amit Kumar Mahapatra Date: Wed, 4 Feb 2026 09:32:17 +0100 Subject: mtd: Move struct mtd_concat definition to header file To enable a more generic approach for concatenating MTD devices, struct mtd_concat should be accessible beyond the mtdconcat driver. Therefore, the definition is being moved to a header file. Signed-off-by: Amit Kumar Mahapatra Signed-off-by: Luca Ceresoli Signed-off-by: Miquel Raynal --- drivers/mtd/mtdconcat.c | 12 ------------ include/linux/mtd/concat.h | 12 ++++++++++++ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index 9eb5d919d9ba..241d15235d01 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -20,18 +20,6 @@ #include -/* - * Our storage structure: - * Subdev points to an array of pointers to struct mtd_info objects - * which is allocated along with this structure - * - */ -struct mtd_concat { - struct mtd_info mtd; - int num_subdev; - struct mtd_info **subdev; -}; - /* * how to calculate the size required for the above structure, * including the pointer array subdev points to: diff --git a/include/linux/mtd/concat.h b/include/linux/mtd/concat.h index d6f653e07426..b42d9af87c4e 100644 --- a/include/linux/mtd/concat.h +++ b/include/linux/mtd/concat.h @@ -9,6 +9,18 @@ #define MTD_CONCAT_H +/* + * Our storage structure: + * Subdev points to an array of pointers to struct mtd_info objects + * which is allocated along with this structure + * + */ +struct mtd_concat { + struct mtd_info mtd; + int num_subdev; + struct mtd_info **subdev; +}; + struct mtd_info *mtd_concat_create( struct mtd_info *subdev[], /* subdevices to concatenate */ int num_devs, /* number of subdevices */ -- cgit v1.2.3 From 43db6366fc2de02050e66389f5628d3fdc9af10a Mon Sep 17 00:00:00 2001 From: Amit Kumar Mahapatra Date: Wed, 4 Feb 2026 09:32:18 +0100 Subject: mtd: Add driver for concatenating devices Introducing CONFIG_MTD_VIRT_CONCAT to separate the legacy flow from the new approach, where only the concatenated partition is registered as an MTD device, while the individual partitions that form it are not registered independently, as they are typically not required by the user. CONFIG_MTD_VIRT_CONCAT is a boolean configuration option that depends on CONFIG_MTD_PARTITIONED_MASTER. When enabled, it allows flash nodes to be exposed as individual MTD devices along with the other partitions. The solution focuses on fixed-partitions description only as it depends on device boundaries. It supports multiple sets of concatenated devices, each comprising two or more partitions. flash@0 { reg = <0>; partitions { compatible = "fixed-partitions"; part0@0 { part-concat-next = <&flash0_part1>; label = "part0_0"; reg = <0x0 0x800000>; }; flash0_part1: part1@800000 { label = "part0_1"; reg = <800000 0x800000>; }; part2@1000000 { part-concat-next = <&flash1_part0>; label = "part0_2"; reg = <0x800000 0x800000>; }; }; }; flash@1 { reg = <1>; partitions { compatible = "fixed-partitions"; flash1_part0: part1@0 { label = "part1_0"; reg = <0x0 0x800000>; }; part1@800000 { label = "part1_1"; reg = <0x800000 0x800000>; }; }; }; The partitions that gets created are flash@0 part0_0-part0_1-concat flash@1 part1_1 part0_2-part1_0-concat Suggested-by: Bernhard Frauendienst Suggested-by: Miquel Raynal Signed-off-by: Amit Kumar Mahapatra Signed-off-by: Luca Ceresoli Signed-off-by: Miquel Raynal --- drivers/mtd/Kconfig | 9 ++ drivers/mtd/Makefile | 1 + drivers/mtd/mtd_virt_concat.c | 363 ++++++++++++++++++++++++++++++++++++++++++ drivers/mtd/mtdcore.c | 21 +++ drivers/mtd/mtdpart.c | 6 + include/linux/mtd/concat.h | 51 +++++- 6 files changed, 450 insertions(+), 1 deletion(-) create mode 100644 drivers/mtd/mtd_virt_concat.c (limited to 'include/linux') diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig index 796a2eccbef0..0421c6208de7 100644 --- a/drivers/mtd/Kconfig +++ b/drivers/mtd/Kconfig @@ -206,6 +206,15 @@ config MTD_PARTITIONED_MASTER the parent of the partition device be the master device, rather than what lies behind the master. +config MTD_VIRT_CONCAT + bool "Virtual concatenated MTD devices" + depends on MTD_PARTITIONED_MASTER + help + The driver enables the creation of virtual MTD device by + concatenating multiple physical MTD devices into a single + entity. This allows for the creation of partitions larger than + the individual physical chips, extending across chip boundaries. + source "drivers/mtd/chips/Kconfig" source "drivers/mtd/maps/Kconfig" diff --git a/drivers/mtd/Makefile b/drivers/mtd/Makefile index 593d0593a038..7b6dd53e8150 100644 --- a/drivers/mtd/Makefile +++ b/drivers/mtd/Makefile @@ -6,6 +6,7 @@ # Core functionality. obj-$(CONFIG_MTD) += mtd.o mtd-y := mtdcore.o mtdsuper.o mtdconcat.o mtdpart.o mtdchar.o +mtd-$(CONFIG_MTD_VIRT_CONCAT) += mtd_virt_concat.o obj-y += parsers/ diff --git a/drivers/mtd/mtd_virt_concat.c b/drivers/mtd/mtd_virt_concat.c new file mode 100644 index 000000000000..aea88d1c9bc5 --- /dev/null +++ b/drivers/mtd/mtd_virt_concat.c @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Virtual concat MTD device driver + * + * Copyright (C) 2018 Bernhard Frauendienst + * Author: Bernhard Frauendienst + */ + +#include +#include +#include "mtdcore.h" +#include +#include +#include +#include +#include + +#define CONCAT_PROP "part-concat-next" +#define CONCAT_POSTFIX "concat" +#define MIN_DEV_PER_CONCAT 1 + +static LIST_HEAD(concat_node_list); + +/** + * struct mtd_virt_concat_node - components of a concatenation + * @head: List handle + * @count: Number of nodes + * @nodes: Pointer to the nodes (partitions) to concatenate + * @concat: Concatenation container + */ +struct mtd_virt_concat_node { + struct list_head head; + unsigned int count; + struct device_node **nodes; + struct mtd_concat *concat; +}; + +/** + * mtd_is_part_concat - Check if the device is already part + * of a concatenated device + * @dev: pointer to 'device_node' + * + * Return: true if the device is already part of a concatenation, + * false otherwise. + */ +static bool mtd_is_part_concat(struct device_node *dev) +{ + struct mtd_virt_concat_node *item; + int idx; + + list_for_each_entry(item, &concat_node_list, head) { + for (idx = 0; idx < item->count; idx++) { + if (item->nodes[idx] == dev) + return true; + } + } + return false; +} + +static void mtd_virt_concat_put_mtd_devices(struct mtd_concat *concat) +{ + int i; + + for (i = 0; i < concat->num_subdev; i++) + put_mtd_device(concat->subdev[i]); +} + +void mtd_virt_concat_destroy_joins(void) +{ + struct mtd_virt_concat_node *item, *tmp; + struct mtd_info *mtd; + + list_for_each_entry_safe(item, tmp, &concat_node_list, head) { + mtd = &item->concat->mtd; + if (item->concat) { + mtd_device_unregister(mtd); + kfree(mtd->name); + mtd_concat_destroy(mtd); + mtd_virt_concat_put_mtd_devices(item->concat); + } + } +} + +/** + * mtd_virt_concat_destroy - Destroy the concat that includes the mtd object + * @mtd: pointer to 'mtd_info' + * + * Return: 0 on success, -error otherwise. + */ +int mtd_virt_concat_destroy(struct mtd_info *mtd) +{ + struct mtd_info *child, *master = mtd_get_master(mtd); + struct mtd_virt_concat_node *item, *tmp; + struct mtd_concat *concat; + int idx, ret = 0; + bool is_mtd_found; + + list_for_each_entry_safe(item, tmp, &concat_node_list, head) { + is_mtd_found = false; + + /* Find the concat item that hold the mtd device */ + for (idx = 0; idx < item->count; idx++) { + if (item->nodes[idx] == mtd->dev.of_node) { + is_mtd_found = true; + break; + } + } + if (!is_mtd_found) + continue; + concat = item->concat; + + /* + * Since this concatenated device is being removed, retrieve + * all MTD devices that are part of it and register them + * individually. + */ + for (idx = 0; idx < concat->num_subdev; idx++) { + child = concat->subdev[idx]; + if (child->dev.of_node != mtd->dev.of_node) { + ret = add_mtd_device(child); + if (ret) + goto out; + } + } + /* Destroy the concat */ + if (concat->mtd.name) { + del_mtd_device(&concat->mtd); + kfree(concat->mtd.name); + mtd_concat_destroy(&concat->mtd); + mtd_virt_concat_put_mtd_devices(item->concat); + } + + for (idx = 0; idx < item->count; idx++) + of_node_put(item->nodes[idx]); + + kfree(item->nodes); + kfree(item); + } + return 0; +out: + mutex_lock(&master->master.partitions_lock); + list_del(&child->part.node); + mutex_unlock(&master->master.partitions_lock); + kfree(mtd->name); + kfree(mtd); + + return ret; +} + +/** + * mtd_virt_concat_create_item - Create a concat item + * @parts: pointer to 'device_node' + * @count: number of mtd devices that make up + * the concatenated device. + * + * Return: 0 on success, -error otherwise. + */ +static int mtd_virt_concat_create_item(struct device_node *parts, + unsigned int count) +{ + struct mtd_virt_concat_node *item; + struct mtd_concat *concat; + int i; + + for (i = 0; i < (count - 1); i++) { + if (mtd_is_part_concat(of_parse_phandle(parts, CONCAT_PROP, i))) + return 0; + } + + item = kzalloc(sizeof(*item), GFP_KERNEL); + if (!item) + return -ENOMEM; + + item->count = count; + item->nodes = kcalloc(count, sizeof(*item->nodes), GFP_KERNEL); + if (!item->nodes) { + kfree(item); + return -ENOMEM; + } + + /* + * The partition in which "part-concat-next" property + * is defined is the first device in the list of concat + * devices. + */ + item->nodes[0] = parts; + + for (i = 1; i < count; i++) + item->nodes[i] = of_parse_phandle(parts, CONCAT_PROP, (i - 1)); + + concat = kzalloc(sizeof(*concat), GFP_KERNEL); + if (!concat) { + kfree(item); + return -ENOMEM; + } + + concat->subdev = kcalloc(count, sizeof(*concat->subdev), GFP_KERNEL); + if (!concat->subdev) { + kfree(item); + kfree(concat); + return -ENOMEM; + } + item->concat = concat; + + list_add_tail(&item->head, &concat_node_list); + + return 0; +} + +void mtd_virt_concat_destroy_items(void) +{ + struct mtd_virt_concat_node *item, *temp; + int i; + + list_for_each_entry_safe(item, temp, &concat_node_list, head) { + for (i = 0; i < item->count; i++) + of_node_put(item->nodes[i]); + + kfree(item->nodes); + kfree(item); + } +} + +/** + * mtd_virt_concat_create_add - Add a mtd device to the concat list + * @mtd: pointer to 'mtd_info' + * + * Return: true on success, false otherwise. + */ +bool mtd_virt_concat_add(struct mtd_info *mtd) +{ + struct mtd_virt_concat_node *item; + struct mtd_concat *concat; + int idx; + + list_for_each_entry(item, &concat_node_list, head) { + concat = item->concat; + for (idx = 0; idx < item->count; idx++) { + if (item->nodes[idx] == mtd->dev.of_node) { + concat->subdev[concat->num_subdev++] = mtd; + return true; + } + } + } + return false; +} + +/** + * mtd_virt_concat_node_create - List all the concatenations found in DT + * + * Return: 0 on success, -error otherwise. + */ +int mtd_virt_concat_node_create(void) +{ + struct device_node *parts = NULL; + int ret = 0, count = 0; + + /* List all the concatenations found in DT */ + do { + parts = of_find_node_with_property(parts, CONCAT_PROP); + if (!of_device_is_available(parts)) + continue; + + if (mtd_is_part_concat(parts)) + continue; + + count = of_count_phandle_with_args(parts, CONCAT_PROP, NULL); + if (count < MIN_DEV_PER_CONCAT) + continue; + + /* + * The partition in which "part-concat-next" property is defined + * is also part of the concat device, so increament count by 1. + */ + count++; + + ret = mtd_virt_concat_create_item(parts, count); + if (ret) { + of_node_put(parts); + goto destroy_items; + } + } while (parts); + + return ret; + +destroy_items: + mtd_virt_concat_destroy_items(); + + return ret; +} + +/** + * mtd_virt_concat_create_join - Create and register the concatenated + * MTD device. + * + * Return: 0 on success, -error otherwise. + */ +int mtd_virt_concat_create_join(void) +{ + struct mtd_virt_concat_node *item; + struct mtd_concat *concat; + struct mtd_info *mtd; + ssize_t name_sz; + int ret, idx; + char *name; + + list_for_each_entry(item, &concat_node_list, head) { + concat = item->concat; + /* + * Check if item->count != concat->num_subdev, it indicates + * that the MTD information for all devices included in the + * concatenation are not handy, concat MTD device can't be + * created hence switch to next concat device. + */ + if (item->count != concat->num_subdev) { + continue; + } else { + /* Calculate the legth of the name of the virtual device */ + for (idx = 0, name_sz = 0; idx < concat->num_subdev; idx++) + name_sz += (strlen(concat->subdev[idx]->name) + 1); + name_sz += strlen(CONCAT_POSTFIX); + name = kmalloc(name_sz + 1, GFP_KERNEL); + if (!name) { + mtd_virt_concat_put_mtd_devices(concat); + return -ENOMEM; + } + + ret = 0; + for (idx = 0; idx < concat->num_subdev; idx++) { + ret += sprintf((name + ret), "%s-", + concat->subdev[idx]->name); + } + sprintf((name + ret), CONCAT_POSTFIX); + + if (concat->mtd.name) { + ret = memcmp(concat->mtd.name, name, name_sz); + if (ret == 0) + continue; + } + mtd = mtd_concat_create(concat->subdev, concat->num_subdev, name); + if (!mtd) { + kfree(name); + return -ENXIO; + } + concat->mtd = *mtd; + /* Arbitrary set the first device as parent */ + concat->mtd.dev.parent = concat->subdev[0]->dev.parent; + concat->mtd.dev = concat->subdev[0]->dev; + + /* Add the mtd device */ + ret = add_mtd_device(&concat->mtd); + if (ret) + goto destroy_concat; + } + } + + return 0; + +destroy_concat: + mtd_concat_destroy(mtd); + + return ret; +} diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 64808493b4f5..576537774628 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -34,6 +34,7 @@ #include #include +#include #include "mtdcore.h" @@ -1120,6 +1121,12 @@ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types, goto out; } + if (IS_REACHABLE(CONFIG_MTD_VIRT_CONCAT)) { + ret = mtd_virt_concat_node_create(); + if (ret < 0) + goto out; + } + /* Prefer parsed partitions over driver-provided fallback */ ret = parse_mtd_partitions(mtd, types, parser_data); if (ret == -EPROBE_DEFER) @@ -1137,6 +1144,11 @@ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types, if (ret) goto out; + if (IS_REACHABLE(CONFIG_MTD_VIRT_CONCAT)) { + ret = mtd_virt_concat_create_join(); + if (ret < 0) + goto out; + } /* * FIXME: some drivers unfortunately call this function more than once. * So we have to check if we've already assigned the reboot notifier. @@ -1186,6 +1198,11 @@ int mtd_device_unregister(struct mtd_info *master) nvmem_unregister(master->otp_user_nvmem); nvmem_unregister(master->otp_factory_nvmem); + if (IS_REACHABLE(CONFIG_MTD_VIRT_CONCAT)) { + err = mtd_virt_concat_destroy(master); + if (err) + return err; + } err = del_mtd_partitions(master); if (err) return err; @@ -2621,6 +2638,10 @@ err_reg: static void __exit cleanup_mtd(void) { + if (IS_REACHABLE(CONFIG_MTD_VIRT_CONCAT)) { + mtd_virt_concat_destroy_joins(); + mtd_virt_concat_destroy_items(); + } debugfs_remove_recursive(dfs_dir_mtd); cleanup_mtdchar(); if (proc_mtd) diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index e016cfbc7224..795a94e6b482 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "mtdcore.h" @@ -409,6 +410,11 @@ int add_mtd_partitions(struct mtd_info *parent, goto err_del_partitions; } + if (IS_REACHABLE(CONFIG_MTD_VIRT_CONCAT)) { + if (mtd_virt_concat_add(child)) + continue; + } + mutex_lock(&master->master.partitions_lock); list_add_tail(&child->part.node, &parent->partitions); mutex_unlock(&master->master.partitions_lock); diff --git a/include/linux/mtd/concat.h b/include/linux/mtd/concat.h index b42d9af87c4e..2cd9d48958a8 100644 --- a/include/linux/mtd/concat.h +++ b/include/linux/mtd/concat.h @@ -28,5 +28,54 @@ struct mtd_info *mtd_concat_create( void mtd_concat_destroy(struct mtd_info *mtd); -#endif +/** + * mtd_virt_concat_node_create - Create a component for concatenation + * + * Returns a positive number representing the no. of devices found for + * concatenation, or a negative error code. + * + * List all the devices for concatenations found in DT and create a + * component for concatenation. + */ +int mtd_virt_concat_node_create(void); + +/** + * mtd_virt_concat_add - add mtd_info object to the list of subdevices for concatenation + * @mtd: pointer to new MTD device info structure + * + * Returns true if the mtd_info object is added successfully else returns false. + * + * The mtd_info object is added to the list of subdevices for concatenation. + * It returns true if a match is found, and false if all subdevices have + * already been added or if the mtd_info object does not match any of the + * intended MTD devices. + */ +bool mtd_virt_concat_add(struct mtd_info *mtd); +/** + * mtd_virt_concat_create_join - Create and register the concatenated MTD device + * + * Returns 0 on succes, or a negative error code. + * + * Creates and registers the concatenated MTD device + */ +int mtd_virt_concat_create_join(void); + +/** + * mtd_virt_concat_destroy - Remove the concat that includes a specific mtd device + * as one of its components. + * @mtd: pointer to MTD device info structure. + * + * Returns 0 on succes, or a negative error code. + * + * If the mtd_info object is part of a concatenated device, all other MTD devices + * within that concat are registered individually. The concatenated device is then + * removed, along with its concatenation component. + * + */ +int mtd_virt_concat_destroy(struct mtd_info *mtd); + +void mtd_virt_concat_destroy_joins(void); +void mtd_virt_concat_destroy_items(void); + +#endif -- cgit v1.2.3 From 43479bb3703f17da6cdfaa2a7f4b93db9c6908bc Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 5 Feb 2026 19:49:15 +0100 Subject: mtd: spinand: Clean the flags section Mention that we are declaring the main SPI NAND flags with a comment. Align the values with tabs. Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 6a024cf1c53a..58abd306ebe3 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -477,8 +477,9 @@ struct spinand_ecc_info { const struct mtd_ooblayout_ops *ooblayout; }; -#define SPINAND_HAS_QE_BIT BIT(0) -#define SPINAND_HAS_CR_FEAT_BIT BIT(1) +/* SPI NAND flags */ +#define SPINAND_HAS_QE_BIT BIT(0) +#define SPINAND_HAS_CR_FEAT_BIT BIT(1) #define SPINAND_HAS_PROG_PLANE_SELECT_BIT BIT(2) #define SPINAND_HAS_READ_PLANE_SELECT_BIT BIT(3) #define SPINAND_NO_RAW_ACCESS BIT(4) -- cgit v1.2.3 From 15c9ed1d8286dc0297f01347dc74f5a8cbc173de Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Tue, 24 Feb 2026 09:50:52 +0800 Subject: pppoe: remove kernel-mode relay support The kernel-mode PPPoE relay feature and its two associated ioctls (PPPOEIOCSFWD and PPPOEIOCDFWD) are not used by any existing userspace PPPoE implementations. The most commonly-used package, RP-PPPoE [1], handles the relaying entirely in userspace. This legacy code has remained in the driver since its introduction in kernel 2.3.99-pre7 for over two decades, but has served no practical purpose. Remove the unused relay code. [1] https://dianne.skoll.ca/projects/rp-pppoe/ Signed-off-by: Qingfang Deng Acked-by: Arnd Bergmann Reviewed-by: Guillaume Nault Link: https://patch.msgid.link/20260224015053.42472-1-dqfext@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ppp/pppoe.c | 79 ------------------------------------------- drivers/net/ppp/pppox.c | 3 -- include/linux/if_pppox.h | 6 ---- include/uapi/linux/if_pppox.h | 10 ------ 4 files changed, 98 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 7900cc3212a5..1ac61c273b28 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -237,25 +237,6 @@ static inline struct pppox_sock *get_item(struct pppoe_net *pn, __be16 sid, return po; } -static inline struct pppox_sock *__get_item_by_addr(struct net *net, - struct sockaddr_pppox *sp) -{ - struct net_device *dev; - struct pppoe_net *pn; - struct pppox_sock *pppox_sock = NULL; - - int ifindex; - - dev = dev_get_by_name_rcu(net, sp->sa_addr.pppoe.dev); - if (dev) { - ifindex = dev->ifindex; - pn = pppoe_pernet(net); - pppox_sock = __get_item(pn, sp->sa_addr.pppoe.sid, - sp->sa_addr.pppoe.remote, ifindex); - } - return pppox_sock; -} - static inline void delete_item(struct pppoe_net *pn, __be16 sid, char *addr, int ifindex) { @@ -369,7 +350,6 @@ static struct notifier_block pppoe_notifier = { static int pppoe_rcv_core(struct sock *sk, struct sk_buff *skb) { struct pppox_sock *po = pppox_sk(sk); - struct pppox_sock *relay_po; /* Backlog receive. Semantics of backlog rcv preclude any code from * executing in lock_sock()/release_sock() bounds; meaning sk->sk_state @@ -378,17 +358,6 @@ static int pppoe_rcv_core(struct sock *sk, struct sk_buff *skb) if (sk->sk_state & PPPOX_BOUND) { ppp_input(&po->chan, skb); - } else if (sk->sk_state & PPPOX_RELAY) { - relay_po = __get_item_by_addr(sock_net(sk), - &po->pppoe_relay); - if (relay_po == NULL) - goto abort_kfree; - - if ((sk_pppox(relay_po)->sk_state & PPPOX_CONNECTED) == 0) - goto abort_kfree; - - if (!__pppoe_xmit(sk_pppox(relay_po), skb)) - goto abort_kfree; } else { if (sock_queue_rcv_skb(sk, skb)) goto abort_kfree; @@ -656,7 +625,6 @@ static int pppoe_connect(struct socket *sock, struct sockaddr_unsized *uservaddr po->pppoe_ifindex = 0; memset(&po->pppoe_pa, 0, sizeof(po->pppoe_pa)); - memset(&po->pppoe_relay, 0, sizeof(po->pppoe_relay)); memset(&po->chan, 0, sizeof(po->chan)); po->next = NULL; po->num = 0; @@ -783,53 +751,6 @@ static int pppoe_ioctl(struct socket *sock, unsigned int cmd, err = 0; break; - case PPPOEIOCSFWD: - { - struct pppox_sock *relay_po; - - err = -EBUSY; - if (sk->sk_state & (PPPOX_BOUND | PPPOX_DEAD)) - break; - - err = -ENOTCONN; - if (!(sk->sk_state & PPPOX_CONNECTED)) - break; - - /* PPPoE address from the user specifies an outbound - PPPoE address which frames are forwarded to */ - err = -EFAULT; - if (copy_from_user(&po->pppoe_relay, - (void __user *)arg, - sizeof(struct sockaddr_pppox))) - break; - - err = -EINVAL; - if (po->pppoe_relay.sa_family != AF_PPPOX || - po->pppoe_relay.sa_protocol != PX_PROTO_OE) - break; - - /* Check that the socket referenced by the address - actually exists. */ - rcu_read_lock(); - relay_po = __get_item_by_addr(sock_net(sk), &po->pppoe_relay); - rcu_read_unlock(); - if (!relay_po) - break; - - sk->sk_state |= PPPOX_RELAY; - err = 0; - break; - } - - case PPPOEIOCDFWD: - err = -EALREADY; - if (!(sk->sk_state & PPPOX_RELAY)) - break; - - sk->sk_state &= ~PPPOX_RELAY; - err = 0; - break; - default: err = -ENOTTY; } diff --git a/drivers/net/ppp/pppox.c b/drivers/net/ppp/pppox.c index 08364f10a43f..5861a2f6ce3e 100644 --- a/drivers/net/ppp/pppox.c +++ b/drivers/net/ppp/pppox.c @@ -102,9 +102,6 @@ EXPORT_SYMBOL(pppox_ioctl); #ifdef CONFIG_COMPAT int pppox_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { - if (cmd == PPPOEIOCSFWD32) - cmd = PPPOEIOCSFWD; - return pppox_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); } diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index db45d6f1c4f4..8bbf676c2a85 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -25,8 +25,6 @@ struct pppoe_opt { struct net_device *dev; /* device associated with socket*/ int ifindex; /* ifindex of device associated with socket */ struct pppoe_addr pa; /* what this socket is bound to*/ - struct sockaddr_pppox relay; /* what socket data will be - relayed to (PPPoE relaying) */ struct work_struct padt_work;/* Work item for handling PADT */ }; @@ -53,7 +51,6 @@ struct pppox_sock { #define pppoe_dev proto.pppoe.dev #define pppoe_ifindex proto.pppoe.ifindex #define pppoe_pa proto.pppoe.pa -#define pppoe_relay proto.pppoe.relay static inline struct pppox_sock *pppox_sk(struct sock *sk) { @@ -80,14 +77,11 @@ extern void pppox_unbind_sock(struct sock *sk);/* delete ppp-channel binding */ extern int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); extern int pppox_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); -#define PPPOEIOCSFWD32 _IOW(0xB1 ,0, compat_size_t) - /* PPPoX socket states */ enum { PPPOX_NONE = 0, /* initial state */ PPPOX_CONNECTED = 1, /* connection established ==TCP_ESTABLISHED */ PPPOX_BOUND = 2, /* bound to ppp device */ - PPPOX_RELAY = 4, /* forwarding is enabled */ PPPOX_DEAD = 16 /* dead, useless, please clean me up!*/ }; diff --git a/include/uapi/linux/if_pppox.h b/include/uapi/linux/if_pppox.h index 29b804aa7474..7ae044d71fb7 100644 --- a/include/uapi/linux/if_pppox.h +++ b/include/uapi/linux/if_pppox.h @@ -103,16 +103,6 @@ struct sockaddr_pppol2tpv3in6 { struct pppol2tpv3in6_addr pppol2tp; } __packed; -/********************************************************************* - * - * ioctl interface for defining forwarding of connections - * - ********************************************************************/ - -#define PPPOEIOCSFWD _IOW(0xB1 ,0, size_t) -#define PPPOEIOCDFWD _IO(0xB1 ,1) -/*#define PPPOEIOCGFWD _IOWR(0xB1,2, size_t)*/ - /* Codes to identify message types */ #define PADI_CODE 0x09 #define PADO_CODE 0x07 -- cgit v1.2.3 From 1ae2f435350ec05224a39995c3a680aa6fdae5a5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 23 Feb 2026 16:29:37 +0100 Subject: ACPI: x86: cmos_rtc: Create a CMOS RTC platform device Make the CMOS RTC ACPI scan handler create a platform device that will be used subsequently by rtc-cmos for driver binding on x86 systems with ACPI and update add_rtc_cmos() to skip registering a fallback platform device for the CMOS RTC when the above one has been registered. Signed-off-by: Rafael J. Wysocki Acked-by: Dave Hansen # x86 Link: https://patch.msgid.link/1962427.tdWV9SEqCh@rafael.j.wysocki --- arch/x86/kernel/rtc.c | 4 ++++ drivers/acpi/x86/cmos_rtc.c | 8 ++++++++ include/linux/acpi.h | 4 ++++ 3 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 51a849a79c98..b112178e8185 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -2,6 +2,7 @@ /* * RTC related functions */ +#include #include #include #include @@ -146,6 +147,9 @@ static __init int add_rtc_cmos(void) } } #endif + if (cmos_rtc_platform_device_present) + return 0; + if (!x86_platform.legacy.rtc) return -ENODEV; diff --git a/drivers/acpi/x86/cmos_rtc.c b/drivers/acpi/x86/cmos_rtc.c index 45db7e51cbe6..bdd66dfd4a44 100644 --- a/drivers/acpi/x86/cmos_rtc.c +++ b/drivers/acpi/x86/cmos_rtc.c @@ -24,6 +24,8 @@ static const struct acpi_device_id acpi_cmos_rtc_ids[] = { {} }; +bool cmos_rtc_platform_device_present; + static bool cmos_rtc_space_handler_present __read_mostly; static acpi_status acpi_cmos_rtc_space_handler(u32 function, @@ -103,6 +105,12 @@ static int acpi_cmos_rtc_attach(struct acpi_device *adev, if (ret < 0) return ret; + if (IS_ERR_OR_NULL(acpi_create_platform_device(adev, NULL))) { + pr_err("Failed to create CMOS-RTC platform device\n"); + return 0; + } else { + cmos_rtc_platform_device_present = true; + } return 1; } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4d2f0bed7a06..2bdb801cee01 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -791,6 +791,8 @@ const char *acpi_get_subsystem_id(acpi_handle handle); int acpi_mrrm_max_mem_region(void); #endif +extern bool cmos_rtc_platform_device_present; + #else /* !CONFIG_ACPI */ #define acpi_disabled 1 @@ -1116,6 +1118,8 @@ static inline int acpi_mrrm_max_mem_region(void) return 1; } +#define cmos_rtc_platform_device_present false + #endif /* !CONFIG_ACPI */ #ifdef CONFIG_ACPI_HMAT -- cgit v1.2.3 From 2a78e42104444f948698f1225deaf515e9b7224d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 23 Feb 2026 16:30:21 +0100 Subject: ACPI: x86/rtc-cmos: Use platform device for driver binding Modify the rtc-cmos driver to bind to a platform device on systems with ACPI via acpi_match_table and advertise the CMOST RTC ACPI device IDs for driver auto-loading. Note that adding the requisite device IDs to it and exposing them via MODULE_DEVICE_TABLE() is sufficient for this purpose. Since the ACPI device IDs in question are the same as for the CMOS RTC ACPI scan handler, put them into a common header file and use the definition from there in both places. Additionally, to prevent a PNP device from being created for the CMOS RTC if a platform one is present already, make is_cmos_rtc_device() check cmos_rtc_platform_device_present introduced previously. Signed-off-by: Rafael J. Wysocki Acked-by: Alexandre Belloni Link: https://patch.msgid.link/13969123.uLZWGnKmhe@rafael.j.wysocki --- drivers/acpi/acpi_pnp.c | 2 +- drivers/acpi/x86/cmos_rtc.c | 5 +---- drivers/rtc/rtc-cmos.c | 10 ++++++++++ include/linux/acpi.h | 6 ++++++ 4 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/acpi_pnp.c b/drivers/acpi/acpi_pnp.c index 85d9f78619a2..4ad8f56d1a5d 100644 --- a/drivers/acpi/acpi_pnp.c +++ b/drivers/acpi/acpi_pnp.c @@ -368,7 +368,7 @@ static int is_cmos_rtc_device(struct acpi_device *adev) { "PNP0B02" }, {""}, }; - return !acpi_match_device_ids(adev, ids); + return !cmos_rtc_platform_device_present && !acpi_match_device_ids(adev, ids); } bool acpi_is_pnp_device(struct acpi_device *adev) diff --git a/drivers/acpi/x86/cmos_rtc.c b/drivers/acpi/x86/cmos_rtc.c index bdd66dfd4a44..a6df5b991c96 100644 --- a/drivers/acpi/x86/cmos_rtc.c +++ b/drivers/acpi/x86/cmos_rtc.c @@ -18,10 +18,7 @@ #include "../internal.h" static const struct acpi_device_id acpi_cmos_rtc_ids[] = { - { "PNP0B00" }, - { "PNP0B01" }, - { "PNP0B02" }, - {} + ACPI_CMOS_RTC_IDS }; bool cmos_rtc_platform_device_present; diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 0743c6acd6e2..7457f42fd6f0 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -27,6 +27,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -1476,6 +1477,14 @@ static __init void cmos_of_init(struct platform_device *pdev) #else static inline void cmos_of_init(struct platform_device *pdev) {} #endif + +#ifdef CONFIG_ACPI +static const struct acpi_device_id acpi_cmos_rtc_ids[] = { + ACPI_CMOS_RTC_IDS +}; +MODULE_DEVICE_TABLE(acpi, acpi_cmos_rtc_ids); +#endif + /*----------------------------------------------------------------*/ /* Platform setup should have set up an RTC device, when PNP is @@ -1530,6 +1539,7 @@ static struct platform_driver cmos_platform_driver = { .name = driver_name, .pm = &cmos_pm_ops, .of_match_table = of_match_ptr(of_cmos_match), + .acpi_match_table = ACPI_PTR(acpi_cmos_rtc_ids), } }; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 2bdb801cee01..5ecdcdaf31aa 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -791,6 +791,12 @@ const char *acpi_get_subsystem_id(acpi_handle handle); int acpi_mrrm_max_mem_region(void); #endif +#define ACPI_CMOS_RTC_IDS \ + { "PNP0B00", }, \ + { "PNP0B01", }, \ + { "PNP0B02", }, \ + { "", } + extern bool cmos_rtc_platform_device_present; #else /* !CONFIG_ACPI */ -- cgit v1.2.3 From c8e9b1d9febc83ee94944695a07cfd40a1b29743 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 25 Feb 2026 21:12:20 -0800 Subject: dmaengine: fsl-edma: fix all kernel-doc warnings Use the correct kernel-doc format and struct member names to eliminate these kernel-doc warnings: Warning: include/linux/platform_data/dma-mcf-edma.h:35 struct member 'dma_channels' not described in 'mcf_edma_platform_data' Warning: include/linux/platform_data/dma-mcf-edma.h:35 struct member 'slave_map' not described in 'mcf_edma_platform_data' Warning: include/linux/platform_data/dma-mcf-edma.h:35 struct member 'slavecnt' not described in 'mcf_edma_platform_data' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260226051220.548566-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/platform_data/dma-mcf-edma.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/dma-mcf-edma.h b/include/linux/platform_data/dma-mcf-edma.h index d718ccfa3421..0b31af66a1ac 100644 --- a/include/linux/platform_data/dma-mcf-edma.h +++ b/include/linux/platform_data/dma-mcf-edma.h @@ -26,8 +26,9 @@ bool mcf_edma_filter_fn(struct dma_chan *chan, void *param); /** * struct mcf_edma_platform_data - platform specific data for eDMA engine * - * @ver The eDMA module version. - * @dma_channels The number of eDMA channels. + * @dma_channels: The number of eDMA channels. + * @slave_map: Slave device map + * @slavecnt: Number of entries in @slave_map */ struct mcf_edma_platform_data { int dma_channels; -- cgit v1.2.3 From b2d51bc1601c762c63f19c119589a0a0c44bc8ec Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 26 Feb 2026 10:20:23 +0100 Subject: gpio: generic: Don't use 'proxy' headers Update header inclusions to follow IWYU (Include What You Use) principle. Signed-off-by: Andy Shevchenko Reviewed-by: Linus Walleij Link: https://patch.msgid.link/20260226092023.4096921-1-andriy.shevchenko@linux.intel.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mmio.c | 4 +--- include/linux/gpio/generic.h | 8 +++++++- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpio-mmio.c b/drivers/gpio/gpio-mmio.c index edbcaad57d00..0941d034a49c 100644 --- a/drivers/gpio/gpio-mmio.c +++ b/drivers/gpio/gpio-mmio.c @@ -42,18 +42,16 @@ o ` ~~~~\___/~~~~ ` controller in FPGA is ,.` #include #include -#include #include -#include #include #include +#include #include #include #include #include #include #include -#include #include #include diff --git a/include/linux/gpio/generic.h b/include/linux/gpio/generic.h index ff566dc9c3cb..de43c06c83ef 100644 --- a/include/linux/gpio/generic.h +++ b/include/linux/gpio/generic.h @@ -3,9 +3,15 @@ #ifndef __LINUX_GPIO_GENERIC_H #define __LINUX_GPIO_GENERIC_H +#include +#include #include -#include +#include +#include #include +#include + +#include struct device; -- cgit v1.2.3 From fa4a3a95139e7293c1333a33bd7b19e7261e3bd0 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 23 Feb 2026 18:20:06 +0100 Subject: gpio: introduce a header for symbols shared by suppliers and consumers GPIO_LINE_DIRECTION_IN/OUT definitions are used both in supplier (GPIO controller drivers) as well as consumer code. In order to not force the consumers to include gpio/driver.h or - even worse - to redefine these values, create a new header file - gpio/defs.h - and move them over there. Include this header from both gpio/consumer.h and gpio/driver.h. Reviewed-by: Linus Walleij Suggested-by: Andy Shevchenko Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20260223172006.204268-1-bartosz.golaszewski@oss.qualcomm.com Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 2 ++ include/linux/gpio/defs.h | 9 +++++++++ include/linux/gpio/driver.h | 5 ++--- 3 files changed, 13 insertions(+), 3 deletions(-) create mode 100644 include/linux/gpio/defs.h (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 0d8408582918..3efb5cb1e1d1 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -6,6 +6,8 @@ #include #include +#include "defs.h" + struct acpi_device; struct device; struct fwnode_handle; diff --git a/include/linux/gpio/defs.h b/include/linux/gpio/defs.h new file mode 100644 index 000000000000..b69fd7c041b2 --- /dev/null +++ b/include/linux/gpio/defs.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __LINUX_GPIO_DEFS_H +#define __LINUX_GPIO_DEFS_H + +#define GPIO_LINE_DIRECTION_IN 1 +#define GPIO_LINE_DIRECTION_OUT 0 + +#endif /* __LINUX_GPIO_DEFS_H */ diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index fabe2baf7b50..5f5ddcbfa445 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -20,6 +20,8 @@ #include #endif +#include "defs.h" + struct device; struct irq_chip; struct irq_data; @@ -42,9 +44,6 @@ union gpio_irq_fwspec { #endif }; -#define GPIO_LINE_DIRECTION_IN 1 -#define GPIO_LINE_DIRECTION_OUT 0 - /** * struct gpio_irq_chip - GPIO interrupt controller */ -- cgit v1.2.3 From 0a93d30861617ecf207dcc4c6c736435fac36dae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:35:42 +0100 Subject: hrtimer: Provide a static branch based hrtimer_hres_enabled() The scheduler evaluates this via hrtimer_is_hres_active() every time it has to update HRTICK. This needs to follow three pointers, which is expensive. Provide a static branch based mechanism to avoid that. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.136503358@kernel.org --- include/linux/hrtimer.h | 13 +++++++++---- kernel/time/hrtimer.c | 28 +++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 74adbd4e7003..c9ca105ba009 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -153,17 +153,22 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer) } #ifdef CONFIG_HIGH_RES_TIMERS +extern unsigned int hrtimer_resolution; struct clock_event_device; extern void hrtimer_interrupt(struct clock_event_device *dev); -extern unsigned int hrtimer_resolution; +extern struct static_key_false hrtimer_highres_enabled_key; -#else +static inline bool hrtimer_highres_enabled(void) +{ + return static_branch_likely(&hrtimer_highres_enabled_key); +} +#else /* CONFIG_HIGH_RES_TIMERS */ #define hrtimer_resolution (unsigned int)LOW_RES_NSEC - -#endif +static inline bool hrtimer_highres_enabled(void) { return false; } +#endif /* !CONFIG_HIGH_RES_TIMERS */ static inline ktime_t __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3088db419aa6..67917ce696d4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -126,6 +126,25 @@ static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) return likely(base->online); } +#ifdef CONFIG_HIGH_RES_TIMERS +DEFINE_STATIC_KEY_FALSE(hrtimer_highres_enabled_key); + +static void hrtimer_hres_workfn(struct work_struct *work) +{ + static_branch_enable(&hrtimer_highres_enabled_key); +} + +static DECLARE_WORK(hrtimer_hres_work, hrtimer_hres_workfn); + +static inline void hrtimer_schedule_hres_work(void) +{ + if (!hrtimer_highres_enabled()) + schedule_work(&hrtimer_hres_work); +} +#else +static inline void hrtimer_schedule_hres_work(void) { } +#endif + /* * Functions and macros which are different for UP/SMP systems are kept in a * single place @@ -649,7 +668,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) } /* - * Is the high resolution mode active ? + * Is the high resolution mode active in the CPU base. This cannot use the + * static key as the CPUs are switched to high resolution mode + * asynchronously. */ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { @@ -750,6 +771,7 @@ static void hrtimer_switch_to_hres(void) tick_setup_sched_timer(true); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); + hrtimer_schedule_hres_work(); } #else @@ -947,11 +969,10 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, */ void clock_was_set(unsigned int bases) { - struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); cpumask_var_t mask; int cpu; - if (!hrtimer_hres_active(cpu_base) && !tick_nohz_is_active()) + if (!hrtimer_highres_enabled() && !tick_nohz_is_active()) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { @@ -962,6 +983,7 @@ void clock_was_set(unsigned int bases) /* Avoid interrupting CPUs if possible */ cpus_read_lock(); for_each_online_cpu(cpu) { + struct hrtimer_cpu_base *cpu_base; unsigned long flags; cpu_base = &per_cpu(hrtimer_bases, cpu); -- cgit v1.2.3 From c3a92213eb3dd8ea6f664d16a08eda800e34eaad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:35:47 +0100 Subject: sched: Use hrtimer_highres_enabled() Use the static branch based variant and thereby avoid following three pointers. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.203610956@kernel.org --- include/linux/hrtimer.h | 6 ------ kernel/sched/sched.h | 37 +++++++++---------------------------- 2 files changed, 9 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c9ca105ba009..b5003856fd60 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -146,12 +146,6 @@ static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) return ktime_sub(timer->node.expires, hrtimer_cb_get_time(timer)); } -static inline int hrtimer_is_hres_active(struct hrtimer *timer) -{ - return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? - timer->base->cpu_base->hres_active : 0; -} - #ifdef CONFIG_HIGH_RES_TIMERS extern unsigned int hrtimer_resolution; struct clock_event_device; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 73bc20c47631..0aa089dfaaa4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3019,25 +3019,19 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; * - enabled by features * - hrtimer is actually high res */ -static inline int hrtick_enabled(struct rq *rq) +static inline bool hrtick_enabled(struct rq *rq) { - if (!cpu_active(cpu_of(rq))) - return 0; - return hrtimer_is_hres_active(&rq->hrtick_timer); + return cpu_active(cpu_of(rq)) && hrtimer_highres_enabled(); } -static inline int hrtick_enabled_fair(struct rq *rq) +static inline bool hrtick_enabled_fair(struct rq *rq) { - if (!sched_feat(HRTICK)) - return 0; - return hrtick_enabled(rq); + return sched_feat(HRTICK) && hrtick_enabled(rq); } -static inline int hrtick_enabled_dl(struct rq *rq) +static inline bool hrtick_enabled_dl(struct rq *rq) { - if (!sched_feat(HRTICK_DL)) - return 0; - return hrtick_enabled(rq); + return sched_feat(HRTICK_DL) && hrtick_enabled(rq); } extern void hrtick_start(struct rq *rq, u64 delay); @@ -3047,22 +3041,9 @@ static inline bool hrtick_active(struct rq *rq) } #else /* !CONFIG_SCHED_HRTICK: */ - -static inline int hrtick_enabled_fair(struct rq *rq) -{ - return 0; -} - -static inline int hrtick_enabled_dl(struct rq *rq) -{ - return 0; -} - -static inline int hrtick_enabled(struct rq *rq) -{ - return 0; -} - +static inline bool hrtick_enabled_fair(struct rq *rq) { return false; } +static inline bool hrtick_enabled_dl(struct rq *rq) { return false; } +static inline bool hrtick_enabled(struct rq *rq) { return false; } #endif /* !CONFIG_SCHED_HRTICK */ #ifndef arch_scale_freq_tick -- cgit v1.2.3 From b7dd64778aa3f89de9afa1e81171cfe110ddc525 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Feb 2026 17:36:01 +0100 Subject: hrtimer: Provide LAZY_REARM mode The hrtick timer is frequently rearmed before expiry and most of the time the new expiry is past the armed one. As this happens on every context switch it becomes expensive with scheduling heavy work loads especially in virtual machines as the "hardware" reprogamming implies a VM exit. Add a lazy rearm mode flag which skips the reprogamming if: 1) The timer was the first expiring timer before the rearm 2) The new expiry time is farther out than the armed time This avoids a massive amount of reprogramming operations of the hrtick timer for the price of eventually taking the alredy armed interrupt for nothing. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.408524456@kernel.org --- include/linux/hrtimer.h | 8 ++++++++ include/linux/hrtimer_types.h | 3 +++ kernel/time/hrtimer.c | 17 ++++++++++++++++- 3 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index b5003856fd60..c924bb2498db 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -31,6 +31,13 @@ * soft irq context * HRTIMER_MODE_HARD - Timer callback function will be executed in * hard irq context even on PREEMPT_RT. + * HRTIMER_MODE_LAZY_REARM - Avoid reprogramming if the timer was the + * first expiring timer and is moved into the + * future. Special mode for the HRTICK timer to + * avoid extensive reprogramming of the hardware, + * which is expensive in virtual machines. Risks + * a pointless expiry, but that's better than + * reprogramming on every context switch, */ enum hrtimer_mode { HRTIMER_MODE_ABS = 0x00, @@ -38,6 +45,7 @@ enum hrtimer_mode { HRTIMER_MODE_PINNED = 0x02, HRTIMER_MODE_SOFT = 0x04, HRTIMER_MODE_HARD = 0x08, + HRTIMER_MODE_LAZY_REARM = 0x10, HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED, HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED, diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index 8fbbb6bdf7a1..64381c64cdbd 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -33,6 +33,8 @@ enum hrtimer_restart { * @is_soft: Set if hrtimer will be expired in soft interrupt context. * @is_hard: Set if hrtimer will be expired in hard interrupt context * even on RT. + * @is_lazy: Set if the timer is frequently rearmed to avoid updates + * of the clock event device * * The hrtimer structure must be initialized by hrtimer_setup() */ @@ -45,6 +47,7 @@ struct hrtimer { u8 is_rel; u8 is_soft; u8 is_hard; + u8 is_lazy; }; #endif /* _LINUX_HRTIMER_TYPES_H */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 67917ce696d4..e54f8b59f6b4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1152,7 +1152,7 @@ static void __remove_hrtimer(struct hrtimer *timer, * an superfluous call to hrtimer_force_reprogram() on the * remote cpu later on if the same timer gets enqueued again. */ - if (reprogram && timer == cpu_base->next_timer) + if (reprogram && timer == cpu_base->next_timer && !timer->is_lazy) hrtimer_force_reprogram(cpu_base, 1); } @@ -1321,6 +1321,20 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, return 0; } + /* + * Special case for the HRTICK timer. It is frequently rearmed and most + * of the time moves the expiry into the future. That's expensive in + * virtual machines and it's better to take the pointless already armed + * interrupt than reprogramming the hardware on every context switch. + * + * If the new expiry is before the armed time, then reprogramming is + * required. + */ + if (timer->is_lazy) { + if (new_base->cpu_base->expires_next <= hrtimer_get_expires(timer)) + return 0; + } + /* * Timer was forced to stay on the current CPU to avoid * reprogramming on removal and enqueue. Force reprogram the @@ -1675,6 +1689,7 @@ static void __hrtimer_setup(struct hrtimer *timer, base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; timer->is_hard = !!(mode & HRTIMER_MODE_HARD); + timer->is_lazy = !!(mode & HRTIMER_MODE_LAZY_REARM); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); -- cgit v1.2.3 From 70802807398c65f5a49b2baec87e1f6c8db43de6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:36:15 +0100 Subject: clockevents: Remove redundant CLOCK_EVT_FEAT_KTIME The only real usecase for this is the hrtimer based broadcast device. No point in using two different feature flags for this. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.609049777@kernel.org --- include/linux/clockchips.h | 1 - kernel/time/clockevents.c | 4 ++-- kernel/time/tick-broadcast-hrtimer.c | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index b0df28ddd394..5e8f7819f6a6 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -45,7 +45,6 @@ enum clock_event_state { */ # define CLOCK_EVT_FEAT_PERIODIC 0x000001 # define CLOCK_EVT_FEAT_ONESHOT 0x000002 -# define CLOCK_EVT_FEAT_KTIME 0x000004 /* * x86(64) specific (mis)features: diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index eaae1ce9f060..5abaeef08e6a 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -319,8 +319,8 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", clockevent_get_state(dev)); - /* Shortcut for clockevent devices that can deal with ktime. */ - if (dev->features & CLOCK_EVT_FEAT_KTIME) + /* ktime_t based reprogramming for the broadcast hrtimer device */ + if (unlikely(dev->features & CLOCK_EVT_FEAT_HRTIMER)) return dev->set_next_ktime(expires, dev); delta = ktime_to_ns(ktime_sub(expires, ktime_get())); diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index a88b72b0f35e..51f6a1032c83 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -78,7 +78,6 @@ static struct clock_event_device ce_broadcast_hrtimer = { .set_state_shutdown = bc_shutdown, .set_next_ktime = bc_set_next, .features = CLOCK_EVT_FEAT_ONESHOT | - CLOCK_EVT_FEAT_KTIME | CLOCK_EVT_FEAT_HRTIMER, .rating = 0, .bound_on = -1, -- cgit v1.2.3 From 2e27beeb66e43f3b84aef5a07e486a5d50695c06 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:36:20 +0100 Subject: timekeeping: Allow inlining clocksource::read() On some architectures clocksource::read() boils down to a single instruction, so the indirect function call is just a massive overhead especially with speculative execution mitigations in effect. Allow architectures to enable conditional inlining of that read to avoid that by: - providing a static branch to switch to the inlined variant - disabling the branch before clocksource changes - enabling the branch after a clocksource change, when the clocksource indicates in a feature flag that it is the one which provides the inlined variant This is intentionally not a static call as that would only remove the indirect call, but not the rest of the overhead. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.675151545@kernel.org --- include/linux/clocksource.h | 2 ++ kernel/time/Kconfig | 3 ++ kernel/time/timekeeping.c | 74 +++++++++++++++++++++++++++++++++------------ 3 files changed, 60 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 65b7c41471c3..54366d5c4d19 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -149,6 +149,8 @@ struct clocksource { #define CLOCK_SOURCE_SUSPEND_NONSTOP 0x80 #define CLOCK_SOURCE_RESELECT 0x100 #define CLOCK_SOURCE_VERIFY_PERCPU 0x200 +#define CLOCK_SOURCE_CAN_INLINE_READ 0x400 + /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 7c6a52f7836c..07b048ba0cca 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -17,6 +17,9 @@ config ARCH_CLOCKSOURCE_DATA config ARCH_CLOCKSOURCE_INIT bool +config ARCH_WANTS_CLOCKSOURCE_READ_INLINE + bool + # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 91fa2003351c..63aa31f02ebc 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -3,34 +3,30 @@ * Kernel timekeeping code and accessor functions. Based on code from * timer.c, moved in commit 8524070b7982. */ -#include -#include -#include +#include +#include +#include +#include #include -#include -#include -#include +#include #include -#include -#include +#include +#include #include +#include +#include +#include #include -#include -#include +#include #include #include -#include -#include -#include -#include -#include -#include +#include #include #include "tick-internal.h" -#include "ntp_internal.h" #include "timekeeping_internal.h" +#include "ntp_internal.h" #define TK_CLEAR_NTP (1 << 0) #define TK_CLOCK_WAS_SET (1 << 1) @@ -275,6 +271,11 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); } +#ifdef CONFIG_ARCH_WANTS_CLOCKSOURCE_READ_INLINE +#include + +static DEFINE_STATIC_KEY_FALSE(clocksource_read_inlined); + /* * tk_clock_read - atomic clocksource read() helper * @@ -288,13 +289,36 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) * a read of the fast-timekeeper tkrs (which is protected by its own locking * and update logic). */ -static inline u64 tk_clock_read(const struct tk_read_base *tkr) +static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr) { struct clocksource *clock = READ_ONCE(tkr->clock); + if (static_branch_likely(&clocksource_read_inlined)) + return arch_inlined_clocksource_read(clock); + return clock->read(clock); } +static inline void clocksource_disable_inline_read(void) +{ + static_branch_disable(&clocksource_read_inlined); +} + +static inline void clocksource_enable_inline_read(void) +{ + static_branch_enable(&clocksource_read_inlined); +} +#else +static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr) +{ + struct clocksource *clock = READ_ONCE(tkr->clock); + + return clock->read(clock); +} +static inline void clocksource_disable_inline_read(void) { } +static inline void clocksource_enable_inline_read(void) { } +#endif + /** * tk_setup_internals - Set up internals to use clocksource clock. * @@ -375,7 +399,7 @@ static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta) return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift); } -static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) +static __always_inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { /* Calculate the delta since the last update_wall_time() */ u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask; @@ -1631,7 +1655,19 @@ int timekeeping_notify(struct clocksource *clock) if (tk->tkr_mono.clock == clock) return 0; + + /* Disable inlined reads accross the clocksource switch */ + clocksource_disable_inline_read(); + stop_machine(change_clocksource, clock, NULL); + + /* + * If the clocksource has been selected and supports inlined reads + * enable the branch. + */ + if (tk->tkr_mono.clock == clock && clock->flags & CLOCK_SOURCE_CAN_INLINE_READ) + clocksource_enable_inline_read(); + tick_clock_notify(); return tk->tkr_mono.clock == clock ? 0 : -1; } -- cgit v1.2.3 From cd38bdb8e696a1a1eb12fc6662a6e420977aacfd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:36:40 +0100 Subject: timekeeping: Provide infrastructure for coupled clockevents Some architectures have clockevent devices which are coupled to the system clocksource by implementing a less than or equal comparator which compares the programmed absolute expiry time against the underlying time counter. Well known examples are TSC/TSC deadline timer and the S390 TOD clocksource/comparator. While the concept is nice it has some downsides: 1) The clockevents core code is strictly based on relative expiry times as that's the most common case for clockevent device hardware. That requires to convert the absolute expiry time provided by the caller (hrtimers, NOHZ code) to a relative expiry time by reading and substracting the current time. The clockevent::set_next_event() callback must then read the counter again to convert the relative expiry back into a absolute one. 2) The conversion factors from nanoseconds to counter clock cycles are set up when the clockevent is registered. When NTP applies corrections then the clockevent conversion factors can deviate from the clocksource conversion substantially which either results in timers firing late or in the worst case early. The early expiry then needs to do a reprogam with a short delta. In most cases this is papered over by the fact that the read in the set_next_event() callback happens after the read which is used to calculate the delta. So the tendency is that timers expire mostly late. All of this can be avoided by providing support for these devices in the core code: 1) The timekeeping core keeps track of the last update to the clocksource by storing the base nanoseconds and the corresponding clocksource counter value. That's used to keep the conversion math for reading the time within 64-bit in the common case. This information can be used to avoid both reads of the underlying clocksource in the clockevents reprogramming path: delta = expiry - base_ns; cycles = base_cycles + ((delta * clockevent::mult) >> clockevent::shift); The resulting cycles value can be directly used to program the comparator. 2) As #1 does not longer provide the "compensation" through the second read the deviation of the clocksource and clockevent conversions caused by NTP become more prominent. This can be cured by letting the timekeeping core compute and store the reverse conversion factors when the clocksource cycles to nanoseconds factors are modified by NTP: CS::MULT (1 << NS_TO_CYC_SHIFT) --------------- = ---------------------- (1 << CS:SHIFT) NS_TO_CYC_MULT Ergo: NS_TO_CYC_MULT = (1 << (CS::SHIFT + NS_TO_CYC_SHIFT)) / CS::MULT The NS_TO_CYC_SHIFT value is calculated when the clocksource is installed so that it aims for a one hour maximum sleep time. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.944763521@kernel.org --- include/linux/clocksource.h | 1 + include/linux/timekeeper_internal.h | 8 +++ kernel/time/Kconfig | 3 + kernel/time/timekeeping.c | 110 ++++++++++++++++++++++++++++++++++++ kernel/time/timekeeping.h | 2 + 5 files changed, 124 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 54366d5c4d19..25774fc5b53d 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -150,6 +150,7 @@ struct clocksource { #define CLOCK_SOURCE_RESELECT 0x100 #define CLOCK_SOURCE_VERIFY_PERCPU 0x200 #define CLOCK_SOURCE_CAN_INLINE_READ 0x400 +#define CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT 0x800 /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index b8ae89ea28ab..e36d11e33e0c 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -72,6 +72,10 @@ struct tk_read_base { * @id: The timekeeper ID * @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds + * @cs_id: The ID of the current clocksource + * @cs_ns_to_cyc_mult: Multiplicator for nanoseconds to cycles conversion + * @cs_ns_to_cyc_shift: Shift value for nanoseconds to cycles conversion + * @cs_ns_to_cyc_maxns: Maximum nanoseconds to cyles conversion range * @clock_was_set_seq: The sequence number of clock was set events * @cs_was_changed_seq: The sequence number of clocksource change events * @clock_valid: Indicator for valid clock @@ -159,6 +163,10 @@ struct timekeeper { u64 raw_sec; /* Cachline 3 and 4 (timekeeping internal variables): */ + enum clocksource_ids cs_id; + u32 cs_ns_to_cyc_mult; + u32 cs_ns_to_cyc_shift; + u64 cs_ns_to_cyc_maxns; unsigned int clock_was_set_seq; u8 cs_was_changed_seq; u8 clock_valid; diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 07b048ba0cca..b51bc5625129 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -47,6 +47,9 @@ config GENERIC_CLOCKEVENTS_BROADCAST_IDLE config GENERIC_CLOCKEVENTS_MIN_ADJUST bool +config GENERIC_CLOCKEVENTS_COUPLED + bool + # Generic update of CMOS clock config GENERIC_CMOS_UPDATE bool diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 63aa31f02ebc..b7a0f93011e0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -391,6 +391,20 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) tk->tkr_raw.mult = clock->mult; tk->ntp_err_mult = 0; tk->skip_second_overflow = 0; + + tk->cs_id = clock->id; + + /* Coupled clockevent data */ + if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) && + clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT) { + /* + * Aim for an one hour maximum delta and use KHz to handle + * clocksources with a frequency above 4GHz correctly as + * the frequency argument of clocks_calc_mult_shift() is u32. + */ + clocks_calc_mult_shift(&tk->cs_ns_to_cyc_mult, &tk->cs_ns_to_cyc_shift, + NSEC_PER_MSEC, clock->freq_khz, 3600 * 1000); + } } /* Timekeeper helper functions. */ @@ -720,6 +734,36 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } +static inline void tk_update_ns_to_cyc(struct timekeeper *tks, struct timekeeper *tkc) +{ + struct tk_read_base *tkrs = &tks->tkr_mono; + struct tk_read_base *tkrc = &tkc->tkr_mono; + unsigned int shift; + + if (!IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) || + !(tkrs->clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT)) + return; + + if (tkrs->mult == tkrc->mult && tkrs->shift == tkrc->shift) + return; + /* + * The conversion math is simple: + * + * CS::MULT (1 << NS_TO_CYC_SHIFT) + * --------------- = ---------------------- + * (1 << CS:SHIFT) NS_TO_CYC_MULT + * + * Ergo: + * + * NS_TO_CYC_MULT = (1 << (CS::SHIFT + NS_TO_CYC_SHIFT)) / CS::MULT + * + * NS_TO_CYC_SHIFT has been set up in tk_setup_internals() + */ + shift = tkrs->shift + tks->cs_ns_to_cyc_shift; + tks->cs_ns_to_cyc_mult = (u32)div_u64(1ULL << shift, tkrs->mult); + tks->cs_ns_to_cyc_maxns = div_u64(tkrs->clock->mask, tks->cs_ns_to_cyc_mult); +} + /* * Restore the shadow timekeeper from the real timekeeper. */ @@ -754,6 +798,7 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; if (tk->id == TIMEKEEPER_CORE) { + tk_update_ns_to_cyc(tk, &tkd->timekeeper); update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); @@ -808,6 +853,71 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk_update_coarse_nsecs(tk); } +/* + * ktime_expiry_to_cycles - Convert a expiry time to clocksource cycles + * @id: Clocksource ID which is required for validity + * @expires_ns: Absolute CLOCK_MONOTONIC expiry time (nsecs) to be converted + * @cycles: Pointer to storage for corresponding absolute cycles value + * + * Convert a CLOCK_MONOTONIC based absolute expiry time to a cycles value + * based on the correlated clocksource of the clockevent device by using + * the base nanoseconds and cycles values of the last timekeeper update and + * converting the delta between @expires_ns and base nanoseconds to cycles. + * + * This only works for clockevent devices which are using a less than or + * equal comparator against the clocksource. + * + * Utilizing this avoids two clocksource reads for such devices, the + * ktime_get() in clockevents_program_event() to calculate the delta expiry + * value and the readout in the device::set_next_event() callback to + * convert the delta back to a absolute comparator value. + * + * Returns: True if @id matches the current clocksource ID, false otherwise + */ +bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct tk_read_base *tkrm = &tk->tkr_mono; + ktime_t base_ns, delta_ns, max_ns; + u64 base_cycles, delta_cycles; + unsigned int seq; + u32 mult, shift; + + /* + * Racy check to avoid the seqcount overhead when ID does not match. If + * the relevant clocksource is installed concurrently, then this will + * just delay the switch over to this mechanism until the next event is + * programmed. If the ID is not matching the clock events code will use + * the regular relative set_next_event() callback as before. + */ + if (data_race(tk->cs_id) != id) + return false; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + if (tk->cs_id != id) + return false; + + base_cycles = tkrm->cycle_last; + base_ns = tkrm->base + (tkrm->xtime_nsec >> tkrm->shift); + + mult = tk->cs_ns_to_cyc_mult; + shift = tk->cs_ns_to_cyc_shift; + max_ns = tk->cs_ns_to_cyc_maxns; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + /* Prevent negative deltas and multiplication overflows */ + delta_ns = min(expires_ns - base_ns, max_ns); + delta_ns = max(delta_ns, 0); + + /* Convert to cycles */ + delta_cycles = ((u64)delta_ns * mult) >> shift; + *cycles = base_cycles + delta_cycles; + return true; +} + /** * ktime_get_real_ts64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 543beba096c7..198d0608db74 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -9,6 +9,8 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_boot, ktime_t *offs_tai); +bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles); + extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); extern void timekeeping_warp_clock(void); -- cgit v1.2.3 From 89f951a1e8ad781e7ac70eccddab0e0c270485f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:36:45 +0100 Subject: clockevents: Provide support for clocksource coupled comparators Some clockevent devices are coupled to the system clocksource by implementing a less than or equal comparator which compares the programmed absolute expiry time against the underlying time counter. The timekeeping core provides a function to convert and absolute CLOCK_MONOTONIC based expiry time to a absolute clock cycles time which can be directly fed into the comparator. That spares two time reads in the next event progamming path, one to convert the absolute nanoseconds time to a delta value and the other to convert the delta value back to a absolute time value suitable for the comparator. Provide a new clocksource callback which takes the absolute cycle value and wire it up in clockevents_program_event(). Similar to clocksources allow architectures to inline the rearm operation. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163430.010425428@kernel.org --- include/linux/clockchips.h | 7 +++++-- kernel/time/Kconfig | 4 ++++ kernel/time/clockevents.c | 44 +++++++++++++++++++++++++++++++++++++++----- 3 files changed, 48 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 5e8f7819f6a6..92d90220c0d4 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -43,8 +43,9 @@ enum clock_event_state { /* * Clock event features */ -# define CLOCK_EVT_FEAT_PERIODIC 0x000001 -# define CLOCK_EVT_FEAT_ONESHOT 0x000002 +# define CLOCK_EVT_FEAT_PERIODIC 0x000001 +# define CLOCK_EVT_FEAT_ONESHOT 0x000002 +# define CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED 0x000004 /* * x86(64) specific (mis)features: @@ -100,6 +101,7 @@ struct clock_event_device { void (*event_handler)(struct clock_event_device *); int (*set_next_event)(unsigned long evt, struct clock_event_device *); int (*set_next_ktime)(ktime_t expires, struct clock_event_device *); + void (*set_next_coupled)(u64 cycles, struct clock_event_device *); ktime_t next_event; u64 max_delta_ns; u64 min_delta_ns; @@ -107,6 +109,7 @@ struct clock_event_device { u32 shift; enum clock_event_state state_use_accessors; unsigned int features; + enum clocksource_ids cs_id; unsigned long retries; int (*set_state_periodic)(struct clock_event_device *); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index b51bc5625129..e1968ab8b37f 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -50,6 +50,10 @@ config GENERIC_CLOCKEVENTS_MIN_ADJUST config GENERIC_CLOCKEVENTS_COUPLED bool +config GENERIC_CLOCKEVENTS_COUPLED_INLINE + select GENERIC_CLOCKEVENTS_COUPLED + bool + # Generic update of CMOS clock config GENERIC_CMOS_UPDATE bool diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 5abaeef08e6a..83712aa1d385 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -292,6 +292,38 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ +#ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED +#ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE +#include +#else +static __always_inline void +arch_inlined_clockevent_set_next_coupled(u64 u64 cycles, struct clock_event_device *dev) { } +#endif + +static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires) +{ + u64 cycles; + + if (unlikely(!(dev->features & CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED))) + return false; + + if (unlikely(!ktime_expiry_to_cycles(dev->cs_id, expires, &cycles))) + return false; + + if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE)) + arch_inlined_clockevent_set_next_coupled(cycles, dev); + else + dev->set_next_coupled(cycles, dev); + return true; +} + +#else +static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires) +{ + return false; +} +#endif + /** * clockevents_program_event - Reprogram the clock event device. * @dev: device to program @@ -300,11 +332,10 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) * * Returns 0 on success, -ETIME when the event is in the past. */ -int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, - bool force) +int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force) { - unsigned long long clc; int64_t delta; + u64 cycles; int rc; if (WARN_ON_ONCE(expires < 0)) @@ -323,6 +354,9 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, if (unlikely(dev->features & CLOCK_EVT_FEAT_HRTIMER)) return dev->set_next_ktime(expires, dev); + if (likely(clockevent_set_next_coupled(dev, expires))) + return 0; + delta = ktime_to_ns(ktime_sub(expires, ktime_get())); if (delta <= 0) return force ? clockevents_program_min_delta(dev) : -ETIME; @@ -330,8 +364,8 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, delta = min(delta, (int64_t) dev->max_delta_ns); delta = max(delta, (int64_t) dev->min_delta_ns); - clc = ((unsigned long long) delta * dev->mult) >> dev->shift; - rc = dev->set_next_event((unsigned long) clc, dev); + cycles = ((u64)delta * dev->mult) >> dev->shift; + rc = dev->set_next_event((unsigned long) cycles, dev); return (rc && force) ? clockevents_program_min_delta(dev) : rc; } -- cgit v1.2.3 From 7d27eafe54659d19cef10dab4520cbcdfb17b0e3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:37:18 +0100 Subject: hrtimer: Replace the bitfield in hrtimer_cpu_base Use bool for the various flags as that creates better code in the hot path. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163430.475262618@kernel.org --- include/linux/hrtimer_defs.h | 10 +++++----- kernel/time/hrtimer.c | 25 +++++++++++++------------ 2 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 02b010df6570..f9fbf9a48f59 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -83,11 +83,11 @@ struct hrtimer_cpu_base { unsigned int cpu; unsigned int active_bases; unsigned int clock_was_set_seq; - unsigned int hres_active : 1, - in_hrtirq : 1, - hang_detected : 1, - softirq_activated : 1, - online : 1; + bool hres_active; + bool in_hrtirq; + bool hang_detected; + bool softirq_activated; + bool online; #ifdef CONFIG_HIGH_RES_TIMERS unsigned int nr_events; unsigned short nr_retries; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e6f02e980371..3b80a4453ee6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -741,7 +741,7 @@ static void hrtimer_switch_to_hres(void) pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu); return; } - base->hres_active = 1; + base->hres_active = true; hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(true); @@ -1854,7 +1854,7 @@ static __latent_entropy void hrtimer_run_softirq(void) now = hrtimer_update_base(cpu_base); __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); - cpu_base->softirq_activated = 0; + cpu_base->softirq_activated = false; hrtimer_update_softirq_timer(cpu_base, true); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1881,7 +1881,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: - cpu_base->in_hrtirq = 1; + cpu_base->in_hrtirq = true; /* * Set expires_next to KTIME_MAX, which prevents that remote CPUs queue * timers while __hrtimer_run_queues() is expiring the clock bases. @@ -1892,7 +1892,7 @@ retry: if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; - cpu_base->softirq_activated = 1; + cpu_base->softirq_activated = true; raise_timer_softirq(HRTIMER_SOFTIRQ); } @@ -1905,12 +1905,12 @@ retry: * against it. */ cpu_base->expires_next = expires_next; - cpu_base->in_hrtirq = 0; + cpu_base->in_hrtirq = false; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); /* Reprogramming necessary ? */ if (!tick_program_event(expires_next, 0)) { - cpu_base->hang_detected = 0; + cpu_base->hang_detected = false; return; } @@ -1939,7 +1939,7 @@ retry: * time away. */ cpu_base->nr_hangs++; - cpu_base->hang_detected = 1; + cpu_base->hang_detected = true; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); delta = ktime_sub(now, entry_time); @@ -1987,7 +1987,7 @@ void hrtimer_run_queues(void) if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; - cpu_base->softirq_activated = 1; + cpu_base->softirq_activated = true; raise_timer_softirq(HRTIMER_SOFTIRQ); } @@ -2239,13 +2239,14 @@ int hrtimers_cpu_starting(unsigned int cpu) /* Clear out any left over state from a CPU down operation */ cpu_base->active_bases = 0; - cpu_base->hres_active = 0; - cpu_base->hang_detected = 0; + cpu_base->hres_active = false; + cpu_base->hang_detected = false; cpu_base->next_timer = NULL; cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; - cpu_base->online = 1; + cpu_base->softirq_activated = false; + cpu_base->online = true; return 0; } @@ -2303,7 +2304,7 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); raw_spin_unlock(&new_base->lock); - old_base->online = 0; + old_base->online = false; raw_spin_unlock(&old_base->lock); return 0; -- cgit v1.2.3 From 22f011be7aaa77ca8f502b9dd07b7334f9965d18 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:37:23 +0100 Subject: hrtimer: Convert state and properties to boolean All 'u8' flags are true booleans, so make it entirely clear that these can only contain true or false. This is especially true for hrtimer::state, which has a historical leftover of using the state with bitwise operations. That was used in the early hrtimer implementation with several bits, but then converted to a boolean state. But that conversion missed to replace the bit OR and bit check operations all over the place, which creates suboptimal code. As of today 'state' is a misnomer because it's only purpose is to reflect whether the timer is enqueued into the RB-tree or not. Rename it to 'is_queued' and make all operations on it boolean. This reduces text size from 8926 to 8732 bytes. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163430.542427240@kernel.org --- include/linux/hrtimer.h | 31 ++--------------------- include/linux/hrtimer_types.h | 12 ++++----- kernel/time/hrtimer.c | 58 +++++++++++++++++++++++++++++-------------- kernel/time/timer_list.c | 2 +- 4 files changed, 49 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c924bb2498db..4ad4a454b4c5 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -63,33 +63,6 @@ enum hrtimer_mode { HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD, }; -/* - * Values to track state of the timer - * - * Possible states: - * - * 0x00 inactive - * 0x01 enqueued into rbtree - * - * The callback state is not part of the timer->state because clearing it would - * mean touching the timer after the callback, this makes it impossible to free - * the timer from the callback function. - * - * Therefore we track the callback state in: - * - * timer->base->cpu_base->running == timer - * - * On SMP it is possible to have a "callback function running and enqueued" - * status. It happens for example when a posix timer expired and the callback - * queued a signal. Between dropping the lock which protects the posix timer - * and reacquiring the base lock of the hrtimer, another CPU can deliver the - * signal and rearm the timer. - * - * All state transitions are protected by cpu_base->lock. - */ -#define HRTIMER_STATE_INACTIVE 0x00 -#define HRTIMER_STATE_ENQUEUED 0x01 - /** * struct hrtimer_sleeper - simple sleeper structure * @timer: embedded timer structure @@ -300,8 +273,8 @@ extern bool hrtimer_active(const struct hrtimer *timer); */ static inline bool hrtimer_is_queued(struct hrtimer *timer) { - /* The READ_ONCE pairs with the update functions of timer->state */ - return !!(READ_ONCE(timer->state) & HRTIMER_STATE_ENQUEUED); + /* The READ_ONCE pairs with the update functions of timer->is_queued */ + return READ_ONCE(timer->is_queued); } /* diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index 64381c64cdbd..0e22bc91d00f 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -28,7 +28,7 @@ enum hrtimer_restart { * was armed. * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) - * @state: state information (See bit values above) + * @is_queued: Indicates whether a timer is enqueued or not * @is_rel: Set if the timer was armed relative * @is_soft: Set if hrtimer will be expired in soft interrupt context. * @is_hard: Set if hrtimer will be expired in hard interrupt context @@ -43,11 +43,11 @@ struct hrtimer { ktime_t _softexpires; enum hrtimer_restart (*__private function)(struct hrtimer *); struct hrtimer_clock_base *base; - u8 state; - u8 is_rel; - u8 is_soft; - u8 is_hard; - u8 is_lazy; + bool is_queued; + bool is_rel; + bool is_soft; + bool is_hard; + bool is_lazy; }; #endif /* _LINUX_HRTIMER_TYPES_H */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3b80a4453ee6..6bab3b7eb0de 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -49,6 +49,28 @@ #include "tick-internal.h" +/* + * Constants to set the queued state of the timer (INACTIVE, ENQUEUED) + * + * The callback state is kept separate in the CPU base because having it in + * the timer would required touching the timer after the callback, which + * makes it impossible to free the timer from the callback function. + * + * Therefore we track the callback state in: + * + * timer->base->cpu_base->running == timer + * + * On SMP it is possible to have a "callback function running and enqueued" + * status. It happens for example when a posix timer expired and the callback + * queued a signal. Between dropping the lock which protects the posix timer + * and reacquiring the base lock of the hrtimer, another CPU can deliver the + * signal and rearm the timer. + * + * All state transitions are protected by cpu_base->lock. + */ +#define HRTIMER_STATE_INACTIVE false +#define HRTIMER_STATE_ENQUEUED true + /* * The resolution of the clocks. The resolution value is returned in * the clock_getres() system call to give application programmers an @@ -1038,7 +1060,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) if (delta < 0) return 0; - if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) + if (WARN_ON(timer->is_queued)) return 0; if (interval < hrtimer_resolution) @@ -1082,7 +1104,7 @@ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *ba base->cpu_base->active_bases |= 1 << base->index; /* Pairs with the lockless read in hrtimer_is_queued() */ - WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); + WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } @@ -1096,18 +1118,18 @@ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *ba * anyway (e.g. timer interrupt) */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - u8 newstate, bool reprogram) + bool newstate, bool reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - u8 state = timer->state; lockdep_assert_held(&cpu_base->lock); - /* Pairs with the lockless read in hrtimer_is_queued() */ - WRITE_ONCE(timer->state, newstate); - if (!(state & HRTIMER_STATE_ENQUEUED)) + if (!timer->is_queued) return; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->is_queued, newstate); + if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); @@ -1127,11 +1149,11 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *b static inline bool remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart, bool keep_local) { - u8 state = timer->state; + bool queued_state = timer->is_queued; lockdep_assert_held(&base->cpu_base->lock); - if (state & HRTIMER_STATE_ENQUEUED) { + if (queued_state) { bool reprogram; debug_hrtimer_deactivate(timer); @@ -1153,11 +1175,11 @@ static inline bool remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_ba * and a moment later when it's requeued). */ if (!restart) - state = HRTIMER_STATE_INACTIVE; + queued_state = HRTIMER_STATE_INACTIVE; else reprogram &= !keep_local; - __remove_hrtimer(timer, base, state, reprogram); + __remove_hrtimer(timer, base, queued_state, reprogram); return true; } return false; @@ -1704,7 +1726,7 @@ bool hrtimer_active(const struct hrtimer *timer) base = READ_ONCE(timer->base); seq = raw_read_seqcount_begin(&base->seq); - if (timer->state != HRTIMER_STATE_INACTIVE || base->running == timer) + if (timer->is_queued || base->running == timer) return true; } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base)); @@ -1721,7 +1743,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active); * - callback: the timer is being ran * - post: the timer is inactive or (re)queued * - * On the read side we ensure we observe timer->state and cpu_base->running + * On the read side we ensure we observe timer->is_queued and cpu_base->running * from the same section, if anything changed while we looked at it, we retry. * This includes timer->base changing because sequence numbers alone are * insufficient for that. @@ -1744,11 +1766,11 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_cloc base->running = timer; /* - * Separate the ->running assignment from the ->state assignment. + * Separate the ->running assignment from the ->is_queued assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running == NULL && - * timer->state == INACTIVE. + * timer->is_queued == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); @@ -1787,15 +1809,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_cloc * hrtimer_start_range_ns() can have popped in and enqueued the timer * for us already. */ - if (restart != HRTIMER_NORESTART && !(timer->state & HRTIMER_STATE_ENQUEUED)) + if (restart == HRTIMER_RESTART && !timer->is_queued) enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS, false); /* - * Separate the ->running assignment from the ->state assignment. + * Separate the ->running assignment from the ->is_queued assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running.timer == NULL && - * timer->state == INACTIVE. + * timer->is_queued == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 488e47e96e93..19e61826b7de 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -47,7 +47,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function)); - SEQ_printf(m, ", S:%02x", timer->state); + SEQ_printf(m, ", S:%02x", timer->is_queued); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)), -- cgit v1.2.3 From 9e07a9c980eaa93fd1bba722d31eeb4bf0cbbfb4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:37:53 +0100 Subject: hrtimer: Rename hrtimer_cpu_base::in_hrtirq to deferred_rearm The upcoming deferred rearming scheme has the same effect as the deferred rearming when the hrtimer interrupt is executing. So it can reuse the in_hrtirq flag, but when it gets deferred beyond the hrtimer interrupt path, then the name does not make sense anymore. Rename it to deferred_rearm upfront to keep the actual functional change separate from the mechanical rename churn. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163430.935623347@kernel.org --- include/linux/hrtimer_defs.h | 4 ++-- kernel/time/hrtimer.c | 28 +++++++++------------------- 2 files changed, 11 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index f9fbf9a48f59..2c3bdbd562d2 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -53,7 +53,7 @@ enum hrtimer_base_type { * @active_bases: Bitfield to mark bases with active timers * @clock_was_set_seq: Sequence counter of clock was set events * @hres_active: State of high resolution mode - * @in_hrtirq: hrtimer_interrupt() is currently executing + * @deferred_rearm: A deferred rearm is pending * @hang_detected: The last hrtimer interrupt detected a hang * @softirq_activated: displays, if the softirq is raised - update of softirq * related settings is not required then. @@ -84,7 +84,7 @@ struct hrtimer_cpu_base { unsigned int active_bases; unsigned int clock_was_set_seq; bool hres_active; - bool in_hrtirq; + bool deferred_rearm; bool hang_detected; bool softirq_activated; bool online; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 2e05a1885d24..6f05d2569286 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -883,11 +883,8 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) if (expires >= cpu_base->expires_next) return; - /* - * If the hrtimer interrupt is running, then it will reevaluate the - * clock bases and reprogram the clock event device. - */ - if (cpu_base->in_hrtirq) + /* If a deferred rearm is pending skip reprogramming the device */ + if (cpu_base->deferred_rearm) return; cpu_base->next_timer = timer; @@ -921,12 +918,8 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int act if (seq == cpu_base->clock_was_set_seq) return false; - /* - * If the remote CPU is currently handling an hrtimer interrupt, it - * will reevaluate the first expiring timer of all clock bases - * before reprogramming. Nothing to do here. - */ - if (cpu_base->in_hrtirq) + /* If a deferred rearm is pending the remote CPU will take care of it */ + if (cpu_base->deferred_rearm) return false; /* @@ -1334,11 +1327,8 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del first = enqueue_hrtimer(timer, base, mode, was_armed); } - /* - * If the hrtimer interrupt is running, then it will reevaluate the - * clock bases and reprogram the clock event device. - */ - if (cpu_base->in_hrtirq) + /* If a deferred rearm is pending skip reprogramming the device */ + if (cpu_base->deferred_rearm) return false; if (!was_first || cpu_base != this_cpu_base) { @@ -1947,14 +1937,14 @@ static __latent_entropy void hrtimer_run_softirq(void) /* * Very similar to hrtimer_force_reprogram(), except it deals with - * in_hrtirq and hang_detected. + * deferred_rearm and hang_detected. */ static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now) { ktime_t expires_next = hrtimer_update_next_event(cpu_base); cpu_base->expires_next = expires_next; - cpu_base->in_hrtirq = false; + cpu_base->deferred_rearm = false; if (unlikely(cpu_base->hang_detected)) { /* @@ -1985,7 +1975,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: - cpu_base->in_hrtirq = true; + cpu_base->deferred_rearm = true; /* * Set expires_next to KTIME_MAX, which prevents that remote CPUs queue * timers while __hrtimer_run_queues() is expiring the clock bases. -- cgit v1.2.3 From a43b4856bc039675165a50d9ef5f41b28520f0f4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Feb 2026 17:37:58 +0100 Subject: hrtimer: Prepare stubs for deferred rearming The hrtimer interrupt expires timers and at the end of the interrupt it rearms the clockevent device for the next expiring timer. That's obviously correct, but in the case that a expired timer set NEED_RESCHED the return from interrupt ends up in schedule(). If HRTICK is enabled then schedule() will modify the hrtick timer, which causes another reprogramming of the hardware. That can be avoided by deferring the rearming to the return from interrupt path and if the return results in a immediate schedule() invocation then it can be deferred until the end of schedule(). To make this correct the affected code parts need to be made aware of this. Provide empty stubs for the deferred rearming mechanism, so that the relevant code changes for entry, softirq and scheduler can be split up into separate changes independent of the actual enablement in the hrtimer code. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.000891171@kernel.org --- include/linux/hrtimer.h | 1 + include/linux/hrtimer_rearm.h | 21 +++++++++++++++++++++ kernel/time/Kconfig | 4 ++++ 3 files changed, 26 insertions(+) create mode 100644 include/linux/hrtimer_rearm.h (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 4ad4a454b4c5..c087b7142330 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -13,6 +13,7 @@ #define _LINUX_HRTIMER_H #include +#include #include #include #include diff --git a/include/linux/hrtimer_rearm.h b/include/linux/hrtimer_rearm.h new file mode 100644 index 000000000000..6293076c03a6 --- /dev/null +++ b/include/linux/hrtimer_rearm.h @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _LINUX_HRTIMER_REARM_H +#define _LINUX_HRTIMER_REARM_H + +#ifdef CONFIG_HRTIMER_REARM_DEFERRED +static __always_inline void __hrtimer_rearm_deferred(void) { } +static __always_inline void hrtimer_rearm_deferred(void) { } +static __always_inline void hrtimer_rearm_deferred_tif(unsigned long tif_work) { } +static __always_inline bool +hrtimer_rearm_deferred_user_irq(unsigned long *tif_work, const unsigned long tif_mask) { return false; } +static __always_inline bool hrtimer_test_and_clear_rearm_deferred(void) { return false; } +#else /* CONFIG_HRTIMER_REARM_DEFERRED */ +static __always_inline void __hrtimer_rearm_deferred(void) { } +static __always_inline void hrtimer_rearm_deferred(void) { } +static __always_inline void hrtimer_rearm_deferred_tif(unsigned long tif_work) { } +static __always_inline bool +hrtimer_rearm_deferred_user_irq(unsigned long *tif_work, const unsigned long tif_mask) { return false; } +static __always_inline bool hrtimer_test_and_clear_rearm_deferred(void) { return false; } +#endif /* !CONFIG_HRTIMER_REARM_DEFERRED */ + +#endif diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index e1968ab8b37f..b95bfee3f592 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -58,6 +58,10 @@ config GENERIC_CLOCKEVENTS_COUPLED_INLINE config GENERIC_CMOS_UPDATE bool +# Deferred rearming of the hrtimer interrupt +config HRTIMER_REARM_DEFERRED + def_bool n + # Select to handle posix CPU timers from task_work # and not from the timer interrupt context config HAVE_POSIX_CPU_TIMERS_TASK_WORK -- cgit v1.2.3 From 0e98eb14814ef669e07ca6effaa03df2e57ef956 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Feb 2026 17:38:03 +0100 Subject: entry: Prepare for deferred hrtimer rearming The hrtimer interrupt expires timers and at the end of the interrupt it rearms the clockevent device for the next expiring timer. That's obviously correct, but in the case that a expired timer sets NEED_RESCHED the return from interrupt ends up in schedule(). If HRTICK is enabled then schedule() will modify the hrtick timer, which causes another reprogramming of the hardware. That can be avoided by deferring the rearming to the return from interrupt path and if the return results in a immediate schedule() invocation then it can be deferred until the end of schedule(), which avoids multiple rearms and re-evaluation of the timer wheel. As this is only relevant for interrupt to user return split the work masks up and hand them in as arguments from the relevant exit to user functions, which allows the compiler to optimize the deferred handling out for the syscall exit to user case. Add the rearm checks to the approritate places in the exit to user loop and the interrupt return to kernel path, so that the rearming is always guaranteed. In the return to user space path this is handled in the same way as TIF_RSEQ to avoid extra instructions in the fast path, which are truly hurtful for device interrupt heavy work loads as the extra instructions and conditionals while benign at first sight accumulate quickly into measurable regressions. The return from syscall path is completely unaffected due to the above mentioned split so syscall heavy workloads wont have any extra burden. For now this is just placing empty stubs at the right places which are all optimized out by the compiler until the actual functionality is in place. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.066469985@kernel.org --- include/linux/irq-entry-common.h | 25 +++++++++++++++++++------ include/linux/rseq_entry.h | 16 +++++++++++++--- kernel/entry/common.c | 4 +++- 3 files changed, 35 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index d26d1b1bcbfb..b976946b3cdb 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -3,6 +3,7 @@ #define __LINUX_IRQENTRYCOMMON_H #include +#include #include #include #include @@ -33,6 +34,14 @@ _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ | \ ARCH_EXIT_TO_USER_MODE_WORK) +#ifdef CONFIG_HRTIMER_REARM_DEFERRED +# define EXIT_TO_USER_MODE_WORK_SYSCALL (EXIT_TO_USER_MODE_WORK) +# define EXIT_TO_USER_MODE_WORK_IRQ (EXIT_TO_USER_MODE_WORK | _TIF_HRTIMER_REARM) +#else +# define EXIT_TO_USER_MODE_WORK_SYSCALL (EXIT_TO_USER_MODE_WORK) +# define EXIT_TO_USER_MODE_WORK_IRQ (EXIT_TO_USER_MODE_WORK) +#endif + /** * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs * @regs: Pointer to currents pt_regs @@ -203,6 +212,7 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work /** * __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required * @regs: Pointer to pt_regs on entry stack + * @work_mask: Which TIF bits need to be evaluated * * 1) check that interrupts are disabled * 2) call tick_nohz_user_enter_prepare() @@ -212,7 +222,8 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work * * Don't invoke directly, use the syscall/irqentry_ prefixed variants below */ -static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs) +static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs, + const unsigned long work_mask) { unsigned long ti_work; @@ -222,8 +233,10 @@ static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs) tick_nohz_user_enter_prepare(); ti_work = read_thread_flags(); - if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) - ti_work = exit_to_user_mode_loop(regs, ti_work); + if (unlikely(ti_work & work_mask)) { + if (!hrtimer_rearm_deferred_user_irq(&ti_work, work_mask)) + ti_work = exit_to_user_mode_loop(regs, ti_work); + } arch_exit_to_user_mode_prepare(regs, ti_work); } @@ -239,7 +252,7 @@ static __always_inline void __exit_to_user_mode_validate(void) /* Temporary workaround to keep ARM64 alive */ static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) { - __exit_to_user_mode_prepare(regs); + __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK); rseq_exit_to_user_mode_legacy(); __exit_to_user_mode_validate(); } @@ -253,7 +266,7 @@ static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *reg */ static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) { - __exit_to_user_mode_prepare(regs); + __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK_SYSCALL); rseq_syscall_exit_to_user_mode(); __exit_to_user_mode_validate(); } @@ -267,7 +280,7 @@ static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *re */ static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs) { - __exit_to_user_mode_prepare(regs); + __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK_IRQ); rseq_irqentry_exit_to_user_mode(); __exit_to_user_mode_validate(); } diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index cbc4a791618b..17956e119e81 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -40,6 +40,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_stats); #endif /* !CONFIG_RSEQ_STATS */ #ifdef CONFIG_RSEQ +#include #include #include #include @@ -110,7 +111,7 @@ static __always_inline void rseq_slice_clear_grant(struct task_struct *t) t->rseq.slice.state.granted = false; } -static __always_inline bool rseq_grant_slice_extension(bool work_pending) +static __always_inline bool __rseq_grant_slice_extension(bool work_pending) { struct task_struct *curr = current; struct rseq_slice_ctrl usr_ctrl; @@ -215,11 +216,20 @@ efault: return false; } +static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) +{ + if (unlikely(__rseq_grant_slice_extension(ti_work & mask))) { + hrtimer_rearm_deferred_tif(ti_work); + return true; + } + return false; +} + #else /* CONFIG_RSEQ_SLICE_EXTENSION */ static inline bool rseq_slice_extension_enabled(void) { return false; } static inline bool rseq_arm_slice_extension_timer(void) { return false; } static inline void rseq_slice_clear_grant(struct task_struct *t) { } -static inline bool rseq_grant_slice_extension(bool work_pending) { return false; } +static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); @@ -778,7 +788,7 @@ static inline void rseq_syscall_exit_to_user_mode(void) { } static inline void rseq_irqentry_exit_to_user_mode(void) { } static inline void rseq_exit_to_user_mode_legacy(void) { } static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } -static inline bool rseq_grant_slice_extension(bool work_pending) { return false; } +static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } #endif /* !CONFIG_RSEQ */ #endif /* _LINUX_RSEQ_ENTRY_H */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 9ef63e414791..9e1a6afb07f2 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -50,7 +50,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re local_irq_enable_exit_to_user(ti_work); if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) { - if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY)) + if (!rseq_grant_slice_extension(ti_work, TIF_SLICE_EXT_DENY)) schedule(); } @@ -225,6 +225,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) */ if (state.exit_rcu) { instrumentation_begin(); + hrtimer_rearm_deferred(); /* Tell the tracer that IRET will enable interrupts */ trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); @@ -238,6 +239,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) if (IS_ENABLED(CONFIG_PREEMPTION)) irqentry_exit_cond_resched(); + hrtimer_rearm_deferred(); /* Covers both tracing and lockdep */ trace_hardirqs_on(); instrumentation_end(); -- cgit v1.2.3 From 15dd3a9488557d3e6ebcecacab79f4e56b69ab54 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Feb 2026 17:38:18 +0100 Subject: hrtimer: Push reprogramming timers into the interrupt return path Currently hrtimer_interrupt() runs expired timers, which can re-arm themselves, after which it computes the next expiration time and re-programs the hardware. However, things like HRTICK, a highres timer driving preemption, cannot re-arm itself at the point of running, since the next task has not been determined yet. The schedule() in the interrupt return path will switch to the next task, which then causes a new hrtimer to be programmed. This then results in reprogramming the hardware at least twice, once after running the timers, and once upon selecting the new task. Notably, *both* events happen in the interrupt. By pushing the hrtimer reprogram all the way into the interrupt return path, it runs after schedule() picks the new task and the double reprogram can be avoided. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.273488269@kernel.org --- include/asm-generic/thread_info_tif.h | 5 ++- include/linux/hrtimer_rearm.h | 72 ++++++++++++++++++++++++++++++++--- kernel/time/Kconfig | 4 +- kernel/time/hrtimer.c | 38 +++++++++++++++--- 4 files changed, 107 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/thread_info_tif.h b/include/asm-generic/thread_info_tif.h index da1610a78f92..528e6fc7efe9 100644 --- a/include/asm-generic/thread_info_tif.h +++ b/include/asm-generic/thread_info_tif.h @@ -41,11 +41,14 @@ #define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) #ifdef HAVE_TIF_RESTORE_SIGMASK -# define TIF_RESTORE_SIGMASK 10 // Restore signal mask in do_signal() */ +# define TIF_RESTORE_SIGMASK 10 // Restore signal mask in do_signal() # define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) #endif #define TIF_RSEQ 11 // Run RSEQ fast path #define _TIF_RSEQ BIT(TIF_RSEQ) +#define TIF_HRTIMER_REARM 12 // re-arm the timer +#define _TIF_HRTIMER_REARM BIT(TIF_HRTIMER_REARM) + #endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */ diff --git a/include/linux/hrtimer_rearm.h b/include/linux/hrtimer_rearm.h index 6293076c03a6..a6f2e5d5e1c7 100644 --- a/include/linux/hrtimer_rearm.h +++ b/include/linux/hrtimer_rearm.h @@ -3,12 +3,74 @@ #define _LINUX_HRTIMER_REARM_H #ifdef CONFIG_HRTIMER_REARM_DEFERRED -static __always_inline void __hrtimer_rearm_deferred(void) { } -static __always_inline void hrtimer_rearm_deferred(void) { } -static __always_inline void hrtimer_rearm_deferred_tif(unsigned long tif_work) { } +#include + +void __hrtimer_rearm_deferred(void); + +/* + * This is purely CPU local, so check the TIF bit first to avoid the overhead of + * the atomic test_and_clear_bit() operation for the common case where the bit + * is not set. + */ +static __always_inline bool hrtimer_test_and_clear_rearm_deferred_tif(unsigned long tif_work) +{ + lockdep_assert_irqs_disabled(); + + if (unlikely(tif_work & _TIF_HRTIMER_REARM)) { + clear_thread_flag(TIF_HRTIMER_REARM); + return true; + } + return false; +} + +#define TIF_REARM_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_HRTIMER_REARM) + +/* Invoked from the exit to user before invoking exit_to_user_mode_loop() */ static __always_inline bool -hrtimer_rearm_deferred_user_irq(unsigned long *tif_work, const unsigned long tif_mask) { return false; } -static __always_inline bool hrtimer_test_and_clear_rearm_deferred(void) { return false; } +hrtimer_rearm_deferred_user_irq(unsigned long *tif_work, const unsigned long tif_mask) +{ + /* Help the compiler to optimize the function out for syscall returns */ + if (!(tif_mask & _TIF_HRTIMER_REARM)) + return false; + /* + * Rearm the timer if none of the resched flags is set before going into + * the loop which re-enables interrupts. + */ + if (unlikely((*tif_work & TIF_REARM_MASK) == _TIF_HRTIMER_REARM)) { + clear_thread_flag(TIF_HRTIMER_REARM); + __hrtimer_rearm_deferred(); + /* Don't go into the loop if HRTIMER_REARM was the only flag */ + *tif_work &= ~TIF_HRTIMER_REARM; + return !*tif_work; + } + return false; +} + +/* Invoked from the time slice extension decision function */ +static __always_inline void hrtimer_rearm_deferred_tif(unsigned long tif_work) +{ + if (hrtimer_test_and_clear_rearm_deferred_tif(tif_work)) + __hrtimer_rearm_deferred(); +} + +/* + * This is to be called on all irqentry_exit() paths that will enable + * interrupts. + */ +static __always_inline void hrtimer_rearm_deferred(void) +{ + hrtimer_rearm_deferred_tif(read_thread_flags()); +} + +/* + * Invoked from the scheduler on entry to __schedule() so it can defer + * rearming after the load balancing callbacks which might change hrtick. + */ +static __always_inline bool hrtimer_test_and_clear_rearm_deferred(void) +{ + return hrtimer_test_and_clear_rearm_deferred_tif(read_thread_flags()); +} + #else /* CONFIG_HRTIMER_REARM_DEFERRED */ static __always_inline void __hrtimer_rearm_deferred(void) { } static __always_inline void hrtimer_rearm_deferred(void) { } diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index b95bfee3f592..6d6aace0a693 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -60,7 +60,9 @@ config GENERIC_CMOS_UPDATE # Deferred rearming of the hrtimer interrupt config HRTIMER_REARM_DEFERRED - def_bool n + def_bool y + depends on GENERIC_ENTRY && HAVE_GENERIC_TIF_BITS + depends on HIGH_RES_TIMERS && SCHED_HRTICK # Select to handle posix CPU timers from task_work # and not from the timer interrupt context diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 6f05d2569286..2e5f0e292efb 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1939,10 +1939,9 @@ static __latent_entropy void hrtimer_run_softirq(void) * Very similar to hrtimer_force_reprogram(), except it deals with * deferred_rearm and hang_detected. */ -static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now) +static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, + ktime_t expires_next, bool deferred) { - ktime_t expires_next = hrtimer_update_next_event(cpu_base); - cpu_base->expires_next = expires_next; cpu_base->deferred_rearm = false; @@ -1954,9 +1953,37 @@ static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now) expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); cpu_base->hang_detected = false; } - hrtimer_rearm_event(expires_next, false); + hrtimer_rearm_event(expires_next, deferred); +} + +#ifdef CONFIG_HRTIMER_REARM_DEFERRED +void __hrtimer_rearm_deferred(void) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t now, expires_next; + + if (!cpu_base->deferred_rearm) + return; + + guard(raw_spinlock)(&cpu_base->lock); + now = hrtimer_update_base(cpu_base); + expires_next = hrtimer_update_next_event(cpu_base); + hrtimer_rearm(cpu_base, now, expires_next, true); } +static __always_inline void +hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, ktime_t expires_next) +{ + set_thread_flag(TIF_HRTIMER_REARM); +} +#else /* CONFIG_HRTIMER_REARM_DEFERRED */ +static __always_inline void +hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, ktime_t expires_next) +{ + hrtimer_rearm(cpu_base, now, expires_next, false); +} +#endif /* !CONFIG_HRTIMER_REARM_DEFERRED */ + /* * High resolution timer interrupt * Called with interrupts disabled @@ -2014,9 +2041,10 @@ retry: cpu_base->hang_detected = true; } - hrtimer_rearm(cpu_base, now); + hrtimer_interrupt_rearm(cpu_base, now, expires_next); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } + #endif /* !CONFIG_HIGH_RES_TIMERS */ /* -- cgit v1.2.3 From b95c4442b02162904e9012e670b602ebeb3c6c1b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:23 +0100 Subject: hrtimer: Avoid re-evaluation when nothing changed Most times there is no change between hrtimer_interrupt() deferring the rearm and the invocation of hrtimer_rearm_deferred(). In those cases it's a pointless exercise to re-evaluate the next expiring timer. Cache the required data and use it if nothing changed. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.338569372@kernel.org --- include/linux/hrtimer_defs.h | 53 ++++++++++++++++++++++---------------------- kernel/time/hrtimer.c | 45 +++++++++++++++++++++++++------------ 2 files changed, 58 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 2c3bdbd562d2..b6846efec210 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -47,32 +47,31 @@ enum hrtimer_base_type { /** * struct hrtimer_cpu_base - the per cpu clock bases - * @lock: lock protecting the base and associated clock bases - * and timers - * @cpu: cpu number - * @active_bases: Bitfield to mark bases with active timers - * @clock_was_set_seq: Sequence counter of clock was set events - * @hres_active: State of high resolution mode - * @deferred_rearm: A deferred rearm is pending - * @hang_detected: The last hrtimer interrupt detected a hang - * @softirq_activated: displays, if the softirq is raised - update of softirq - * related settings is not required then. - * @nr_events: Total number of hrtimer interrupt events - * @nr_retries: Total number of hrtimer interrupt retries - * @nr_hangs: Total number of hrtimer interrupt hangs - * @max_hang_time: Maximum time spent in hrtimer_interrupt - * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are - * expired - * @online: CPU is online from an hrtimers point of view - * @timer_waiters: A hrtimer_cancel() invocation waits for the timer - * callback to finish. - * @expires_next: absolute time of the next event, is required for remote - * hrtimer enqueue; it is the total first expiry time (hard - * and soft hrtimer are taken into account) - * @next_timer: Pointer to the first expiring timer - * @softirq_expires_next: Time to check, if soft queues needs also to be expired - * @softirq_next_timer: Pointer to the first expiring softirq based timer - * @clock_base: array of clock bases for this cpu + * @lock: lock protecting the base and associated clock bases and timers + * @cpu: cpu number + * @active_bases: Bitfield to mark bases with active timers + * @clock_was_set_seq: Sequence counter of clock was set events + * @hres_active: State of high resolution mode + * @deferred_rearm: A deferred rearm is pending + * @deferred_needs_update: The deferred rearm must re-evaluate the first timer + * @hang_detected: The last hrtimer interrupt detected a hang + * @softirq_activated: displays, if the softirq is raised - update of softirq + * related settings is not required then. + * @nr_events: Total number of hrtimer interrupt events + * @nr_retries: Total number of hrtimer interrupt retries + * @nr_hangs: Total number of hrtimer interrupt hangs + * @max_hang_time: Maximum time spent in hrtimer_interrupt + * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are expired + * @online: CPU is online from an hrtimers point of view + * @timer_waiters: A hrtimer_cancel() waiters for the timer callback to finish. + * @expires_next: Absolute time of the next event, is required for remote + * hrtimer enqueue; it is the total first expiry time (hard + * and soft hrtimer are taken into account) + * @next_timer: Pointer to the first expiring timer + * @softirq_expires_next: Time to check, if soft queues needs also to be expired + * @softirq_next_timer: Pointer to the first expiring softirq based timer + * @deferred_expires_next: Cached expires next value for deferred rearm + * @clock_base: Array of clock bases for this cpu * * Note: next_timer is just an optimization for __remove_hrtimer(). * Do not dereference the pointer because it is not reliable on @@ -85,6 +84,7 @@ struct hrtimer_cpu_base { unsigned int clock_was_set_seq; bool hres_active; bool deferred_rearm; + bool deferred_needs_update; bool hang_detected; bool softirq_activated; bool online; @@ -102,6 +102,7 @@ struct hrtimer_cpu_base { struct hrtimer *next_timer; ktime_t softirq_expires_next; struct hrtimer *softirq_next_timer; + ktime_t deferred_expires_next; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; call_single_data_t csd; } ____cacheline_aligned; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 2e5f0e292efb..e9592cb1e39a 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -919,8 +919,10 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int act return false; /* If a deferred rearm is pending the remote CPU will take care of it */ - if (cpu_base->deferred_rearm) + if (cpu_base->deferred_rearm) { + cpu_base->deferred_needs_update = true; return false; + } /* * Walk the affected clock bases and check whether the first expiring @@ -1141,7 +1143,12 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *b * a local timer is removed to be immediately restarted. That's handled * at the call site. */ - if (reprogram && timer == cpu_base->next_timer && !timer->is_lazy) + if (!reprogram || timer != cpu_base->next_timer || timer->is_lazy) + return; + + if (cpu_base->deferred_rearm) + cpu_base->deferred_needs_update = true; + else hrtimer_force_reprogram(cpu_base, /* skip_equal */ true); } @@ -1328,8 +1335,10 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del } /* If a deferred rearm is pending skip reprogramming the device */ - if (cpu_base->deferred_rearm) + if (cpu_base->deferred_rearm) { + cpu_base->deferred_needs_update = true; return false; + } if (!was_first || cpu_base != this_cpu_base) { /* @@ -1939,8 +1948,7 @@ static __latent_entropy void hrtimer_run_softirq(void) * Very similar to hrtimer_force_reprogram(), except it deals with * deferred_rearm and hang_detected. */ -static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, - ktime_t expires_next, bool deferred) +static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next, bool deferred) { cpu_base->expires_next = expires_next; cpu_base->deferred_rearm = false; @@ -1950,7 +1958,7 @@ static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, * Give the system a chance to do something else than looping * on hrtimer interrupts. */ - expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); + expires_next = ktime_add_ns(ktime_get(), 100 * NSEC_PER_MSEC); cpu_base->hang_detected = false; } hrtimer_rearm_event(expires_next, deferred); @@ -1960,27 +1968,36 @@ static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, void __hrtimer_rearm_deferred(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - ktime_t now, expires_next; + ktime_t expires_next; if (!cpu_base->deferred_rearm) return; guard(raw_spinlock)(&cpu_base->lock); - now = hrtimer_update_base(cpu_base); - expires_next = hrtimer_update_next_event(cpu_base); - hrtimer_rearm(cpu_base, now, expires_next, true); + if (cpu_base->deferred_needs_update) { + hrtimer_update_base(cpu_base); + expires_next = hrtimer_update_next_event(cpu_base); + } else { + /* No timer added/removed. Use the cached value */ + expires_next = cpu_base->deferred_expires_next; + } + hrtimer_rearm(cpu_base, expires_next, true); } static __always_inline void -hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, ktime_t expires_next) +hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next) { + /* hrtimer_interrupt() just re-evaluated the first expiring timer */ + cpu_base->deferred_needs_update = false; + /* Cache the expiry time */ + cpu_base->deferred_expires_next = expires_next; set_thread_flag(TIF_HRTIMER_REARM); } #else /* CONFIG_HRTIMER_REARM_DEFERRED */ static __always_inline void -hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, ktime_t expires_next) +hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next) { - hrtimer_rearm(cpu_base, now, expires_next, false); + hrtimer_rearm(cpu_base, expires_next, false); } #endif /* !CONFIG_HRTIMER_REARM_DEFERRED */ @@ -2041,7 +2058,7 @@ retry: cpu_base->hang_detected = true; } - hrtimer_interrupt_rearm(cpu_base, now, expires_next); + hrtimer_interrupt_rearm(cpu_base, expires_next); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } -- cgit v1.2.3 From eddffab8282e388dddf032f3295fcec87eb08095 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:28 +0100 Subject: hrtimer: Keep track of first expiring timer per clock base Evaluating the next expiry time of all clock bases is cache line expensive as the expiry time of the first expiring timer is not cached in the base and requires to access the timer itself, which is definitely in a different cache line. It's way more efficient to keep track of the expiry time on enqueue and dequeue operations as the relevant data is already in the cache at that point. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.404839710@kernel.org --- include/linux/hrtimer_defs.h | 2 ++ kernel/time/hrtimer.c | 37 ++++++++++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index b6846efec210..fb38df4c0b64 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -19,6 +19,7 @@ * timer to a base on another cpu. * @clockid: clock id for per_cpu support * @seq: seqcount around __run_hrtimer + * @expires_next: Absolute time of the next event in this clock base * @running: pointer to the currently running hrtimer * @active: red black tree root node for the active timers * @offset: offset of this clock to the monotonic base @@ -28,6 +29,7 @@ struct hrtimer_clock_base { unsigned int index; clockid_t clockid; seqcount_raw_spinlock_t seq; + ktime_t expires_next; struct hrtimer *running; struct timerqueue_head active; ktime_t offset; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e9592cb1e39a..d70899a9ddc1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1107,7 +1107,18 @@ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *ba /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); - return timerqueue_add(&base->active, &timer->node); + if (!timerqueue_add(&base->active, &timer->node)) + return false; + + base->expires_next = hrtimer_get_expires(timer); + return true; +} + +static inline void base_update_next_timer(struct hrtimer_clock_base *base) +{ + struct timerqueue_node *next = timerqueue_getnext(&base->active); + + base->expires_next = next ? next->expires : KTIME_MAX; } /* @@ -1122,6 +1133,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *b bool newstate, bool reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; + bool was_first; lockdep_assert_held(&cpu_base->lock); @@ -1131,9 +1143,17 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *b /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->is_queued, newstate); + was_first = &timer->node == timerqueue_getnext(&base->active); + if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); + /* Nothing to update if this was not the first timer in the base */ + if (!was_first) + return; + + base_update_next_timer(base); + /* * If reprogram is false don't update cpu_base->next_timer and do not * touch the clock event device. @@ -1182,9 +1202,12 @@ static inline bool remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *base, const enum hrtimer_mode mode, ktime_t expires, u64 delta_ns) { + bool was_first = false; + /* Remove it from the timer queue if active */ if (timer->is_queued) { debug_hrtimer_deactivate(timer); + was_first = &timer->node == timerqueue_getnext(&base->active); timerqueue_del(&base->active, &timer->node); } @@ -1197,8 +1220,16 @@ remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *b /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); - /* Returns true if this is the first expiring timer */ - return timerqueue_add(&base->active, &timer->node); + /* If it's the first expiring timer now or again, update base */ + if (timerqueue_add(&base->active, &timer->node)) { + base->expires_next = expires; + return true; + } + + if (was_first) + base_update_next_timer(base); + + return false; } static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, -- cgit v1.2.3 From 671047943dce5af24e023bca3c5cc244d7565f5a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:47 +0100 Subject: rbtree: Provide rbtree with links Some RB tree users require quick access to the next and the previous node, e.g. to check whether a modification of the node results in a change of the nodes position in the tree. If the node position does not change, then the modification can happen in place without going through a full enqueue requeue cycle. A upcoming use case for this are the timer queues of the hrtimer subsystem as they can optimize for timers which are frequently rearmed while enqueued. This can be obviously achieved with rb_next() and rb_prev(), but those turned out to be quite expensive for hotpath operations depending on the tree depth. Add a linked RB tree variant where add() and erase() maintain the links between the nodes. Like the cached variant it provides a pointer to the left most node in the root. It intentionally does not use a [h]list head as there is no real need for true list operations as the list is strictly coupled to the tree and and cannot be manipulated independently. It sets the nodes previous pointer to NULL for the left most node and the next pointer to NULL for the right most node. This allows a quick check especially for the left most node without consulting the list head address, which creates better code. Aside of the rb_leftmost cached pointer this could trivially provide a rb_rightmost pointer as well, but there is no usage for that (yet). Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.668401024@kernel.org --- include/linux/rbtree.h | 81 +++++++++++++++++++++++++++++++++++++++----- include/linux/rbtree_types.h | 16 +++++++++ lib/rbtree.c | 17 ++++++++++ 3 files changed, 105 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 4091e978aef2..48acdc3889dd 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -35,10 +35,15 @@ #define RB_CLEAR_NODE(node) \ ((node)->__rb_parent_color = (unsigned long)(node)) +#define RB_EMPTY_LINKED_NODE(lnode) RB_EMPTY_NODE(&(lnode)->node) +#define RB_CLEAR_LINKED_NODE(lnode) ({ \ + RB_CLEAR_NODE(&(lnode)->node); \ + (lnode)->prev = (lnode)->next = NULL; \ +}) extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); - +extern bool rb_erase_linked(struct rb_node_linked *, struct rb_root_linked *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); @@ -213,15 +218,10 @@ rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, return leftmost ? node : NULL; } -/** - * rb_add() - insert @node into @tree - * @node: node to insert - * @tree: tree to insert @node into - * @less: operator defining the (partial) node order - */ static __always_inline void -rb_add(struct rb_node *node, struct rb_root *tree, - bool (*less)(struct rb_node *, const struct rb_node *)) +__rb_add(struct rb_node *node, struct rb_root *tree, + bool (*less)(struct rb_node *, const struct rb_node *), + void (*linkop)(struct rb_node *, struct rb_node *, struct rb_node **)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; @@ -234,10 +234,73 @@ rb_add(struct rb_node *node, struct rb_root *tree, link = &parent->rb_right; } + linkop(node, parent, link); rb_link_node(node, parent, link); rb_insert_color(node, tree); } +#define __node_2_linked_node(_n) \ + rb_entry((_n), struct rb_node_linked, node) + +static inline void +rb_link_linked_node(struct rb_node *node, struct rb_node *parent, struct rb_node **link) +{ + if (!parent) + return; + + struct rb_node_linked *nnew = __node_2_linked_node(node); + struct rb_node_linked *npar = __node_2_linked_node(parent); + + if (link == &parent->rb_left) { + nnew->prev = npar->prev; + nnew->next = npar; + npar->prev = nnew; + if (nnew->prev) + nnew->prev->next = nnew; + } else { + nnew->next = npar->next; + nnew->prev = npar; + npar->next = nnew; + if (nnew->next) + nnew->next->prev = nnew; + } +} + +/** + * rb_add_linked() - insert @node into the leftmost linked tree @tree + * @node: node to insert + * @tree: linked tree to insert @node into + * @less: operator defining the (partial) node order + * + * Returns @true when @node is the new leftmost, @false otherwise. + */ +static __always_inline bool +rb_add_linked(struct rb_node_linked *node, struct rb_root_linked *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + __rb_add(&node->node, &tree->rb_root, less, rb_link_linked_node); + if (!node->prev) + tree->rb_leftmost = node; + return !node->prev; +} + +/* Empty linkop function which is optimized away by the compiler */ +static __always_inline void +rb_link_noop(struct rb_node *n, struct rb_node *p, struct rb_node **l) { } + +/** + * rb_add() - insert @node into @tree + * @node: node to insert + * @tree: tree to insert @node into + * @less: operator defining the (partial) node order + */ +static __always_inline void +rb_add(struct rb_node *node, struct rb_root *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + __rb_add(node, tree, less, rb_link_noop); +} + /** * rb_find_add_cached() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert diff --git a/include/linux/rbtree_types.h b/include/linux/rbtree_types.h index 45b6ecde3665..3c7ae53e8139 100644 --- a/include/linux/rbtree_types.h +++ b/include/linux/rbtree_types.h @@ -9,6 +9,12 @@ struct rb_node { } __attribute__((aligned(sizeof(long)))); /* The alignment might seem pointless, but allegedly CRIS needs it */ +struct rb_node_linked { + struct rb_node node; + struct rb_node_linked *prev; + struct rb_node_linked *next; +}; + struct rb_root { struct rb_node *rb_node; }; @@ -28,7 +34,17 @@ struct rb_root_cached { struct rb_node *rb_leftmost; }; +/* + * Leftmost tree with links. This would allow a trivial rb_rightmost update, + * but that has been omitted due to the lack of users. + */ +struct rb_root_linked { + struct rb_root rb_root; + struct rb_node_linked *rb_leftmost; +}; + #define RB_ROOT (struct rb_root) { NULL, } #define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } +#define RB_ROOT_LINKED (struct rb_root_linked) { {NULL, }, NULL } #endif diff --git a/lib/rbtree.c b/lib/rbtree.c index 18d42bcf4ec9..5790d6ecba4e 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -446,6 +446,23 @@ void rb_erase(struct rb_node *node, struct rb_root *root) } EXPORT_SYMBOL(rb_erase); +bool rb_erase_linked(struct rb_node_linked *node, struct rb_root_linked *root) +{ + if (node->prev) + node->prev->next = node->next; + else + root->rb_leftmost = node->next; + + if (node->next) + node->next->prev = node->prev; + + rb_erase(&node->node, &root->rb_root); + RB_CLEAR_LINKED_NODE(node); + + return !!root->rb_leftmost; +} +EXPORT_SYMBOL_GPL(rb_erase_linked); + /* * Augmented rbtree manipulation functions. * -- cgit v1.2.3 From 1339eeb73d6b99cf3aa9981f3f91d6ac4a49c72e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:52 +0100 Subject: timerqueue: Provide linked timerqueue The hrtimer subsystem wants to peak ahead to the next and previous timer to evaluated whether a to be rearmed timer can stay at the same position in the RB tree with the new expiry time. The linked RB tree provides the infrastructure for this as it maintains links to the previous and next nodes for each entry in the tree. Provide timerqueue wrappers around that. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.734827095@kernel.org --- include/linux/timerqueue.h | 56 ++++++++++++++++++++++++++++++++++------ include/linux/timerqueue_types.h | 15 ++++++++--- lib/timerqueue.c | 14 ++++++++++ 3 files changed, 74 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index d306d9dd2207..7d0aaa766580 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -5,12 +5,11 @@ #include #include -extern bool timerqueue_add(struct timerqueue_head *head, - struct timerqueue_node *node); -extern bool timerqueue_del(struct timerqueue_head *head, - struct timerqueue_node *node); -extern struct timerqueue_node *timerqueue_iterate_next( - struct timerqueue_node *node); +bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node); +bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node); +struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node); + +bool timerqueue_linked_add(struct timerqueue_linked_head *head, struct timerqueue_linked_node *node); /** * timerqueue_getnext - Returns the timer with the earliest expiration time @@ -19,8 +18,7 @@ extern struct timerqueue_node *timerqueue_iterate_next( * * Returns a pointer to the timer node that has the earliest expiration time. */ -static inline -struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) +static inline struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) { struct rb_node *leftmost = rb_first_cached(&head->rb_root); @@ -41,4 +39,46 @@ static inline void timerqueue_init_head(struct timerqueue_head *head) { head->rb_root = RB_ROOT_CACHED; } + +/* Timer queues with linked nodes */ + +static __always_inline +struct timerqueue_linked_node *timerqueue_linked_first(struct timerqueue_linked_head *head) +{ + return rb_entry_safe(head->rb_root.rb_leftmost, struct timerqueue_linked_node, node); +} + +static __always_inline +struct timerqueue_linked_node *timerqueue_linked_next(struct timerqueue_linked_node *node) +{ + return rb_entry_safe(node->node.next, struct timerqueue_linked_node, node); +} + +static __always_inline +struct timerqueue_linked_node *timerqueue_linked_prev(struct timerqueue_linked_node *node) +{ + return rb_entry_safe(node->node.prev, struct timerqueue_linked_node, node); +} + +static __always_inline +bool timerqueue_linked_del(struct timerqueue_linked_head *head, struct timerqueue_linked_node *node) +{ + return rb_erase_linked(&node->node, &head->rb_root); +} + +static __always_inline void timerqueue_linked_init(struct timerqueue_linked_node *node) +{ + RB_CLEAR_LINKED_NODE(&node->node); +} + +static __always_inline bool timerqueue_linked_node_queued(struct timerqueue_linked_node *node) +{ + return !RB_EMPTY_LINKED_NODE(&node->node); +} + +static __always_inline void timerqueue_linked_init_head(struct timerqueue_linked_head *head) +{ + head->rb_root = RB_ROOT_LINKED; +} + #endif /* _LINUX_TIMERQUEUE_H */ diff --git a/include/linux/timerqueue_types.h b/include/linux/timerqueue_types.h index dc298d0923e3..be2218b147c4 100644 --- a/include/linux/timerqueue_types.h +++ b/include/linux/timerqueue_types.h @@ -6,12 +6,21 @@ #include struct timerqueue_node { - struct rb_node node; - ktime_t expires; + struct rb_node node; + ktime_t expires; }; struct timerqueue_head { - struct rb_root_cached rb_root; + struct rb_root_cached rb_root; +}; + +struct timerqueue_linked_node { + struct rb_node_linked node; + ktime_t expires; +}; + +struct timerqueue_linked_head { + struct rb_root_linked rb_root; }; #endif /* _LINUX_TIMERQUEUE_TYPES_H */ diff --git a/lib/timerqueue.c b/lib/timerqueue.c index cdb9c7658478..e2a1e08cb4bd 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -82,3 +82,17 @@ struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node) return container_of(next, struct timerqueue_node, node); } EXPORT_SYMBOL_GPL(timerqueue_iterate_next); + +#define __node_2_tq_linked(_n) \ + container_of(rb_entry((_n), struct rb_node_linked, node), struct timerqueue_linked_node, node) + +static __always_inline bool __tq_linked_less(struct rb_node *a, const struct rb_node *b) +{ + return __node_2_tq_linked(a)->expires < __node_2_tq_linked(b)->expires; +} + +bool timerqueue_linked_add(struct timerqueue_linked_head *head, struct timerqueue_linked_node *node) +{ + return rb_add_linked(&node->node, &head->rb_root, __tq_linked_less); +} +EXPORT_SYMBOL_GPL(timerqueue_linked_add); -- cgit v1.2.3 From b7418e6e9b87b849af4df93d527ff83498d1e4c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:57 +0100 Subject: hrtimer: Use linked timerqueue To prepare for optimizing the rearming of enqueued timers, switch to the linked timerqueue. That allows to check whether the new expiry time changes the position of the timer in the RB tree or not, by checking the new expiry time against the previous and the next timers expiry. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.806643179@kernel.org --- include/linux/hrtimer_defs.h | 16 ++++++++-------- include/linux/hrtimer_types.h | 8 ++++---- kernel/time/hrtimer.c | 34 +++++++++++++++++----------------- kernel/time/timer_list.c | 10 ++++------ 4 files changed, 33 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index fb38df4c0b64..0f851b2432c3 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -25,14 +25,14 @@ * @offset: offset of this clock to the monotonic base */ struct hrtimer_clock_base { - struct hrtimer_cpu_base *cpu_base; - unsigned int index; - clockid_t clockid; - seqcount_raw_spinlock_t seq; - ktime_t expires_next; - struct hrtimer *running; - struct timerqueue_head active; - ktime_t offset; + struct hrtimer_cpu_base *cpu_base; + unsigned int index; + clockid_t clockid; + seqcount_raw_spinlock_t seq; + ktime_t expires_next; + struct hrtimer *running; + struct timerqueue_linked_head active; + ktime_t offset; } __hrtimer_clock_base_align; enum hrtimer_base_type { diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index 0e22bc91d00f..b5dacc8271a4 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -17,7 +17,7 @@ enum hrtimer_restart { /** * struct hrtimer - the basic hrtimer structure - * @node: timerqueue node, which also manages node.expires, + * @node: Linked timerqueue node, which also manages node.expires, * the absolute expiry time in the hrtimers internal * representation. The time is related to the clock on * which the timer is based. Is setup by adding @@ -39,15 +39,15 @@ enum hrtimer_restart { * The hrtimer structure must be initialized by hrtimer_setup() */ struct hrtimer { - struct timerqueue_node node; - ktime_t _softexpires; - enum hrtimer_restart (*__private function)(struct hrtimer *); + struct timerqueue_linked_node node; struct hrtimer_clock_base *base; bool is_queued; bool is_rel; bool is_soft; bool is_hard; bool is_lazy; + ktime_t _softexpires; + enum hrtimer_restart (*__private function)(struct hrtimer *); }; #endif /* _LINUX_HRTIMER_TYPES_H */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d1e58482e0a9..5e45982363ce 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -557,10 +557,10 @@ static ktime_t hrtimer_bases_next_event_without(struct hrtimer_cpu_base *cpu_bas * If the excluded timer is the first on this base evaluate the * next timer. */ - struct timerqueue_node *node = timerqueue_getnext(&base->active); + struct timerqueue_linked_node *node = timerqueue_linked_first(&base->active); if (unlikely(&exclude->node == node)) { - node = timerqueue_iterate_next(node); + node = timerqueue_linked_next(node); if (!node) continue; expires = ktime_sub(node->expires, base->offset); @@ -576,7 +576,7 @@ static ktime_t hrtimer_bases_next_event_without(struct hrtimer_cpu_base *cpu_bas static __always_inline struct hrtimer *clock_base_next_timer(struct hrtimer_clock_base *base) { - struct timerqueue_node *next = timerqueue_getnext(&base->active); + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); return container_of(next, struct hrtimer, node); } @@ -938,9 +938,9 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int act active &= cpu_base->active_bases; for_each_active_base(base, cpu_base, active) { - struct timerqueue_node *next; + struct timerqueue_linked_node *next; - next = timerqueue_getnext(&base->active); + next = timerqueue_linked_first(&base->active); expires = ktime_sub(next->expires, base->offset); if (expires < cpu_base->expires_next) return true; @@ -1112,7 +1112,7 @@ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *ba /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); - if (!timerqueue_add(&base->active, &timer->node)) + if (!timerqueue_linked_add(&base->active, &timer->node)) return false; base->expires_next = hrtimer_get_expires(timer); @@ -1121,7 +1121,7 @@ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *ba static inline void base_update_next_timer(struct hrtimer_clock_base *base) { - struct timerqueue_node *next = timerqueue_getnext(&base->active); + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); base->expires_next = next ? next->expires : KTIME_MAX; } @@ -1148,9 +1148,9 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *b /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->is_queued, newstate); - was_first = &timer->node == timerqueue_getnext(&base->active); + was_first = !timerqueue_linked_prev(&timer->node); - if (!timerqueue_del(&base->active, &timer->node)) + if (!timerqueue_linked_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); /* Nothing to update if this was not the first timer in the base */ @@ -1212,8 +1212,8 @@ remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *b /* Remove it from the timer queue if active */ if (timer->is_queued) { debug_hrtimer_deactivate(timer); - was_first = &timer->node == timerqueue_getnext(&base->active); - timerqueue_del(&base->active, &timer->node); + was_first = !timerqueue_linked_prev(&timer->node); + timerqueue_linked_del(&base->active, &timer->node); } /* Set the new expiry time */ @@ -1226,7 +1226,7 @@ remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *b WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); /* If it's the first expiring timer now or again, update base */ - if (timerqueue_add(&base->active, &timer->node)) { + if (timerqueue_linked_add(&base->active, &timer->node)) { base->expires_next = expires; return true; } @@ -1758,7 +1758,7 @@ static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*fn)(st timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->is_lazy = !!(mode & HRTIMER_MODE_LAZY_REARM); timer->base = &cpu_base->clock_base[base]; - timerqueue_init(&timer->node); + timerqueue_linked_init(&timer->node); if (WARN_ON_ONCE(!fn)) ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout; @@ -1923,7 +1923,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_cloc static __always_inline struct hrtimer *clock_base_next_timer_safe(struct hrtimer_clock_base *base) { - struct timerqueue_node *next = timerqueue_getnext(&base->active); + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); return next ? container_of(next, struct hrtimer, node) : NULL; } @@ -2369,7 +2369,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) clock_b->cpu_base = cpu_base; seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); - timerqueue_init_head(&clock_b->active); + timerqueue_linked_init_head(&clock_b->active); } cpu_base->cpu = cpu; @@ -2399,10 +2399,10 @@ int hrtimers_cpu_starting(unsigned int cpu) static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { - struct timerqueue_node *node; + struct timerqueue_linked_node *node; struct hrtimer *timer; - while ((node = timerqueue_getnext(&old_base->active))) { + while ((node = timerqueue_linked_first(&old_base->active))) { timer = container_of(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_hrtimer_deactivate(timer); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 19e61826b7de..e2e14fd1b466 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -56,13 +56,11 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now)); } -static void -print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, - u64 now) +static void print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { + struct timerqueue_linked_node *curr; struct hrtimer *timer, tmp; unsigned long next = 0, i; - struct timerqueue_node *curr; unsigned long flags; next_one: @@ -72,13 +70,13 @@ next_one: raw_spin_lock_irqsave(&base->cpu_base->lock, flags); - curr = timerqueue_getnext(&base->active); + curr = timerqueue_linked_first(&base->active); /* * Crude but we have to do this O(N*N) thing, because * we have to unlock the base when printing: */ while (curr && i < next) { - curr = timerqueue_iterate_next(curr); + curr = timerqueue_linked_next(curr); i++; } -- cgit v1.2.3 From 38e18d825f7281fdc16d3241df5115ce6eaeaf79 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 25 Feb 2026 10:32:41 -0800 Subject: locking: Fix rwlock and spinlock lock context annotations Fix two incorrect rwlock_t lock context annotations. Add the raw_spinlock_t lock context annotations that are missing. Fixes: f16a802d402d ("locking/rwlock, spinlock: Support Clang's context analysis") Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Acked-by: Marco Elver Link: https://patch.msgid.link/20260225183244.4035378-2-bvanassche@acm.org --- include/linux/rwlock.h | 4 ++-- include/linux/rwlock_api_smp.h | 6 ++++-- include/linux/spinlock.h | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 3390d21c95dd..21ceefc4a49f 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -30,10 +30,10 @@ do { \ #ifdef CONFIG_DEBUG_SPINLOCK extern void do_raw_read_lock(rwlock_t *lock) __acquires_shared(lock); - extern int do_raw_read_trylock(rwlock_t *lock); + extern int do_raw_read_trylock(rwlock_t *lock) __cond_acquires_shared(true, lock); extern void do_raw_read_unlock(rwlock_t *lock) __releases_shared(lock); extern void do_raw_write_lock(rwlock_t *lock) __acquires(lock); - extern int do_raw_write_trylock(rwlock_t *lock); +extern int do_raw_write_trylock(rwlock_t *lock) __cond_acquires(true, lock); extern void do_raw_write_unlock(rwlock_t *lock) __releases(lock); #else # define do_raw_read_lock(rwlock) do {__acquire_shared(lock); arch_read_lock(&(rwlock)->raw_lock); } while (0) diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h index 61a852609eab..9e02a5f28cd1 100644 --- a/include/linux/rwlock_api_smp.h +++ b/include/linux/rwlock_api_smp.h @@ -23,7 +23,7 @@ void __lockfunc _raw_write_lock_bh(rwlock_t *lock) __acquires(lock); void __lockfunc _raw_read_lock_irq(rwlock_t *lock) __acquires_shared(lock); void __lockfunc _raw_write_lock_irq(rwlock_t *lock) __acquires(lock); unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) - __acquires(lock); + __acquires_shared(lock); unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) __acquires(lock); int __lockfunc _raw_read_trylock(rwlock_t *lock) __cond_acquires_shared(true, lock); @@ -36,7 +36,7 @@ void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) __releases_shared(lock); void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) __releases(lock); void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) - __releases(lock); + __releases_shared(lock); void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) __releases(lock); @@ -116,6 +116,7 @@ _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) #endif static inline int __raw_read_trylock(rwlock_t *lock) + __cond_acquires_shared(true, lock) { preempt_disable(); if (do_raw_read_trylock(lock)) { @@ -127,6 +128,7 @@ static inline int __raw_read_trylock(rwlock_t *lock) } static inline int __raw_write_trylock(rwlock_t *lock) + __cond_acquires(true, lock) { preempt_disable(); if (do_raw_write_trylock(lock)) { diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index e1e2f144af9b..241277cd34cf 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -178,7 +178,7 @@ do { \ #ifdef CONFIG_DEBUG_SPINLOCK extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); - extern int do_raw_spin_trylock(raw_spinlock_t *lock); + extern int do_raw_spin_trylock(raw_spinlock_t *lock) __cond_acquires(true, lock); extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock); #else static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock) @@ -189,6 +189,7 @@ static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock) } static inline int do_raw_spin_trylock(raw_spinlock_t *lock) + __cond_acquires(true, lock) { int ret = arch_spin_trylock(&(lock)->raw_lock); -- cgit v1.2.3 From 39be7b21af24d1d2ed3b18caac57dd219fef226e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 25 Feb 2026 10:32:42 -0800 Subject: signal: Fix the lock_task_sighand() annotation lock_task_sighand() may return NULL. Make this clear in its lock context annotation. Fixes: 04e49d926f43 ("sched: Enable context analysis for core.c and fair.c") Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Acked-by: Marco Elver Link: https://patch.msgid.link/20260225183244.4035378-3-bvanassche@acm.org --- include/linux/sched/signal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index a22248aebcf9..a4835a7de07e 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -739,7 +739,7 @@ static inline int thread_group_empty(struct task_struct *p) extern struct sighand_struct *lock_task_sighand(struct task_struct *task, unsigned long *flags) - __acquires(&task->sighand->siglock); + __cond_acquires(nonnull, &task->sighand->siglock); static inline void unlock_task_sighand(struct task_struct *task, unsigned long *flags) -- cgit v1.2.3 From 3dcef70e41ab13483803c536ddea8d5f1803ee25 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 25 Feb 2026 10:32:43 -0800 Subject: ww-mutex: Fix the ww_acquire_ctx function annotations The ww_acquire_done() call is optional. Reflect this in the annotations of ww_acquire_done(). Fixes: 47907461e4f6 ("locking/ww_mutex: Support Clang's context analysis") Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Acked-by: Maarten Lankhorst Acked-by: Marco Elver Link: https://patch.msgid.link/20260225183244.4035378-4-bvanassche@acm.org --- include/linux/ww_mutex.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 85b1fff02fde..0c95ead5a297 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -181,7 +181,7 @@ static inline void ww_acquire_init(struct ww_acquire_ctx *ctx, * data structures. */ static inline void ww_acquire_done(struct ww_acquire_ctx *ctx) - __releases(ctx) __acquires_shared(ctx) __no_context_analysis + __must_hold(ctx) { #ifdef DEBUG_WW_MUTEXES lockdep_assert_held(ctx); @@ -199,7 +199,7 @@ static inline void ww_acquire_done(struct ww_acquire_ctx *ctx) * mutexes have been released with ww_mutex_unlock. */ static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx) - __releases_shared(ctx) __no_context_analysis + __releases(ctx) __no_context_analysis { #ifdef CONFIG_DEBUG_LOCK_ALLOC mutex_release(&ctx->first_lock_dep_map, _THIS_IP_); -- cgit v1.2.3 From 4a91a33f15c634fb3477d122bdf1eef098d77ee3 Mon Sep 17 00:00:00 2001 From: Mallesh Koujalagi Date: Fri, 27 Feb 2026 14:24:01 +0530 Subject: workqueue: Update documentation as per system_percpu_wq naming Update documentation to use "per-CPU workqueue" instead of "global workqueue" to match the system_wq to system_percpu_wq rename. The workqueue behavior remains unchanged; this just aligns terminology with the clearer naming. Fixes: a2be943b46b4 ("workqueue: replace use of system_wq with system_percpu_wq") Signed-off-by: Mallesh Koujalagi Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a4749f56398f..fc5744402a66 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -712,14 +712,14 @@ static inline bool schedule_work_on(int cpu, struct work_struct *work) } /** - * schedule_work - put work task in global workqueue + * schedule_work - put work task in per-CPU workqueue * @work: job to be done * - * Returns %false if @work was already on the kernel-global workqueue and + * Returns %false if @work was already on the system per-CPU workqueue and * %true otherwise. * - * This puts a job in the kernel-global workqueue if it was not already - * queued and leaves it in the same position on the kernel-global + * This puts a job in the system per-CPU workqueue if it was not already + * queued and leaves it in the same position on the system per-CPU * workqueue otherwise. * * Shares the same memory-ordering properties of queue_work(), cf. the @@ -796,12 +796,12 @@ extern void __warn_flushing_systemwide_wq(void) }) /** - * schedule_delayed_work_on - queue work in global workqueue on CPU after delay + * schedule_delayed_work_on - queue work in per-CPU workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done * @delay: number of jiffies to wait * - * After waiting for a given time this puts a job in the kernel-global + * After waiting for a given time this puts a job in the system per-CPU * workqueue on the specified CPU. */ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, @@ -811,11 +811,11 @@ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, } /** - * schedule_delayed_work - put work task in global workqueue after delay + * schedule_delayed_work - put work task in per-CPU workqueue after delay * @dwork: job to be done * @delay: number of jiffies to wait or 0 for immediate execution * - * After waiting for a given time this puts a job in the kernel-global + * After waiting for a given time this puts a job in the system per-CPU * workqueue. */ static inline bool schedule_delayed_work(struct delayed_work *dwork, -- cgit v1.2.3 From e1092d5e15e6a9b168bf830af9a26d7ea17cd57d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 24 Feb 2026 12:10:44 +0100 Subject: PCI/PTM: Do not enable PTM automatically for Root and Switch Upstream Ports Currently we enable PTM automatically for Root and Switch Upstream Ports if the advertised capabilities support the relevant role. However, there are a few issues with this. First of all, if there is no Endpoint that actually needs the PTM functionality, this is just wasting link bandwidth. There are just a couple of drivers calling pci_ptm_enable() in the tree. Secondly, we do the enablement in pci_ptm_init() that is called pretty early for the Switch Upstream Port before Downstream Ports are even enumerated. Since the Upstream Port configuration affects the whole Switch, enabling it this early might cause PTM requests to be sent. We actually do see effects of this: pcieport 0000:00:07.1: pciehp: Slot(6-1): Card present pcieport 0000:00:07.1: pciehp: Slot(6-1): Link Up pci 0000:2c:00.0: [8086:5786] type 01 class 0x060400 PCIe Switch Upstream Port ... pci 0000:2c:00.0: PTM enabled, 4ns granularity At this point we have only enumerated the Switch Upstream Port and now PTM got enabled which immediately triggers a flood of errors: pcieport 0000:00:07.1: AER: Multiple Uncorrectable (Non-Fatal) error message received from 0000:00:07.1 pcieport 0000:00:07.1: PCIe Bus Error: severity=Uncorrectable (Non-Fatal), type=Transaction Layer, (Receiver ID) pcieport 0000:00:07.1: device [8086:d44f] error status/mask=00200000/00000000 pcieport 0000:00:07.1: [21] ACSViol (First) pcieport 0000:00:07.1: AER: TLP Header: 0x34000000 0x00000052 0x00000000 0x00000000 pcieport 0000:00:07.1: AER: device recovery successful pcieport 0000:00:07.1: AER: Uncorrectable (Non-Fatal) error message received from 0000:00:07.1 In the above TLP Header the Requester ID is 0 which causes an error as we have ACS Source Validation enabled. Change the PTM enablement to happen at the time pci_enable_ptm() is called. It will try to enable PTM first for upstream devices before enabling for the Endpoint itself. For disable path we need to keep count of how many times PTM has been enabled and disable it only on the last, so change the dev->ptm_enabled to a counter (and rename it to dev->ptm_enable_cnt analogous to dev->pci_enable_cnt). Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20260224111044.3487873-6-mika.westerberg@linux.intel.com --- drivers/pci/pcie/ptm.c | 68 +++++++++++++++++++++++++++++--------------------- include/linux/pci.h | 2 +- 2 files changed, 40 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pcie/ptm.c b/drivers/pci/pcie/ptm.c index 2c848ae4f15f..a41ffd1914de 100644 --- a/drivers/pci/pcie/ptm.c +++ b/drivers/pci/pcie/ptm.c @@ -52,6 +52,7 @@ void pci_ptm_init(struct pci_dev *dev) return; dev->ptm_cap = ptm; + atomic_set(&dev->ptm_enable_cnt, 0); pci_add_ext_cap_save_buffer(dev, PCI_EXT_CAP_ID_PTM, sizeof(u32)); pci_read_config_dword(dev, ptm + PCI_PTM_CAP, &cap); @@ -85,10 +86,6 @@ void pci_ptm_init(struct pci_dev *dev) dev->ptm_responder = 1; if (cap & PCI_PTM_CAP_REQ) dev->ptm_requester = 1; - - if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT || - pci_pcie_type(dev) == PCI_EXP_TYPE_UPSTREAM) - pci_enable_ptm(dev); } void pci_save_ptm_state(struct pci_dev *dev) @@ -129,26 +126,11 @@ void pci_restore_ptm_state(struct pci_dev *dev) static int __pci_enable_ptm(struct pci_dev *dev) { u16 ptm = dev->ptm_cap; - struct pci_dev *ups; u32 ctrl; if (!ptm) return -EINVAL; - /* - * A device uses local PTM Messages to request time information - * from a PTM Root that's farther upstream. Every device along the - * path must support PTM and have it enabled so it can handle the - * messages. Therefore, if this device is not a PTM Root, the - * upstream link partner must have PTM enabled before we can enable - * PTM. - */ - if (!dev->ptm_root) { - ups = pci_upstream_ptm(dev); - if (!ups || !ups->ptm_enabled) - return -EINVAL; - } - switch (pci_pcie_type(dev)) { case PCI_EXP_TYPE_ROOT_PORT: if (!dev->ptm_root) @@ -193,11 +175,35 @@ int pci_enable_ptm(struct pci_dev *dev) int rc; char clock_desc[8]; + /* + * A device uses local PTM Messages to request time information + * from a PTM Root that's farther upstream. Every device along + * the path must support PTM and have it enabled so it can + * handle the messages. Therefore, if this device is not a PTM + * Root, the upstream link partner must have PTM enabled before + * we can enable PTM. + */ + if (!dev->ptm_root) { + struct pci_dev *parent; + + parent = pci_upstream_ptm(dev); + if (!parent) + return -EINVAL; + /* Enable PTM for the parent */ + rc = pci_enable_ptm(parent); + if (rc) + return rc; + } + + /* Already enabled? */ + if (atomic_inc_return(&dev->ptm_enable_cnt) > 1) + return 0; + rc = __pci_enable_ptm(dev); - if (rc) + if (rc) { + atomic_dec(&dev->ptm_enable_cnt); return rc; - - dev->ptm_enabled = 1; + } switch (dev->ptm_granularity) { case 0: @@ -239,27 +245,31 @@ static void __pci_disable_ptm(struct pci_dev *dev) */ void pci_disable_ptm(struct pci_dev *dev) { - if (dev->ptm_enabled) { + struct pci_dev *parent; + + if (atomic_dec_and_test(&dev->ptm_enable_cnt)) __pci_disable_ptm(dev); - dev->ptm_enabled = 0; - } + + parent = pci_upstream_ptm(dev); + if (parent) + pci_disable_ptm(parent); } EXPORT_SYMBOL(pci_disable_ptm); /* - * Disable PTM, but preserve dev->ptm_enabled so we silently re-enable it on + * Disable PTM, but preserve dev->ptm_enable_cnt so we silently re-enable it on * resume if necessary. */ void pci_suspend_ptm(struct pci_dev *dev) { - if (dev->ptm_enabled) + if (atomic_read(&dev->ptm_enable_cnt)) __pci_disable_ptm(dev); } /* If PTM was enabled before suspend, re-enable it when resuming */ void pci_resume_ptm(struct pci_dev *dev) { - if (dev->ptm_enabled) + if (atomic_read(&dev->ptm_enable_cnt)) __pci_enable_ptm(dev); } @@ -268,7 +278,7 @@ bool pcie_ptm_enabled(struct pci_dev *dev) if (!dev) return false; - return dev->ptm_enabled; + return atomic_read(&dev->ptm_enable_cnt); } EXPORT_SYMBOL(pcie_ptm_enabled); diff --git a/include/linux/pci.h b/include/linux/pci.h index 8aaa72dcb164..c620d4b6c52e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -518,7 +518,7 @@ struct pci_dev { unsigned int ptm_root:1; unsigned int ptm_responder:1; unsigned int ptm_requester:1; - unsigned int ptm_enabled:1; + atomic_t ptm_enable_cnt; u8 ptm_granularity; #endif #ifdef CONFIG_PCI_MSI -- cgit v1.2.3 From 11c0663a595801b6e6f7a937adec8532706ef486 Mon Sep 17 00:00:00 2001 From: Jens Emil Schulz Østergaard Date: Thu, 26 Feb 2026 09:24:19 +0100 Subject: net: phy: micrel: Add support for lan9645x internal phy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LAN9645X is a family of switch chips with 5 internal copper phys. The internal PHY is based on parts of LAN8832. This is a low-power, single port triple-speed (10BASE-T/100BASE-TX/1000BASE-T) ethernet physical layer transceiver (PHY) that supports transmission and reception of data on standard CAT-5, as well as CAT-5e and CAT-6 Unshielded Twisted Pair (UTP) cables. Add support for the internal PHY of the lan9645x chip family. Reviewed-by: Steen Hegelund Reviewed-by: Daniel Machon Signed-off-by: Jens Emil Schulz Østergaard Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20260226-phy_micrel_add_support_for_lan9645x_internal_phy-v3-1-1fe82379962b@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/micrel.c | 152 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/micrel_phy.h | 1 + 2 files changed, 153 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index c6b011a9d636..2aa1dedd21b8 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -6523,6 +6523,142 @@ static void lan8842_get_phy_stats(struct phy_device *phydev, stats->tx_errors = priv->phy_stats.tx_errors; } +#define LAN9645X_CTRL_REG 0x1f +#define LAN9645X_CTRL_REG_SW_SOFT_RST BIT(1) + +#define LAN9645X_DAC_ICAS_AMP_POWER_DOWN 0x47 +#define LAN9645X_BTRX_QBIAS_POWER_DOWN 0x46 +#define LAN9645X_TX_LOW_I_CH_CD_POWER_MGMT 0x45 +#define LAN9645X_TX_LOW_I_CH_B_POWER_MGMT 0x44 +#define LAN9645X_TX_LOW_I_CH_A_POWER_MGMT 0x43 + +static const struct lanphy_reg_data force_dac_tx_errata[] = { + /* Force channel A/B/C/D TX on */ + { LAN8814_PAGE_POWER_REGS, + LAN9645X_DAC_ICAS_AMP_POWER_DOWN, + 0 }, + /* Force channel A/B/C/D QBias on */ + { LAN8814_PAGE_POWER_REGS, + LAN9645X_BTRX_QBIAS_POWER_DOWN, + 0xaa }, + /* Tx low I on channel C/D overwrite */ + { LAN8814_PAGE_POWER_REGS, + LAN9645X_TX_LOW_I_CH_CD_POWER_MGMT, + 0xbfff }, + /* Channel B low I overwrite */ + { LAN8814_PAGE_POWER_REGS, + LAN9645X_TX_LOW_I_CH_B_POWER_MGMT, + 0xabbf }, + /* Channel A low I overwrite */ + { LAN8814_PAGE_POWER_REGS, + LAN9645X_TX_LOW_I_CH_A_POWER_MGMT, + 0xbd3f }, +}; + +static int lan9645x_config_init(struct phy_device *phydev) +{ + int ret; + + /* Apply erratas from previous generations. */ + ret = lan8842_erratas(phydev); + if (ret < 0) + return ret; + + /* Apply errata for an issue where bringing a port down, can cause a few + * CRC errors for traffic flowing through adjacent ports. + */ + return lanphy_write_reg_data(phydev, force_dac_tx_errata, + ARRAY_SIZE(force_dac_tx_errata)); +} + +static int lan9645x_suspend(struct phy_device *phydev) +{ + int ret, val; + + /* Force link down before software power down (SPD), by doing software + * soft reset. This resets the PHY, but keeps all register configuration + * intact. The bit self clears. + * + * This is needed as a workaround for an issue where performing SPD on a + * port can bring adjacent ports down, when there is traffic flowing + * through the ports. + */ + ret = phy_set_bits(phydev, LAN9645X_CTRL_REG, + LAN9645X_CTRL_REG_SW_SOFT_RST); + if (ret) + return ret; + + ret = phy_read_poll_timeout(phydev, LAN9645X_CTRL_REG, val, + !(val & LAN9645X_CTRL_REG_SW_SOFT_RST), + 3000, 100000, true); + if (ret) + return ret; + + return genphy_suspend(phydev); +} + +static int lan9645x_config_intr(struct phy_device *phydev) +{ + int err; + + /* enable / disable interrupts */ + if (phydev->interrupts == PHY_INTERRUPT_ENABLED) { + /* This is an internal PHY of lan9645x and is not possible to + * change the polarity of irq sources in the OIC (CPU_INTR) + * found in lan9645x. Therefore change the polarity of the + * interrupt in the PHY from being active low instead of active + * high. + */ + err = phy_write(phydev, LAN8804_CONTROL, + LAN8804_CONTROL_INTR_POLARITY); + if (err) + return err; + + /* By default interrupt buffer is open-drain in which case the + * interrupt can be active only low. Therefore change the + * interrupt buffer to be push-pull to be able to change + * interrupt polarity. + */ + err = phy_write(phydev, LAN8804_OUTPUT_CONTROL, + LAN8804_OUTPUT_CONTROL_INTR_BUFFER); + if (err) + return err; + + err = lan8814_ack_interrupt(phydev); + if (err) + return err; + + err = phy_write(phydev, LAN8814_INTC, + LAN8814_INT_LINK | LAN8814_INT_FLF); + } else { + err = phy_write(phydev, LAN8814_INTC, 0); + if (err) + return err; + + err = lan8814_ack_interrupt(phydev); + } + + return err; +} + +static irqreturn_t lan9645x_handle_interrupt(struct phy_device *phydev) +{ + int status; + + status = phy_read(phydev, LAN8814_INTS); + if (status < 0) { + phy_error(phydev); + return IRQ_NONE; + } + + if (status & (LAN8814_INT_LINK | LAN8814_INT_FLF)) { + phy_trigger_machine(phydev); + return IRQ_HANDLED; + } + + return IRQ_NONE; +} + static struct phy_driver ksphy_driver[] = { { PHY_ID_MATCH_MODEL(PHY_ID_KS8737), @@ -6761,6 +6897,21 @@ static struct phy_driver ksphy_driver[] = { .set_tunable = lan8842_set_tunable, .cable_test_start = lan8814_cable_test_start, .cable_test_get_status = ksz886x_cable_test_get_status, +}, { + PHY_ID_MATCH_MODEL(PHY_ID_LAN9645X), + .name = "Microchip LAN9645X Gigabit PHY", + .config_init = lan9645x_config_init, + .driver_data = &ksz9021_type, + .probe = kszphy_probe, + .soft_reset = genphy_soft_reset, + .suspend = lan9645x_suspend, + .resume = genphy_resume, + .config_intr = lan9645x_config_intr, + .handle_interrupt = lan9645x_handle_interrupt, + .get_tunable = lan8842_get_tunable, + .set_tunable = lan8842_set_tunable, + .get_phy_stats = lan8842_get_phy_stats, + .update_stats = lan8842_update_stats, }, { PHY_ID_MATCH_MODEL(PHY_ID_KSZ9131), .name = "Microchip KSZ9131 Gigabit PHY", @@ -6859,6 +7010,7 @@ static const struct mdio_device_id __maybe_unused micrel_tbl[] = { { PHY_ID_MATCH_MODEL(PHY_ID_LAN8804) }, { PHY_ID_MATCH_MODEL(PHY_ID_LAN8841) }, { PHY_ID_MATCH_MODEL(PHY_ID_LAN8842) }, + { PHY_ID_MATCH_MODEL(PHY_ID_LAN9645X) }, { } }; diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h index ca691641788b..9c6f9817383f 100644 --- a/include/linux/micrel_phy.h +++ b/include/linux/micrel_phy.h @@ -33,6 +33,7 @@ #define PHY_ID_LAN8804 0x00221670 #define PHY_ID_LAN8841 0x00221650 #define PHY_ID_LAN8842 0x002216C0 +#define PHY_ID_LAN9645X 0x002216D0 #define PHY_ID_KSZ886X 0x00221430 #define PHY_ID_KSZ8863 0x00221435 -- cgit v1.2.3 From 6466441a5ecd1c1168264e4c322bae455579b156 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Feb 2026 04:12:13 +0000 Subject: net: inline skb_add_rx_frag_netmem() This critical helper (via skb_add_rx_frag()) is mostly used from drivers rx fast path. It is time to inline it, this actually saves space in vmlinux: size vmlinux.old vmlinux text data bss dec hex filename 37350766 23092977 4846992 65290735 3e441ef vmlinux.old 37350600 23092977 4846992 65290569 3e44149 vmlinux Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260226041213.1892561-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 13 +++++++++++-- net/core/skbuff.c | 11 ----------- 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index daa4e4944ce3..9cc98f850f1d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2682,8 +2682,17 @@ static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i, shinfo->nr_frags = i + 1; } -void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, - int off, int size, unsigned int truesize); +static inline void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, + netmem_ref netmem, int off, + int size, unsigned int truesize) +{ + DEBUG_NET_WARN_ON_ONCE(size > truesize); + + skb_fill_netmem_desc(skb, i, netmem, off, size); + skb->len += size; + skb->data_len += size; + skb->truesize += truesize; +} static inline void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0e217041958a..513cbfed19bc 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -891,17 +891,6 @@ skb_fail: } EXPORT_SYMBOL(napi_alloc_skb); -void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem, - int off, int size, unsigned int truesize) -{ - DEBUG_NET_WARN_ON_ONCE(size > truesize); - - skb_fill_netmem_desc(skb, i, netmem, off, size); - skb->len += size; - skb->data_len += size; - skb->truesize += truesize; -} -EXPORT_SYMBOL(skb_add_rx_frag_netmem); void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, unsigned int truesize) -- cgit v1.2.3 From 8021729acf21f4bf3c43866b8919b68968028478 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 25 Feb 2026 21:12:57 -0800 Subject: iio: tsl2772: fix all kernel-doc warnings Use the correct kernel-doc notation for struct members to eliminate kernel-doc warnings: Warning: include/linux/platform_data/tsl2772.h:88 struct member 'prox_diode' not described in 'tsl2772_settings' Warning: include/linux/platform_data/tsl2772.h:88 struct member 'prox_power' not described in 'tsl2772_settings' Signed-off-by: Randy Dunlap Reviewed-by: Andy Shevchenko Signed-off-by: Jonathan Cameron --- include/linux/platform_data/tsl2772.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/tsl2772.h b/include/linux/platform_data/tsl2772.h index f8ade15a35e2..f042e82b39c3 100644 --- a/include/linux/platform_data/tsl2772.h +++ b/include/linux/platform_data/tsl2772.h @@ -61,9 +61,9 @@ struct tsl2772_lux { * @prox_pulse_count: Number if proximity emitter pulses. * @prox_max_samples_cal: The number of samples that are taken when performing * a proximity calibration. - * @prox_diode Which diode(s) to use for driving the external + * @prox_diode: Which diode(s) to use for driving the external * LED(s) for proximity sensing. - * @prox_power The amount of power to use for the external LED(s). + * @prox_power: The amount of power to use for the external LED(s). */ struct tsl2772_settings { int als_time; -- cgit v1.2.3 From a2be37eedb52ea26938fa4cc9de1ff84963c57ad Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 24 Feb 2026 11:42:04 +0100 Subject: firmware: exynos-acpm: Drop fake 'const' on handle pointer All the functions operating on the 'handle' pointer are claiming it is a pointer to const thus they should not modify the handle. In fact that's a false statement, because first thing these functions do is drop the cast to const with container_of: struct acpm_info *acpm = handle_to_acpm_info(handle); And with such cast the handle is easily writable with simple: acpm->handle.ops.pmic_ops.read_reg = NULL; The code is not correct logically, either, because functions like acpm_get_by_node() and acpm_handle_put() are meant to modify the handle reference counting, thus they must modify the handle. Modification here happens anyway, even if the reference counting is stored in the container which the handle is part of. The code does not have actual visible bug, but incorrect 'const' annotations could lead to incorrect compiler decisions. Fixes: a88927b534ba ("firmware: add Exynos ACPM protocol driver") Cc: stable@vger.kernel.org Signed-off-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20260224104203.42950-2-krzysztof.kozlowski@oss.qualcomm.com Signed-off-by: Krzysztof Kozlowski --- drivers/clk/samsung/clk-acpm.c | 4 +-- drivers/firmware/samsung/exynos-acpm-dvfs.c | 4 +-- drivers/firmware/samsung/exynos-acpm-dvfs.h | 4 +-- drivers/firmware/samsung/exynos-acpm-pmic.c | 10 +++--- drivers/firmware/samsung/exynos-acpm-pmic.h | 10 +++--- drivers/firmware/samsung/exynos-acpm.c | 16 +++++---- drivers/firmware/samsung/exynos-acpm.h | 2 +- drivers/mfd/sec-acpm.c | 10 +++--- .../linux/firmware/samsung/exynos-acpm-protocol.h | 40 +++++++++------------- 9 files changed, 48 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/samsung/clk-acpm.c b/drivers/clk/samsung/clk-acpm.c index b90809ce3f88..d8944160793a 100644 --- a/drivers/clk/samsung/clk-acpm.c +++ b/drivers/clk/samsung/clk-acpm.c @@ -20,7 +20,7 @@ struct acpm_clk { u32 id; struct clk_hw hw; unsigned int mbox_chan_id; - const struct acpm_handle *handle; + struct acpm_handle *handle; }; struct acpm_clk_variant { @@ -113,7 +113,7 @@ static int acpm_clk_register(struct device *dev, struct acpm_clk *aclk, static int acpm_clk_probe(struct platform_device *pdev) { - const struct acpm_handle *acpm_handle; + struct acpm_handle *acpm_handle; struct clk_hw_onecell_data *clk_data; struct clk_hw **hws; struct device *dev = &pdev->dev; diff --git a/drivers/firmware/samsung/exynos-acpm-dvfs.c b/drivers/firmware/samsung/exynos-acpm-dvfs.c index 17e7be7757b3..06bdf62dea1f 100644 --- a/drivers/firmware/samsung/exynos-acpm-dvfs.c +++ b/drivers/firmware/samsung/exynos-acpm-dvfs.c @@ -43,7 +43,7 @@ static void acpm_dvfs_init_set_rate_cmd(u32 cmd[4], unsigned int clk_id, cmd[3] = ktime_to_ms(ktime_get()); } -int acpm_dvfs_set_rate(const struct acpm_handle *handle, +int acpm_dvfs_set_rate(struct acpm_handle *handle, unsigned int acpm_chan_id, unsigned int clk_id, unsigned long rate) { @@ -63,7 +63,7 @@ static void acpm_dvfs_init_get_rate_cmd(u32 cmd[4], unsigned int clk_id) cmd[3] = ktime_to_ms(ktime_get()); } -unsigned long acpm_dvfs_get_rate(const struct acpm_handle *handle, +unsigned long acpm_dvfs_get_rate(struct acpm_handle *handle, unsigned int acpm_chan_id, unsigned int clk_id) { struct acpm_xfer xfer; diff --git a/drivers/firmware/samsung/exynos-acpm-dvfs.h b/drivers/firmware/samsung/exynos-acpm-dvfs.h index 9f2778e649c9..b37b15426102 100644 --- a/drivers/firmware/samsung/exynos-acpm-dvfs.h +++ b/drivers/firmware/samsung/exynos-acpm-dvfs.h @@ -11,10 +11,10 @@ struct acpm_handle; -int acpm_dvfs_set_rate(const struct acpm_handle *handle, +int acpm_dvfs_set_rate(struct acpm_handle *handle, unsigned int acpm_chan_id, unsigned int id, unsigned long rate); -unsigned long acpm_dvfs_get_rate(const struct acpm_handle *handle, +unsigned long acpm_dvfs_get_rate(struct acpm_handle *handle, unsigned int acpm_chan_id, unsigned int clk_id); diff --git a/drivers/firmware/samsung/exynos-acpm-pmic.c b/drivers/firmware/samsung/exynos-acpm-pmic.c index 26a9024d8ed8..0c50993cc9a8 100644 --- a/drivers/firmware/samsung/exynos-acpm-pmic.c +++ b/drivers/firmware/samsung/exynos-acpm-pmic.c @@ -77,7 +77,7 @@ static void acpm_pmic_init_read_cmd(u32 cmd[4], u8 type, u8 reg, u8 chan) cmd[3] = ktime_to_ms(ktime_get()); } -int acpm_pmic_read_reg(const struct acpm_handle *handle, +int acpm_pmic_read_reg(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 *buf) { @@ -107,7 +107,7 @@ static void acpm_pmic_init_bulk_read_cmd(u32 cmd[4], u8 type, u8 reg, u8 chan, FIELD_PREP(ACPM_PMIC_VALUE, count); } -int acpm_pmic_bulk_read(const struct acpm_handle *handle, +int acpm_pmic_bulk_read(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 count, u8 *buf) { @@ -150,7 +150,7 @@ static void acpm_pmic_init_write_cmd(u32 cmd[4], u8 type, u8 reg, u8 chan, cmd[3] = ktime_to_ms(ktime_get()); } -int acpm_pmic_write_reg(const struct acpm_handle *handle, +int acpm_pmic_write_reg(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 value) { @@ -187,7 +187,7 @@ static void acpm_pmic_init_bulk_write_cmd(u32 cmd[4], u8 type, u8 reg, u8 chan, } } -int acpm_pmic_bulk_write(const struct acpm_handle *handle, +int acpm_pmic_bulk_write(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 count, const u8 *buf) { @@ -220,7 +220,7 @@ static void acpm_pmic_init_update_cmd(u32 cmd[4], u8 type, u8 reg, u8 chan, cmd[3] = ktime_to_ms(ktime_get()); } -int acpm_pmic_update_reg(const struct acpm_handle *handle, +int acpm_pmic_update_reg(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 value, u8 mask) { diff --git a/drivers/firmware/samsung/exynos-acpm-pmic.h b/drivers/firmware/samsung/exynos-acpm-pmic.h index 078421888a14..88ae9aada2ae 100644 --- a/drivers/firmware/samsung/exynos-acpm-pmic.h +++ b/drivers/firmware/samsung/exynos-acpm-pmic.h @@ -11,19 +11,19 @@ struct acpm_handle; -int acpm_pmic_read_reg(const struct acpm_handle *handle, +int acpm_pmic_read_reg(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 *buf); -int acpm_pmic_bulk_read(const struct acpm_handle *handle, +int acpm_pmic_bulk_read(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 count, u8 *buf); -int acpm_pmic_write_reg(const struct acpm_handle *handle, +int acpm_pmic_write_reg(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 value); -int acpm_pmic_bulk_write(const struct acpm_handle *handle, +int acpm_pmic_bulk_write(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 count, const u8 *buf); -int acpm_pmic_update_reg(const struct acpm_handle *handle, +int acpm_pmic_update_reg(struct acpm_handle *handle, unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, u8 value, u8 mask); #endif /* __EXYNOS_ACPM_PMIC_H__ */ diff --git a/drivers/firmware/samsung/exynos-acpm.c b/drivers/firmware/samsung/exynos-acpm.c index c616ac951a0d..16c46ed60837 100644 --- a/drivers/firmware/samsung/exynos-acpm.c +++ b/drivers/firmware/samsung/exynos-acpm.c @@ -410,7 +410,7 @@ static int acpm_wait_for_message_response(struct acpm_chan *achan, * * Return: 0 on success, -errno otherwise. */ -int acpm_do_xfer(const struct acpm_handle *handle, const struct acpm_xfer *xfer) +int acpm_do_xfer(struct acpm_handle *handle, const struct acpm_xfer *xfer) { struct acpm_info *acpm = handle_to_acpm_info(handle); struct exynos_mbox_msg msg; @@ -674,7 +674,7 @@ static int acpm_probe(struct platform_device *pdev) * acpm_handle_put() - release the handle acquired by acpm_get_by_phandle. * @handle: Handle acquired by acpm_get_by_phandle. */ -static void acpm_handle_put(const struct acpm_handle *handle) +static void acpm_handle_put(struct acpm_handle *handle) { struct acpm_info *acpm = handle_to_acpm_info(handle); struct device *dev = acpm->dev; @@ -700,9 +700,11 @@ static void devm_acpm_release(struct device *dev, void *res) * @np: ACPM device tree node. * * Return: pointer to handle on success, ERR_PTR(-errno) otherwise. + * + * Note: handle CANNOT be pointer to const */ -static const struct acpm_handle *acpm_get_by_node(struct device *dev, - struct device_node *np) +static struct acpm_handle *acpm_get_by_node(struct device *dev, + struct device_node *np) { struct platform_device *pdev; struct device_link *link; @@ -743,10 +745,10 @@ static const struct acpm_handle *acpm_get_by_node(struct device *dev, * * Return: pointer to handle on success, ERR_PTR(-errno) otherwise. */ -const struct acpm_handle *devm_acpm_get_by_node(struct device *dev, - struct device_node *np) +struct acpm_handle *devm_acpm_get_by_node(struct device *dev, + struct device_node *np) { - const struct acpm_handle **ptr, *handle; + struct acpm_handle **ptr, *handle; ptr = devres_alloc(devm_acpm_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) diff --git a/drivers/firmware/samsung/exynos-acpm.h b/drivers/firmware/samsung/exynos-acpm.h index 8392fcb91f45..5df8354dc96c 100644 --- a/drivers/firmware/samsung/exynos-acpm.h +++ b/drivers/firmware/samsung/exynos-acpm.h @@ -17,7 +17,7 @@ struct acpm_xfer { struct acpm_handle; -int acpm_do_xfer(const struct acpm_handle *handle, +int acpm_do_xfer(struct acpm_handle *handle, const struct acpm_xfer *xfer); #endif /* __EXYNOS_ACPM_H__ */ diff --git a/drivers/mfd/sec-acpm.c b/drivers/mfd/sec-acpm.c index 537ea65685bf..0e23b9d9f7ee 100644 --- a/drivers/mfd/sec-acpm.c +++ b/drivers/mfd/sec-acpm.c @@ -367,7 +367,7 @@ static const struct regmap_config s2mpg11_regmap_config_meter = { }; struct sec_pmic_acpm_shared_bus_context { - const struct acpm_handle *acpm; + struct acpm_handle *acpm; unsigned int acpm_chan_id; u8 speedy_channel; }; @@ -390,7 +390,7 @@ static int sec_pmic_acpm_bus_write(void *context, const void *data, size_t count) { struct sec_pmic_acpm_bus_context *ctx = context; - const struct acpm_handle *acpm = ctx->shared->acpm; + struct acpm_handle *acpm = ctx->shared->acpm; const struct acpm_pmic_ops *pmic_ops = &acpm->ops.pmic_ops; size_t val_count = count - BITS_TO_BYTES(ACPM_ADDR_BITS); const u8 *d = data; @@ -410,7 +410,7 @@ static int sec_pmic_acpm_bus_read(void *context, const void *reg_buf, size_t reg void *val_buf, size_t val_size) { struct sec_pmic_acpm_bus_context *ctx = context; - const struct acpm_handle *acpm = ctx->shared->acpm; + struct acpm_handle *acpm = ctx->shared->acpm; const struct acpm_pmic_ops *pmic_ops = &acpm->ops.pmic_ops; const u8 *r = reg_buf; u8 reg; @@ -429,7 +429,7 @@ static int sec_pmic_acpm_bus_reg_update_bits(void *context, unsigned int reg, un unsigned int val) { struct sec_pmic_acpm_bus_context *ctx = context; - const struct acpm_handle *acpm = ctx->shared->acpm; + struct acpm_handle *acpm = ctx->shared->acpm; const struct acpm_pmic_ops *pmic_ops = &acpm->ops.pmic_ops; return pmic_ops->update_reg(acpm, ctx->shared->acpm_chan_id, ctx->type, reg & 0xff, @@ -480,7 +480,7 @@ static int sec_pmic_acpm_probe(struct platform_device *pdev) struct regmap *regmap_common, *regmap_pmic, *regmap; const struct sec_pmic_acpm_platform_data *pdata; struct sec_pmic_acpm_shared_bus_context *shared_ctx; - const struct acpm_handle *acpm; + struct acpm_handle *acpm; struct device *dev = &pdev->dev; int ret, irq; diff --git a/include/linux/firmware/samsung/exynos-acpm-protocol.h b/include/linux/firmware/samsung/exynos-acpm-protocol.h index 2091da965a5a..13f17dc4443b 100644 --- a/include/linux/firmware/samsung/exynos-acpm-protocol.h +++ b/include/linux/firmware/samsung/exynos-acpm-protocol.h @@ -14,30 +14,24 @@ struct acpm_handle; struct device_node; struct acpm_dvfs_ops { - int (*set_rate)(const struct acpm_handle *handle, - unsigned int acpm_chan_id, unsigned int clk_id, - unsigned long rate); - unsigned long (*get_rate)(const struct acpm_handle *handle, + int (*set_rate)(struct acpm_handle *handle, unsigned int acpm_chan_id, + unsigned int clk_id, unsigned long rate); + unsigned long (*get_rate)(struct acpm_handle *handle, unsigned int acpm_chan_id, unsigned int clk_id); }; struct acpm_pmic_ops { - int (*read_reg)(const struct acpm_handle *handle, - unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, - u8 *buf); - int (*bulk_read)(const struct acpm_handle *handle, - unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, - u8 count, u8 *buf); - int (*write_reg)(const struct acpm_handle *handle, - unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, - u8 value); - int (*bulk_write)(const struct acpm_handle *handle, - unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, - u8 count, const u8 *buf); - int (*update_reg)(const struct acpm_handle *handle, - unsigned int acpm_chan_id, u8 type, u8 reg, u8 chan, - u8 value, u8 mask); + int (*read_reg)(struct acpm_handle *handle, unsigned int acpm_chan_id, + u8 type, u8 reg, u8 chan, u8 *buf); + int (*bulk_read)(struct acpm_handle *handle, unsigned int acpm_chan_id, + u8 type, u8 reg, u8 chan, u8 count, u8 *buf); + int (*write_reg)(struct acpm_handle *handle, unsigned int acpm_chan_id, + u8 type, u8 reg, u8 chan, u8 value); + int (*bulk_write)(struct acpm_handle *handle, unsigned int acpm_chan_id, + u8 type, u8 reg, u8 chan, u8 count, const u8 *buf); + int (*update_reg)(struct acpm_handle *handle, unsigned int acpm_chan_id, + u8 type, u8 reg, u8 chan, u8 value, u8 mask); }; struct acpm_ops { @@ -56,12 +50,12 @@ struct acpm_handle { struct device; #if IS_ENABLED(CONFIG_EXYNOS_ACPM_PROTOCOL) -const struct acpm_handle *devm_acpm_get_by_node(struct device *dev, - struct device_node *np); +struct acpm_handle *devm_acpm_get_by_node(struct device *dev, + struct device_node *np); #else -static inline const struct acpm_handle *devm_acpm_get_by_node(struct device *dev, - struct device_node *np) +static inline struct acpm_handle *devm_acpm_get_by_node(struct device *dev, + struct device_node *np) { return NULL; } -- cgit v1.2.3 From 2a76a626670b2ef391da37f457e8e51f168432a6 Mon Sep 17 00:00:00 2001 From: Taha Ed-Dafili <0rayn.dev@gmail.com> Date: Thu, 26 Feb 2026 15:11:03 +0000 Subject: iio: core: Add IIO_EV_INFO_SCALE to event info Implement support for IIO_EV_INFO_SCALE in the internal enum iio_event_info to allow proper ABI compliance. This allows drivers (like the ADXL345) to expose event scale attributes using the standard IIO ABI rather than manual device attributes. Signed-off-by: Taha Ed-Dafili <0rayn.dev@gmail.com> Reviewed-by: David Lechner Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-event.c | 1 + include/linux/iio/types.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c index 4149efcd5539..a0d6fcf2a9c9 100644 --- a/drivers/iio/industrialio-event.c +++ b/drivers/iio/industrialio-event.c @@ -256,6 +256,7 @@ static const char * const iio_ev_info_text[] = { [IIO_EV_INFO_TAP2_MIN_DELAY] = "tap2_min_delay", [IIO_EV_INFO_RUNNING_PERIOD] = "runningperiod", [IIO_EV_INFO_RUNNING_COUNT] = "runningcount", + [IIO_EV_INFO_SCALE] = "scale", }; static enum iio_event_direction iio_ev_attr_dir(struct iio_dev_attr *attr) diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index 34eebad12d2c..4e3099defc1d 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -21,6 +21,7 @@ enum iio_event_info { IIO_EV_INFO_TAP2_MIN_DELAY, IIO_EV_INFO_RUNNING_PERIOD, IIO_EV_INFO_RUNNING_COUNT, + IIO_EV_INFO_SCALE, }; #define IIO_VAL_INT 1 -- cgit v1.2.3 From d3b693a13b39bce16e284e1c737874966b3a96de Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 17:47:43 -0800 Subject: spi: spi-mem: clean up kernel-doc in spi-mem.h Eliminate all kernel-doc warnings in spi-mem.h: - add missing struct member descriptions - don't use "struct" for function descrptions Warning: include/linux/spi/spi-mem.h:202 struct member 'cmd' not described in 'spi_mem_op' Warning: include/linux/spi/spi-mem.h:202 struct member 'addr' not described in 'spi_mem_op' Warning: include/linux/spi/spi-mem.h:202 struct member 'dummy' not described in 'spi_mem_op' Warning: include/linux/spi/spi-mem.h:202 struct member 'data' not described in 'spi_mem_op' Warning: include/linux/spi/spi-mem.h:286 Incorrect use of kernel-doc format: * struct spi_mem_get_drvdata() - get driver private data attached to a SPI mem Warning: include/linux/spi/spi-mem.h:298 Incorrect use of kernel-doc format: * struct spi_controller_mem_ops - SPI memory operations Warning: include/linux/spi/spi-mem.h:362 expecting prototype for struct spi_mem_set_drvdata. Prototype was for struct spi_controller_mem_ops instead Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260301014743.3133167-1-rdunlap@infradead.org Signed-off-by: Mark Brown --- include/linux/spi/spi-mem.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index 5774e554c0f0..37f709784350 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -130,11 +130,13 @@ enum spi_mem_data_dir { /** * struct spi_mem_op - describes a SPI memory operation + * @cmd: the complete command * @cmd.nbytes: number of opcode bytes (only 1 or 2 are valid). The opcode is * sent MSB-first. * @cmd.buswidth: number of IO lines used to transmit the command * @cmd.opcode: operation opcode * @cmd.dtr: whether the command opcode should be sent in DTR mode or not + * @addr: the address attributes * @addr.nbytes: number of address bytes to send. Can be zero if the operation * does not need to send an address * @addr.buswidth: number of IO lines used to transmit the address cycles @@ -143,10 +145,12 @@ enum spi_mem_data_dir { * Note that only @addr.nbytes are taken into account in this * address value, so users should make sure the value fits in the * assigned number of bytes. + * @dummy: data for dummy operation * @dummy.nbytes: number of dummy bytes to send after an opcode or address. Can * be zero if the operation does not require dummy bytes * @dummy.buswidth: number of IO lanes used to transmit the dummy bytes * @dummy.dtr: whether the dummy bytes should be sent in DTR mode or not + * @data: the data attributes * @data.buswidth: number of IO lanes used to send/receive the data * @data.dtr: whether the data should be sent in DTR mode or not * @data.ecc: whether error correction is required or not @@ -273,7 +277,7 @@ struct spi_mem { }; /** - * struct spi_mem_set_drvdata() - attach driver private data to a SPI mem + * spi_mem_set_drvdata() - attach driver private data to a SPI mem * device * @mem: memory device * @data: data to attach to the memory device @@ -284,7 +288,7 @@ static inline void spi_mem_set_drvdata(struct spi_mem *mem, void *data) } /** - * struct spi_mem_get_drvdata() - get driver private data attached to a SPI mem + * spi_mem_get_drvdata() - get driver private data attached to a SPI mem * device * @mem: memory device * -- cgit v1.2.3 From 73942a6ea26bd7e02b7c260b8b7aa942397be894 Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Tue, 24 Feb 2026 16:18:05 +0000 Subject: firmware: cs_dsp: Add API to hibernate the DSP For some parts, the DSP is kept running when in low power mode (hibernation), leaving the firmware ALSA controls enabled, but the registers are inaccessible. Attempts to access volatile firmware controls whilst in this state would produce errors in the kernel log due to a regmap_raw_read() into DSP registers whilst the regmap is in cache_only. To remove this error log, add a hibernating flag to indicate that the controls are inaccessible, so we no longer try to read or write to the registers whilst the regmap is in cache_only. This would still produce an error when trying to read or write to these controls, but this would be a different error (-EPERM instead of -EBUSY), and would not produce a spurious error log in the kernel. Upon wake from hibernation, the control caches are re-synced to the hardware, if the DSP is running. Signed-off-by: Stefan Binding Link: https://patch.msgid.link/20260224161821.93365-2-sbinding@opensource.cirrus.com Reviewed-by: Richard Fitzgerald Signed-off-by: Mark Brown --- drivers/firmware/cirrus/cs_dsp.c | 49 +++++++++++++++++++++++++++++++--- include/linux/firmware/cirrus/cs_dsp.h | 3 +++ 2 files changed, 48 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/cirrus/cs_dsp.c b/drivers/firmware/cirrus/cs_dsp.c index b4f1c01e3b5b..f9d8a883900d 100644 --- a/drivers/firmware/cirrus/cs_dsp.c +++ b/drivers/firmware/cirrus/cs_dsp.c @@ -515,6 +515,7 @@ void cs_dsp_init_debugfs(struct cs_dsp *dsp, struct dentry *debugfs_root) debugfs_create_bool("booted", 0444, root, &dsp->booted); debugfs_create_bool("running", 0444, root, &dsp->running); + debugfs_create_bool("hibernating", 0444, root, &dsp->hibernating); debugfs_create_x32("fw_id", 0444, root, &dsp->fw_id); debugfs_create_x32("fw_version", 0444, root, &dsp->fw_id_version); @@ -703,7 +704,7 @@ int cs_dsp_coeff_write_acked_control(struct cs_dsp_coeff_ctl *ctl, unsigned int lockdep_assert_held(&dsp->pwr_lock); - if (!dsp->running) + if (!dsp->running || dsp->hibernating) return -EPERM; ret = cs_dsp_coeff_base_reg(ctl, ®, 0); @@ -827,7 +828,7 @@ int cs_dsp_coeff_write_ctrl(struct cs_dsp_coeff_ctl *ctl, } ctl->set = 1; - if (ctl->enabled && ctl->dsp->running) + if (ctl->enabled && ctl->dsp->running && !ctl->dsp->hibernating) ret = cs_dsp_coeff_write_ctrl_raw(ctl, off, buf, len); if (ret < 0) @@ -920,12 +921,12 @@ int cs_dsp_coeff_read_ctrl(struct cs_dsp_coeff_ctl *ctl, return -EINVAL; if (ctl->flags & WMFW_CTL_FLAG_VOLATILE) { - if (ctl->enabled && ctl->dsp->running) + if (ctl->enabled && ctl->dsp->running && !ctl->dsp->hibernating) return cs_dsp_coeff_read_ctrl_raw(ctl, off, buf, len); else return -EPERM; } else { - if (!ctl->flags && ctl->enabled && ctl->dsp->running) + if (!ctl->flags && ctl->enabled && ctl->dsp->running && !ctl->dsp->hibernating) ret = cs_dsp_coeff_read_ctrl_raw(ctl, 0, ctl->cache, ctl->len); if (buf != ctl->cache) @@ -1108,6 +1109,44 @@ err_ctl: return ret; } + +/** + * cs_dsp_hibernate() - Disable or enable all controls for a DSP + * @dsp: pointer to DSP structure + * @hibernate: whether to set controls to cache only mode + * + * When @hibernate is true, the DSP is entering hibernation mode where the + * regmap is inaccessible, and all controls become cache only. + * When @hibernate is false, the DSP has exited hibernation mode. If the DSP + * is running, all controls are re-synced to the DSP. + * + */ +void cs_dsp_hibernate(struct cs_dsp *dsp, bool hibernate) +{ + mutex_lock(&dsp->pwr_lock); + + if (!dsp->running) { + cs_dsp_dbg(dsp, "Cannot hibernate, DSP not running\n"); + goto out; + } + + if (dsp->hibernating == hibernate) + goto out; + + cs_dsp_dbg(dsp, "Set hibernating to %d\n", hibernate); + dsp->hibernating = hibernate; + + if (!dsp->hibernating && dsp->running) { + int ret = cs_dsp_coeff_sync_controls(dsp); + + if (ret) + cs_dsp_err(dsp, "Error syncing controls: %d\n", ret); + } +out: + mutex_unlock(&dsp->pwr_lock); +} +EXPORT_SYMBOL_NS_GPL(cs_dsp_hibernate, "FW_CS_DSP"); + struct cs_dsp_coeff_parsed_alg { int id; const u8 *name; @@ -2498,6 +2537,7 @@ int cs_dsp_adsp1_power_up(struct cs_dsp *dsp, goto err_ena; dsp->booted = true; + dsp->hibernating = false; /* Start the core running */ regmap_update_bits(dsp->regmap, dsp->base + ADSP1_CONTROL_30, @@ -2776,6 +2816,7 @@ int cs_dsp_power_up(struct cs_dsp *dsp, dsp->ops->disable_core(dsp); dsp->booted = true; + dsp->hibernating = false; mutex_unlock(&dsp->pwr_lock); diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h index 0ec1cdc5585d..4e3baa557068 100644 --- a/include/linux/firmware/cirrus/cs_dsp.h +++ b/include/linux/firmware/cirrus/cs_dsp.h @@ -179,6 +179,7 @@ struct cs_dsp { bool booted; bool running; + bool hibernating; struct list_head ctl_list; @@ -354,4 +355,6 @@ int cs_dsp_chunk_write(struct cs_dsp_chunk *ch, int nbits, u32 val); int cs_dsp_chunk_flush(struct cs_dsp_chunk *ch); int cs_dsp_chunk_read(struct cs_dsp_chunk *ch, int nbits); +void cs_dsp_hibernate(struct cs_dsp *dsp, bool hibernating); + #endif -- cgit v1.2.3 From bd77375097357b46af00db1316ceab5e82ccbc8b Mon Sep 17 00:00:00 2001 From: Kavita Kavita Date: Fri, 27 Feb 2026 00:25:51 +0530 Subject: wifi: cfg80211: add support for IEEE 802.1X Authentication Protocol Add an extended feature flag NL80211_EXT_FEATURE_IEEE8021X_AUTH to allow a driver to indicate support for the IEEE 802.1X authentication protocol in non-AP STA mode, as defined in "IEEE P802.11bi/D4.0, 12.16.5". In case of SME in userspace, the Authentication frame body is prepared in userspace while the driver finalizes the Authentication frame once it receives the required fields and elements. The driver indicates support for IEEE 802.1X authentication using the extended feature flag so that userspace can initiate IEEE 802.1X authentication. When the feature flag is set, process IEEE 802.1X Authentication frames from userspace in non-AP STA mode. If the flag is not set, reject IEEE 802.1X Authentication frames. Define a new authentication type NL80211_AUTHTYPE_IEEE8021X for IEEE 802.1X authentication. Signed-off-by: Kavita Kavita Link: https://patch.msgid.link/20260226185553.1516290-4-kavita.kavita@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + include/uapi/linux/nl80211.h | 9 +++++++++ net/wireless/nl80211.c | 14 ++++++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 0aa2fb8f88de..1bf806f85372 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1358,6 +1358,7 @@ struct ieee80211_tdls_data { #define WLAN_AUTH_FILS_SK 4 #define WLAN_AUTH_FILS_SK_PFS 5 #define WLAN_AUTH_FILS_PK 6 +#define WLAN_AUTH_IEEE8021X 8 #define WLAN_AUTH_EPPKE 9 #define WLAN_AUTH_LEAP 128 diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index fe2c8c8d6dd6..0b7a06c2b9f7 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5491,6 +5491,8 @@ enum nl80211_bss_status { * @NL80211_AUTHTYPE_FILS_SK_PFS: Fast Initial Link Setup shared key with PFS * @NL80211_AUTHTYPE_FILS_PK: Fast Initial Link Setup public key * @NL80211_AUTHTYPE_EPPKE: Enhanced Privacy Protection Key Exchange + * @NL80211_AUTHTYPE_IEEE8021X: IEEE 802.1X authentication utilizing + * Authentication frames * @__NL80211_AUTHTYPE_NUM: internal * @NL80211_AUTHTYPE_MAX: maximum valid auth algorithm * @NL80211_AUTHTYPE_AUTOMATIC: determine automatically (if necessary by @@ -5507,6 +5509,7 @@ enum nl80211_auth_type { NL80211_AUTHTYPE_FILS_SK_PFS, NL80211_AUTHTYPE_FILS_PK, NL80211_AUTHTYPE_EPPKE, + NL80211_AUTHTYPE_IEEE8021X, /* keep last */ __NL80211_AUTHTYPE_NUM, @@ -6820,6 +6823,11 @@ enum nl80211_feature_flags { * frames in both non‑AP STA and AP mode as specified in * "IEEE P802.11bi/D3.0, 12.16.6". * + * @NL80211_EXT_FEATURE_IEEE8021X_AUTH: Driver supports IEEE 802.1X + * authentication utilizing Authentication frames with user space SME + * (NL80211_CMD_AUTHENTICATE) in non-AP STA mode, as specified in + * "IEEE P802.11bi/D4.0, 12.16.5". + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -6898,6 +6906,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_BEACON_RATE_EHT, NL80211_EXT_FEATURE_EPPKE, NL80211_EXT_FEATURE_ASSOC_FRAME_ENCRYPTION, + NL80211_EXT_FEATURE_IEEE8021X_AUTH, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f54b3cca6975..de7956dbe0a0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6550,6 +6550,10 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, NL80211_EXT_FEATURE_EPPKE) && auth_type == NL80211_AUTHTYPE_EPPKE) return false; + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_IEEE8021X_AUTH) && + auth_type == NL80211_AUTHTYPE_IEEE8021X) + return false; return true; case NL80211_CMD_CONNECT: if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) && @@ -6571,6 +6575,10 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, NL80211_EXT_FEATURE_EPPKE) && auth_type == NL80211_AUTHTYPE_EPPKE) return false; + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_IEEE8021X_AUTH) && + auth_type == NL80211_AUTHTYPE_IEEE8021X) + return false; return true; case NL80211_CMD_START_AP: if (!wiphy_ext_feature_isset(&rdev->wiphy, @@ -12103,7 +12111,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) auth_type == NL80211_AUTHTYPE_FILS_SK || auth_type == NL80211_AUTHTYPE_FILS_SK_PFS || auth_type == NL80211_AUTHTYPE_FILS_PK || - auth_type == NL80211_AUTHTYPE_EPPKE) && + auth_type == NL80211_AUTHTYPE_EPPKE || + auth_type == NL80211_AUTHTYPE_IEEE8021X) && !info->attrs[NL80211_ATTR_AUTH_DATA]) return -EINVAL; @@ -12112,7 +12121,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) auth_type != NL80211_AUTHTYPE_FILS_SK && auth_type != NL80211_AUTHTYPE_FILS_SK_PFS && auth_type != NL80211_AUTHTYPE_FILS_PK && - auth_type != NL80211_AUTHTYPE_EPPKE) + auth_type != NL80211_AUTHTYPE_EPPKE && + auth_type != NL80211_AUTHTYPE_IEEE8021X) return -EINVAL; req.auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]); req.auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]); -- cgit v1.2.3 From 9347878b1513beee1a26bb249f5dc8326d450f75 Mon Sep 17 00:00:00 2001 From: Kavita Kavita Date: Fri, 27 Feb 2026 00:25:52 +0530 Subject: wifi: mac80211: Add support for IEEE 802.1X authentication protocol in non-AP STA mode Add support for the IEEE 802.1X authentication protocol in non-AP STA mode, as specified in "IEEE P802.11bi/D4.0, 12.16.5". IEEE 802.1X authentication involves multiple Authentication frame exchanges, with the non-AP STA and AP alternating transaction sequence numbers. The number of Authentication frame exchanges depends on the EAP method in use. For IEEE 802.1X authentication, process only Authentication frames with the expected transaction sequence number. For IEEE 802.1X Authentication, Table 9-71 specifies that the Encapsulation Length field as specified in Clause 9.4.1.82 shall be present in all IEEE 802.1X Authentication frames. Drop the frame in the mac80211 if the Encapsulation Length field is missing. After receiving the final Authentication frame with status code WLAN_STATUS_8021X_AUTH_SUCCESS from the AP, mac80211 marks the state as authenticated, as it indicates the EAP handshake has completed successfully over the Authentication frames as specified in Clause 12.16.5. In the PMKSA caching case, only two Authentication frames are exchanged if the AP identifies a valid PMKSA, then as specified in Clause 12.16.8.3, the AP shall set the Status Code to WLAN_STATUS_SUCCESS in the final Authentication frame and must not include an encapsulated EAPOL PDU. This frame will be the final Authentication frame from the AP when PMKSA caching is enabled, and mac80211 marks the state as authenticated. In case of authentication success or failure, forward the Authentication frame to userspace(e.g. wpa_supplicant), and let userspace validate the Authentication frame from the AP as per the specification. Signed-off-by: Kavita Kavita Link: https://patch.msgid.link/20260226185553.1516290-5-kavita.kavita@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + net/mac80211/mlme.c | 78 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 73 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 1bf806f85372..3651b2e6c518 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1508,6 +1508,7 @@ enum ieee80211_statuscode { WLAN_STATUS_SAE_PK = 127, WLAN_STATUS_DENIED_TID_TO_LINK_MAPPING = 133, WLAN_STATUS_PREF_TID_TO_LINK_MAPPING_SUGGESTED = 134, + WLAN_STATUS_8021X_AUTH_SUCCESS = 153, }; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 810bea1aacc5..7957eacc5ab7 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4920,7 +4920,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - u16 auth_alg, auth_transaction, status_code; + u16 auth_alg, auth_transaction, status_code, encap_len; struct ieee80211_event event = { .type = MLME_EVENT, .u.mlme.data = AUTH_EVENT, @@ -4929,6 +4929,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, .subtype = IEEE80211_STYPE_AUTH, }; bool sae_need_confirm = false; + bool auth_fail = false; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -4945,6 +4946,15 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction); status_code = le16_to_cpu(mgmt->u.auth.status_code); + /* + * IEEE 802.1X Authentication: + * Header + Authentication Algorithm Number(2 byte) + Authentication + * Transaction Sequence Number(2 byte) + Status Code(2 byte) + + * Encapsulation Length(2 byte). + */ + if (auth_alg == WLAN_AUTH_IEEE8021X && len < 24 + 8) + return; + info.link_id = ifmgd->auth_data->link_id; if (auth_alg != ifmgd->auth_data->algorithm || @@ -4960,7 +4970,24 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, goto notify_driver; } - if (status_code != WLAN_STATUS_SUCCESS) { + switch (auth_alg) { + case WLAN_AUTH_IEEE8021X: + if (status_code != WLAN_STATUS_SUCCESS && + status_code != WLAN_STATUS_8021X_AUTH_SUCCESS) + auth_fail = true; + + if (!auth_fail) { + /* Indicates length of encapsulated EAPOL PDU */ + encap_len = get_unaligned_le16(mgmt->u.auth.variable); + } + break; + default: + if (status_code != WLAN_STATUS_SUCCESS) + auth_fail = true; + break; + } + + if (auth_fail) { cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); if (auth_alg == WLAN_AUTH_SAE && @@ -4997,6 +5024,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, case WLAN_AUTH_FILS_SK_PFS: case WLAN_AUTH_FILS_PK: case WLAN_AUTH_EPPKE: + case WLAN_AUTH_IEEE8021X: break; case WLAN_AUTH_SHARED_KEY: if (ifmgd->auth_data->expected_transaction != 4) { @@ -5017,8 +5045,37 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, if (ifmgd->auth_data->algorithm != WLAN_AUTH_SAE || (auth_transaction == 2 && ifmgd->auth_data->expected_transaction == 2)) { - if (!ieee80211_mark_sta_auth(sdata)) - return; /* ignore frame -- wait for timeout */ + switch (ifmgd->auth_data->algorithm) { + case WLAN_AUTH_IEEE8021X: + /* + * IEEE 802.1X authentication: + * - When the full EAP handshake completes over the + * Authentication process, the responder sets the + * Status Code to WLAN_STATUS_8021X_AUTH_SUCCESS as + * specified in "IEEE P802.11bi/D4.0, 12.16.5". + * + * - In the PMKSA caching case, only two Authentication + * frames are exchanged if the responder (e.g., AP) + * identifies a valid PMKSA, then as specified in + * "IEEE P802.11bi/D4.0, 12.16.8.3", the responder + * shall set the Status Code to SUCCESS in the final + * Authentication frame and must not include an + * encapsulated EAPOL PDU. + * + * Both conditions are treated as successful + * authentication, so mark the state to Authenticated. + */ + if (status_code != WLAN_STATUS_8021X_AUTH_SUCCESS && + !(status_code == WLAN_STATUS_SUCCESS && + encap_len == 0)) + break; + fallthrough; + default: + if (!ieee80211_mark_sta_auth(sdata)) + return; /* ignore frame -- wait for timeout */ + + break; + } } else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE && auth_transaction == 1) { sae_need_confirm = true; @@ -8460,6 +8517,10 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata) } else if (auth_data->algorithm == WLAN_AUTH_EPPKE) { trans = auth_data->trans; status = auth_data->status; + } else if (auth_data->algorithm == WLAN_AUTH_IEEE8021X) { + trans = auth_data->trans; + status = auth_data->status; + auth_data->expected_transaction = trans + 1; } if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) @@ -9117,7 +9178,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, } if (ifmgd->auth_data && - ifmgd->auth_data->algorithm == WLAN_AUTH_EPPKE) + (ifmgd->auth_data->algorithm == WLAN_AUTH_EPPKE || + ifmgd->auth_data->algorithm == WLAN_AUTH_IEEE8021X)) new_sta->sta.epp_peer = true; new_sta->sta.mlo = mlo; @@ -9377,6 +9439,9 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, case NL80211_AUTHTYPE_EPPKE: auth_alg = WLAN_AUTH_EPPKE; break; + case NL80211_AUTHTYPE_IEEE8021X: + auth_alg = WLAN_AUTH_IEEE8021X; + break; default: return -EOPNOTSUPP; } @@ -9402,7 +9467,8 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, if (req->auth_data_len >= 4) { if (req->auth_type == NL80211_AUTHTYPE_SAE || - req->auth_type == NL80211_AUTHTYPE_EPPKE) { + req->auth_type == NL80211_AUTHTYPE_EPPKE || + req->auth_type == NL80211_AUTHTYPE_IEEE8021X) { __le16 *pos = (__le16 *) req->auth_data; auth_data->trans = le16_to_cpu(pos[0]); -- cgit v1.2.3 From 25ab7b6f34c74ea555b4489b57f7219612991433 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 16 Feb 2026 14:32:02 +0100 Subject: xattr: remove rbtree-based simple_xattr infrastructure Now that all consumers (shmem, kernfs, pidfs) have been converted to use the rhashtable-based simple_xattrs with pointer-based lazy allocation, remove the legacy rbtree code path. The rhashtable implementation provides O(1) average-case lookup with RCU-based lockless reads, replacing the O(log n) rbtree with reader-writer spinlock contention. Link: https://patch.msgid.link/20260216-work-xattr-socket-v1-6-c2efa4f74cb7@kernel.org Acked-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/xattr.c | 387 +++++++++++++------------------------------------- include/linux/xattr.h | 12 +- 2 files changed, 103 insertions(+), 296 deletions(-) (limited to 'include/linux') diff --git a/fs/xattr.c b/fs/xattr.c index eb45ae0fd17f..64803097e1dc 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1200,20 +1200,18 @@ void simple_xattr_free(struct simple_xattr *xattr) static void simple_xattr_rcu_free(struct rcu_head *head) { - struct simple_xattr *xattr; + struct simple_xattr *xattr = container_of(head, struct simple_xattr, rcu); - xattr = container_of(head, struct simple_xattr, rcu); simple_xattr_free(xattr); } /** - * simple_xattr_free_rcu - free an xattr object after an RCU grace period + * simple_xattr_free_rcu - free an xattr object with RCU delay * @xattr: the xattr object * - * Schedule RCU-deferred freeing of an xattr entry. This is used by - * rhashtable-based callers of simple_xattr_set() that replace or remove - * an existing entry while concurrent RCU readers may still be accessing - * it. + * Free the xattr object after an RCU grace period. This must be used when + * the xattr was removed from a data structure that concurrent RCU readers + * may still be traversing. Can handle @xattr being NULL. */ void simple_xattr_free_rcu(struct simple_xattr *xattr) { @@ -1254,43 +1252,6 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) return new_xattr; } -/** - * rbtree_simple_xattr_cmp - compare xattr name with current rbtree xattr entry - * @key: xattr name - * @node: current node - * - * Compare the xattr name with the xattr name attached to @node in the rbtree. - * - * Return: Negative value if continuing left, positive if continuing right, 0 - * if the xattr attached to @node matches @key. - */ -static int rbtree_simple_xattr_cmp(const void *key, const struct rb_node *node) -{ - const char *xattr_name = key; - const struct simple_xattr *xattr; - - xattr = rb_entry(node, struct simple_xattr, rb_node); - return strcmp(xattr->name, xattr_name); -} - -/** - * rbtree_simple_xattr_node_cmp - compare two xattr rbtree nodes - * @new_node: new node - * @node: current node - * - * Compare the xattr attached to @new_node with the xattr attached to @node. - * - * Return: Negative value if continuing left, positive if continuing right, 0 - * if the xattr attached to @new_node matches the xattr attached to @node. - */ -static int rbtree_simple_xattr_node_cmp(struct rb_node *new_node, - const struct rb_node *node) -{ - struct simple_xattr *xattr; - xattr = rb_entry(new_node, struct simple_xattr, rb_node); - return rbtree_simple_xattr_cmp(xattr->name, node); -} - static u32 simple_xattr_hashfn(const void *data, u32 len, u32 seed) { const char *name = data; @@ -1336,41 +1297,19 @@ static const struct rhashtable_params simple_xattr_params = { int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size) { - struct simple_xattr *xattr = NULL; + struct simple_xattr *xattr; int ret = -ENODATA; - if (xattrs->use_rhashtable) { - guard(rcu)(); - xattr = rhashtable_lookup(&xattrs->ht, name, - simple_xattr_params); - if (xattr) { - ret = xattr->size; - if (buffer) { - if (size < xattr->size) - ret = -ERANGE; - else - memcpy(buffer, xattr->value, - xattr->size); - } - } - } else { - struct rb_node *rbp; - - read_lock(&xattrs->lock); - rbp = rb_find(name, &xattrs->rb_root, - rbtree_simple_xattr_cmp); - if (rbp) { - xattr = rb_entry(rbp, struct simple_xattr, rb_node); - ret = xattr->size; - if (buffer) { - if (size < xattr->size) - ret = -ERANGE; - else - memcpy(buffer, xattr->value, - xattr->size); - } + guard(rcu)(); + xattr = rhashtable_lookup(&xattrs->ht, name, simple_xattr_params); + if (xattr) { + ret = xattr->size; + if (buffer) { + if (size < xattr->size) + ret = -ERANGE; + else + memcpy(buffer, xattr->value, xattr->size); } - read_unlock(&xattrs->lock); } return ret; } @@ -1398,6 +1337,11 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, * nothing if XATTR_CREATE is specified in @flags or @flags is zero. For * XATTR_REPLACE we fail as mentioned above. * + * Note: Callers must externally serialize writes. All current callers hold + * the inode lock for write operations. The lookup->replace/remove sequence + * is not atomic with respect to the rhashtable's per-bucket locking, but + * is safe because writes are serialized by the caller. + * * Return: On success, the removed or replaced xattr is returned, to be freed * by the caller; or NULL if none. On failure a negative error code is returned. */ @@ -1406,7 +1350,7 @@ struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, size_t size, int flags) { struct simple_xattr *old_xattr = NULL; - int err = 0; + int err; CLASS(simple_xattr, new_xattr)(value, size); if (IS_ERR(new_xattr)) @@ -1418,119 +1362,52 @@ struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, return ERR_PTR(-ENOMEM); } - if (xattrs->use_rhashtable) { - /* - * Lookup is safe without RCU here since writes are - * serialized by the caller. - */ - old_xattr = rhashtable_lookup_fast(&xattrs->ht, name, - simple_xattr_params); - - if (old_xattr) { - /* Fail if XATTR_CREATE is requested and the xattr exists. */ - if (flags & XATTR_CREATE) - return ERR_PTR(-EEXIST); - - if (new_xattr) { - err = rhashtable_replace_fast(&xattrs->ht, - &old_xattr->hash_node, - &new_xattr->hash_node, - simple_xattr_params); - if (err) - return ERR_PTR(err); - } else { - err = rhashtable_remove_fast(&xattrs->ht, - &old_xattr->hash_node, - simple_xattr_params); - if (err) - return ERR_PTR(err); - } - } else { - /* Fail if XATTR_REPLACE is requested but no xattr is found. */ - if (flags & XATTR_REPLACE) - return ERR_PTR(-ENODATA); - - /* - * If XATTR_CREATE or no flags are specified together - * with a new value simply insert it. - */ - if (new_xattr) { - err = rhashtable_insert_fast(&xattrs->ht, - &new_xattr->hash_node, - simple_xattr_params); - if (err) - return ERR_PTR(err); - } - - /* - * If XATTR_CREATE or no flags are specified and - * neither an old or new xattr exist then we don't - * need to do anything. - */ - } - } else { - struct rb_node *parent = NULL, **rbp; - int ret; - - write_lock(&xattrs->lock); - rbp = &xattrs->rb_root.rb_node; - while (*rbp) { - parent = *rbp; - ret = rbtree_simple_xattr_cmp(name, *rbp); - if (ret < 0) - rbp = &(*rbp)->rb_left; - else if (ret > 0) - rbp = &(*rbp)->rb_right; - else - old_xattr = rb_entry(*rbp, struct simple_xattr, - rb_node); - if (old_xattr) - break; - } + /* Lookup is safe without RCU here since writes are serialized. */ + old_xattr = rhashtable_lookup_fast(&xattrs->ht, name, + simple_xattr_params); - if (old_xattr) { - /* Fail if XATTR_CREATE is requested and the xattr exists. */ - if (flags & XATTR_CREATE) { - err = -EEXIST; - goto out_unlock; - } + if (old_xattr) { + /* Fail if XATTR_CREATE is requested and the xattr exists. */ + if (flags & XATTR_CREATE) + return ERR_PTR(-EEXIST); - if (new_xattr) - rb_replace_node(&old_xattr->rb_node, - &new_xattr->rb_node, - &xattrs->rb_root); - else - rb_erase(&old_xattr->rb_node, - &xattrs->rb_root); + if (new_xattr) { + err = rhashtable_replace_fast(&xattrs->ht, + &old_xattr->hash_node, + &new_xattr->hash_node, + simple_xattr_params); + if (err) + return ERR_PTR(err); } else { - /* Fail if XATTR_REPLACE is requested but no xattr is found. */ - if (flags & XATTR_REPLACE) { - err = -ENODATA; - goto out_unlock; - } - - /* - * If XATTR_CREATE or no flags are specified together - * with a new value simply insert it. - */ - if (new_xattr) { - rb_link_node(&new_xattr->rb_node, parent, rbp); - rb_insert_color(&new_xattr->rb_node, - &xattrs->rb_root); - } + err = rhashtable_remove_fast(&xattrs->ht, + &old_xattr->hash_node, + simple_xattr_params); + if (err) + return ERR_PTR(err); + } + } else { + /* Fail if XATTR_REPLACE is requested but no xattr is found. */ + if (flags & XATTR_REPLACE) + return ERR_PTR(-ENODATA); - /* - * If XATTR_CREATE or no flags are specified and - * neither an old or new xattr exist then we don't - * need to do anything. - */ + /* + * If XATTR_CREATE or no flags are specified together with a + * new value simply insert it. + */ + if (new_xattr) { + err = rhashtable_insert_fast(&xattrs->ht, + &new_xattr->hash_node, + simple_xattr_params); + if (err) + return ERR_PTR(err); } -out_unlock: - write_unlock(&xattrs->lock); - if (err) - return ERR_PTR(err); + /* + * If XATTR_CREATE or no flags are specified and neither an + * old or new xattr exist then we don't need to do anything. + */ } + retain_and_null_ptr(new_xattr); return old_xattr; } @@ -1572,6 +1449,7 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size) { bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); + struct rhashtable_iter iter; struct simple_xattr *xattr; ssize_t remaining_size = size; int err = 0; @@ -1595,77 +1473,34 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, if (!xattrs) return size - remaining_size; - if (xattrs->use_rhashtable) { - struct rhashtable_iter iter; - - rhashtable_walk_enter(&xattrs->ht, &iter); - rhashtable_walk_start(&iter); - - while ((xattr = rhashtable_walk_next(&iter)) != NULL) { - if (IS_ERR(xattr)) { - if (PTR_ERR(xattr) == -EAGAIN) - continue; - err = PTR_ERR(xattr); - break; - } - - /* skip "trusted." attributes for unprivileged callers */ - if (!trusted && xattr_is_trusted(xattr->name)) - continue; + rhashtable_walk_enter(&xattrs->ht, &iter); + rhashtable_walk_start(&iter); - /* skip MAC labels; these are provided by LSM above */ - if (xattr_is_maclabel(xattr->name)) + while ((xattr = rhashtable_walk_next(&iter)) != NULL) { + if (IS_ERR(xattr)) { + if (PTR_ERR(xattr) == -EAGAIN) continue; - - err = xattr_list_one(&buffer, &remaining_size, - xattr->name); - if (err) - break; + err = PTR_ERR(xattr); + break; } - rhashtable_walk_stop(&iter); - rhashtable_walk_exit(&iter); - } else { - struct rb_node *rbp; - - read_lock(&xattrs->lock); - for (rbp = rb_first(&xattrs->rb_root); rbp; - rbp = rb_next(rbp)) { - xattr = rb_entry(rbp, struct simple_xattr, rb_node); - - /* skip "trusted." attributes for unprivileged callers */ - if (!trusted && xattr_is_trusted(xattr->name)) - continue; + /* skip "trusted." attributes for unprivileged callers */ + if (!trusted && xattr_is_trusted(xattr->name)) + continue; - /* skip MAC labels; these are provided by LSM above */ - if (xattr_is_maclabel(xattr->name)) - continue; + /* skip MAC labels; these are provided by LSM above */ + if (xattr_is_maclabel(xattr->name)) + continue; - err = xattr_list_one(&buffer, &remaining_size, - xattr->name); - if (err) - break; - } - read_unlock(&xattrs->lock); + err = xattr_list_one(&buffer, &remaining_size, xattr->name); + if (err) + break; } - return err ? err : size - remaining_size; -} + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); -/** - * rbtree_simple_xattr_less - compare two xattr rbtree nodes - * @new_node: new node - * @node: current node - * - * Compare the xattr attached to @new_node with the xattr attached to @node. - * Note that this function technically tolerates duplicate entries. - * - * Return: True if insertion point in the rbtree is found. - */ -static bool rbtree_simple_xattr_less(struct rb_node *new_node, - const struct rb_node *node) -{ - return rbtree_simple_xattr_node_cmp(new_node, node) < 0; + return err ? err : size - remaining_size; } /** @@ -1676,33 +1511,29 @@ static bool rbtree_simple_xattr_less(struct rb_node *new_node, * Add an xattr object to @xattrs. This assumes no replacement or removal * of matching xattrs is wanted. Should only be called during inode * initialization when a few distinct initial xattrs are supposed to be set. + * + * Return: On success zero is returned. On failure a negative error code is + * returned. */ int simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr) { - if (xattrs->use_rhashtable) - return rhashtable_insert_fast(&xattrs->ht, - &new_xattr->hash_node, - simple_xattr_params); - - write_lock(&xattrs->lock); - rb_add(&new_xattr->rb_node, &xattrs->rb_root, - rbtree_simple_xattr_less); - write_unlock(&xattrs->lock); - return 0; + return rhashtable_insert_fast(&xattrs->ht, &new_xattr->hash_node, + simple_xattr_params); } /** * simple_xattrs_init - initialize new xattr header * @xattrs: header to initialize * - * Initialize relevant fields of a an xattr header. + * Initialize the rhashtable used to store xattr objects. + * + * Return: On success zero is returned. On failure a negative error code is + * returned. */ -void simple_xattrs_init(struct simple_xattrs *xattrs) +int simple_xattrs_init(struct simple_xattrs *xattrs) { - xattrs->use_rhashtable = false; - xattrs->rb_root = RB_ROOT; - rwlock_init(&xattrs->lock); + return rhashtable_init(&xattrs->ht, &simple_xattr_params); } /** @@ -1710,7 +1541,8 @@ void simple_xattrs_init(struct simple_xattrs *xattrs) * * Dynamically allocate a simple_xattrs header and initialize the * underlying rhashtable. This is intended for consumers that want - * rhashtable-based xattr storage. + * to lazily allocate xattr storage only when the first xattr is set, + * avoiding the per-inode rhashtable overhead when no xattrs are used. * * Return: On success a new simple_xattrs is returned. On failure an * ERR_PTR is returned. @@ -1718,14 +1550,15 @@ void simple_xattrs_init(struct simple_xattrs *xattrs) struct simple_xattrs *simple_xattrs_alloc(void) { struct simple_xattrs *xattrs __free(kfree) = NULL; + int ret; xattrs = kzalloc(sizeof(*xattrs), GFP_KERNEL); if (!xattrs) return ERR_PTR(-ENOMEM); - xattrs->use_rhashtable = true; - if (rhashtable_init(&xattrs->ht, &simple_xattr_params)) - return ERR_PTR(-ENOMEM); + ret = simple_xattrs_init(xattrs); + if (ret) + return ERR_PTR(ret); return no_free_ptr(xattrs); } @@ -1784,28 +1617,10 @@ static void simple_xattr_ht_free(void *ptr, void *arg) */ void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space) { + might_sleep(); + if (freed_space) *freed_space = 0; - - if (xattrs->use_rhashtable) { - rhashtable_free_and_destroy(&xattrs->ht, - simple_xattr_ht_free, freed_space); - } else { - struct rb_node *rbp; - - rbp = rb_first(&xattrs->rb_root); - while (rbp) { - struct simple_xattr *xattr; - struct rb_node *rbp_next; - - rbp_next = rb_next(rbp); - xattr = rb_entry(rbp, struct simple_xattr, rb_node); - rb_erase(&xattr->rb_node, &xattrs->rb_root); - if (freed_space) - *freed_space += simple_xattr_space(xattr->name, - xattr->size); - simple_xattr_free(xattr); - rbp = rbp_next; - } - } + rhashtable_free_and_destroy(&xattrs->ht, simple_xattr_ht_free, + freed_space); } diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 6e619e185e90..3b5a5fd684eb 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -107,18 +107,10 @@ static inline const char *xattr_prefix(const struct xattr_handler *handler) } struct simple_xattrs { - bool use_rhashtable; - union { - struct { - struct rb_root rb_root; - rwlock_t lock; - }; - struct rhashtable ht; - }; + struct rhashtable ht; }; struct simple_xattr { - struct rb_node rb_node; struct rhash_head hash_node; struct rcu_head rcu; char *name; @@ -126,7 +118,7 @@ struct simple_xattr { char value[] __counted_by(size); }; -void simple_xattrs_init(struct simple_xattrs *xattrs); +int simple_xattrs_init(struct simple_xattrs *xattrs); struct simple_xattrs *simple_xattrs_alloc(void); struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, const void *value, int flags); -- cgit v1.2.3 From 4fbe9e78bb415dd632ff63a9f620af0be58ef820 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 16 Feb 2026 14:32:05 +0100 Subject: xattr: move user limits for xattrs to generic infra Link: https://patch.msgid.link/20260216-work-xattr-socket-v1-9-c2efa4f74cb7@kernel.org Acked-by: Darrick J. Wong Signed-off-by: Christian Brauner --- fs/kernfs/inode.c | 75 ++------------------------------------------- fs/kernfs/kernfs-internal.h | 3 +- fs/xattr.c | 65 +++++++++++++++++++++++++++++++++++++++ include/linux/kernfs.h | 2 -- include/linux/xattr.h | 18 +++++++++++ 5 files changed, 87 insertions(+), 76 deletions(-) (limited to 'include/linux') diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index dfc3315b5afc..1de10500842d 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -45,8 +45,7 @@ static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, bool alloc) ret->ia_mtime = ret->ia_atime; ret->ia_ctime = ret->ia_atime; - atomic_set(&ret->nr_user_xattrs, 0); - atomic_set(&ret->user_xattr_size, 0); + simple_xattr_limits_init(&ret->xattr_limits); /* If someone raced us, recognize it. */ if (!try_cmpxchg(&kn->iattr, &attr, ret)) @@ -355,69 +354,6 @@ static int kernfs_vfs_xattr_set(const struct xattr_handler *handler, return kernfs_xattr_set(kn, name, value, size, flags); } -static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, - const char *full_name, - struct simple_xattrs *xattrs, - const void *value, size_t size, int flags) -{ - struct kernfs_iattrs *attr = kernfs_iattrs_noalloc(kn); - atomic_t *sz = &attr->user_xattr_size; - atomic_t *nr = &attr->nr_user_xattrs; - struct simple_xattr *old_xattr; - int ret; - - if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) { - ret = -ENOSPC; - goto dec_count_out; - } - - if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) { - ret = -ENOSPC; - goto dec_size_out; - } - - old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); - if (!old_xattr) - return 0; - - if (IS_ERR(old_xattr)) { - ret = PTR_ERR(old_xattr); - goto dec_size_out; - } - - ret = 0; - size = old_xattr->size; - simple_xattr_free_rcu(old_xattr); -dec_size_out: - atomic_sub(size, sz); -dec_count_out: - atomic_dec(nr); - return ret; -} - -static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, - const char *full_name, - struct simple_xattrs *xattrs, - const void *value, size_t size, int flags) -{ - struct kernfs_iattrs *attr = kernfs_iattrs_noalloc(kn); - atomic_t *sz = &attr->user_xattr_size; - atomic_t *nr = &attr->nr_user_xattrs; - struct simple_xattr *old_xattr; - - old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); - if (!old_xattr) - return 0; - - if (IS_ERR(old_xattr)) - return PTR_ERR(old_xattr); - - atomic_sub(old_xattr->size, sz); - atomic_dec(nr); - simple_xattr_free_rcu(old_xattr); - return 0; -} - static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, @@ -440,13 +376,8 @@ static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, if (IS_ERR_OR_NULL(xattrs)) return PTR_ERR(xattrs); - if (value) - return kernfs_vfs_user_xattr_add(kn, full_name, xattrs, - value, size, flags); - else - return kernfs_vfs_user_xattr_rm(kn, full_name, xattrs, - value, size, flags); - + return simple_xattr_set_limited(xattrs, &attrs->xattr_limits, + full_name, value, size, flags); } static const struct xattr_handler kernfs_trusted_xattr_handler = { diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 1324ed8c0661..1d3831e3a270 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -27,8 +27,7 @@ struct kernfs_iattrs { struct timespec64 ia_ctime; struct simple_xattrs *xattrs; - atomic_t nr_user_xattrs; - atomic_t user_xattr_size; + struct simple_xattr_limits xattr_limits; }; struct kernfs_root { diff --git a/fs/xattr.c b/fs/xattr.c index 328ed7558dfc..5e559b1c651f 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1427,6 +1427,71 @@ struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, return old_xattr; } +static inline void simple_xattr_limits_dec(struct simple_xattr_limits *limits, + size_t size) +{ + atomic_sub(size, &limits->xattr_size); + atomic_dec(&limits->nr_xattrs); +} + +static inline int simple_xattr_limits_inc(struct simple_xattr_limits *limits, + size_t size) +{ + if (atomic_inc_return(&limits->nr_xattrs) > SIMPLE_XATTR_MAX_NR) { + atomic_dec(&limits->nr_xattrs); + return -ENOSPC; + } + + if (atomic_add_return(size, &limits->xattr_size) <= SIMPLE_XATTR_MAX_SIZE) + return 0; + + simple_xattr_limits_dec(limits, size); + return -ENOSPC; +} + +/** + * simple_xattr_set_limited - set an xattr with per-inode user.* limits + * @xattrs: the header of the xattr object + * @limits: per-inode limit counters for user.* xattrs + * @name: the name of the xattr to set or remove + * @value: the value to store (NULL to remove) + * @size: the size of @value + * @flags: XATTR_CREATE, XATTR_REPLACE, or 0 + * + * Like simple_xattr_set(), but enforces per-inode count and total value size + * limits for user.* xattrs. Uses speculative pre-increment of the atomic + * counters to avoid races without requiring external locks. + * + * Return: On success zero is returned. On failure a negative error code is + * returned. + */ +int simple_xattr_set_limited(struct simple_xattrs *xattrs, + struct simple_xattr_limits *limits, + const char *name, const void *value, + size_t size, int flags) +{ + struct simple_xattr *old_xattr; + int ret; + + if (value) { + ret = simple_xattr_limits_inc(limits, size); + if (ret) + return ret; + } + + old_xattr = simple_xattr_set(xattrs, name, value, size, flags); + if (IS_ERR(old_xattr)) { + if (value) + simple_xattr_limits_dec(limits, size); + return PTR_ERR(old_xattr); + } + if (old_xattr) { + simple_xattr_limits_dec(limits, old_xattr->size); + simple_xattr_free_rcu(old_xattr); + } + return 0; +} + static bool xattr_is_trusted(const char *name) { return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index b5a5f32fdfd1..d8f57f0af5e4 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -99,8 +99,6 @@ enum kernfs_node_type { #define KERNFS_TYPE_MASK 0x000f #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK -#define KERNFS_MAX_USER_XATTRS 128 -#define KERNFS_USER_XATTR_SIZE_LIMIT (128 << 10) enum kernfs_node_flag { KERNFS_ACTIVATED = 0x0010, diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 3b5a5fd684eb..8b6601367eae 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -118,6 +118,20 @@ struct simple_xattr { char value[] __counted_by(size); }; +#define SIMPLE_XATTR_MAX_NR 128 +#define SIMPLE_XATTR_MAX_SIZE (128 << 10) + +struct simple_xattr_limits { + atomic_t nr_xattrs; /* current user.* xattr count */ + atomic_t xattr_size; /* current total user.* value bytes */ +}; + +static inline void simple_xattr_limits_init(struct simple_xattr_limits *limits) +{ + atomic_set(&limits->nr_xattrs, 0); + atomic_set(&limits->xattr_size, 0); +} + int simple_xattrs_init(struct simple_xattrs *xattrs); struct simple_xattrs *simple_xattrs_alloc(void); struct simple_xattrs *simple_xattrs_lazy_alloc(struct simple_xattrs **xattrsp, @@ -132,6 +146,10 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags); +int simple_xattr_set_limited(struct simple_xattrs *xattrs, + struct simple_xattr_limits *limits, + const char *name, const void *value, + size_t size, int flags); ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size); int simple_xattr_add(struct simple_xattrs *xattrs, -- cgit v1.2.3 From cc2f5e2aeb6c69556837e45756b3ddded98b3898 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 17:48:02 -0800 Subject: pinctrl: pinconf-generic: fix an enum name description Correct an enum name in a kernel-doc comment to avoid kernel-doc warnings: Warning: include/linux/pinctrl/pinconf-generic.h:161 Enum value 'PIN_CONFIG_SKEW_DELAY_OUTPUT_PS' not described in enum 'pin_config_param' Warning: include/linux/pinctrl/pinconf-generic.h:161 Excess enum value '@PIN_CONFIG_SKEW_DELAY_OUPUT_PS' description in 'pin_config_param' Signed-off-by: Randy Dunlap Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinconf-generic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 89277808ea61..531dc3e9b3f7 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -115,7 +115,7 @@ struct pinctrl_map; * @PIN_CONFIG_SKEW_DELAY_INPUT_PS: if the pin has independent values for the * programmable skew rate (on inputs) and latch delay (on outputs), then * this parameter specifies the clock skew only. The argument is in ps. - * @PIN_CONFIG_SKEW_DELAY_OUPUT_PS: if the pin has independent values for the + * @PIN_CONFIG_SKEW_DELAY_OUTPUT_PS: if the pin has independent values for the * programmable skew rate (on inputs) and latch delay (on outputs), then * this parameter specifies the latch delay only. The argument is in ps. * @PIN_CONFIG_SLEEP_HARDWARE_STATE: indicate this is sleep related state. -- cgit v1.2.3 From 94798081732abfb5748471d5c3cced6ff187fa36 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 13 Feb 2026 18:52:43 -0800 Subject: driver core: platform: add kerneldoc to struct platform_device_info Add kernel documentation for struct platform_device_info and its individual members. While at it remove an extra indent level from the structure definition. Signed-off-by: Dmitry Torokhov Reviewed-by: Bartosz Golaszewski Link: https://patch.msgid.link/20260214025246.2095239-2-dmitry.torokhov@gmail.com Signed-off-by: Danilo Krummrich --- include/linux/platform_device.h | 53 ++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 813da101b5bf..5f54217930e1 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -118,22 +118,53 @@ extern int platform_get_irq_byname_optional(struct platform_device *dev, const char *name); extern int platform_add_devices(struct platform_device **, int); +/** + * struct platform_device_info - set of parameters for creating a platform device + * @parent: parent device for the new platform device. + * @fwnode: firmware node associated with the device. + * @of_node_reused: indicates that device tree node associated with the device + * is shared with another device, typically its ancestor. Setting this to + * %true prevents the device from being matched via the OF match table, + * and stops the device core from automatically binding pinctrl + * configuration to avoid disrupting the other device. + * @name: name of the device. + * @id: instance ID of the device. Use %PLATFORM_DEVID_NONE if there is only + * one instance of the device, or %PLATFORM_DEVID_AUTO to let the + * kernel automatically assign a unique instance ID. + * @res: set of resources to attach to the device. + * @num_res: number of entries in @res. + * @data: device-specific data for this platform device. + * @size_data: size of device-specific data. + * @dma_mask: DMA mask for the device. + * @properties: a set of software properties for the device. If provided, + * a managed software node will be automatically created and + * assigned to the device. The properties array must be terminated + * with a sentinel entry. + * + * This structure is used to hold information needed to create and register + * a platform device using platform_device_register_full(). + * + * platform_device_register_full() makes deep copies of @name, @res, @data and + * @properties, so the caller does not need to keep them after registration. + * If the registration is performed during initialization, these can be marked + * as __initconst. + */ struct platform_device_info { - struct device *parent; - struct fwnode_handle *fwnode; - bool of_node_reused; + struct device *parent; + struct fwnode_handle *fwnode; + bool of_node_reused; - const char *name; - int id; + const char *name; + int id; - const struct resource *res; - unsigned int num_res; + const struct resource *res; + unsigned int num_res; - const void *data; - size_t size_data; - u64 dma_mask; + const void *data; + size_t size_data; + u64 dma_mask; - const struct property_entry *properties; + const struct property_entry *properties; }; extern struct platform_device *platform_device_register_full( const struct platform_device_info *pdevinfo); -- cgit v1.2.3 From 0fc434bc2c45fceb9356f2138911db0f454b8ca6 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Fri, 13 Feb 2026 18:52:44 -0800 Subject: driver core: platform: allow attaching software nodes when creating devices Extend platform_device_info structure with an optional pointer to a software node to be used as a secondary firmware node for the device being created. If software node has not been registered yet it will be automatically registered. This reduces boilerplate needed when switching legacy board code to static device properties/GPIO references. Signed-off-by: Dmitry Torokhov Reviewed-by: Bartosz Golaszewski Link: https://patch.msgid.link/20260214025246.2095239-3-dmitry.torokhov@gmail.com Signed-off-by: Danilo Krummrich --- drivers/base/platform.c | 9 ++++++++- include/linux/platform_device.h | 7 ++++++- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/platform.c b/drivers/base/platform.c index b45d41b018ca..ec467ccd05b3 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -850,6 +850,9 @@ struct platform_device *platform_device_register_full( int ret; struct platform_device *pdev; + if (pdevinfo->swnode && pdevinfo->properties) + return ERR_PTR(-EINVAL); + pdev = platform_device_alloc(pdevinfo->name, pdevinfo->id); if (!pdev) return ERR_PTR(-ENOMEM); @@ -875,7 +878,11 @@ struct platform_device *platform_device_register_full( if (ret) goto err; - if (pdevinfo->properties) { + if (pdevinfo->swnode) { + ret = device_add_software_node(&pdev->dev, pdevinfo->swnode); + if (ret) + goto err; + } else if (pdevinfo->properties) { ret = device_create_managed_software_node(&pdev->dev, pdevinfo->properties, NULL); if (ret) diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 5f54217930e1..754e4bf2771a 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -136,10 +136,14 @@ extern int platform_add_devices(struct platform_device **, int); * @data: device-specific data for this platform device. * @size_data: size of device-specific data. * @dma_mask: DMA mask for the device. + * @swnode: a secondary software node to be attached to the device. The node + * will be automatically registered and its lifetime tied to the platform + * device if it is not registered yet. * @properties: a set of software properties for the device. If provided, * a managed software node will be automatically created and * assigned to the device. The properties array must be terminated - * with a sentinel entry. + * with a sentinel entry. Specifying both @properties and @swnode is not + * allowed. * * This structure is used to hold information needed to create and register * a platform device using platform_device_register_full(). @@ -164,6 +168,7 @@ struct platform_device_info { size_t size_data; u64 dma_mask; + const struct software_node *swnode; const struct property_entry *properties; }; extern struct platform_device *platform_device_register_full( -- cgit v1.2.3 From ba51cf9fcf511df8c7026feda4b7d65999d3517c Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Thu, 26 Feb 2026 15:52:12 +0200 Subject: net/mlx5: Drop MR cache related code Following mlx5_ib move to using FRMR pools, drop all unused code of MR cache. Signed-off-by: Michael Guralnik Reviewed-by: Yishai Hadas Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20260226-frmr_pools-v4-7-95360b54f15e@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 67 +------------------------- include/linux/mlx5/driver.h | 11 ----- 2 files changed, 1 insertion(+), 77 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index fdc3ba20912e..4b59f3f7c6f0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -110,74 +110,9 @@ static struct mlx5_profile profile[] = { }, [2] = { - .mask = MLX5_PROF_MASK_QP_SIZE | - MLX5_PROF_MASK_MR_CACHE, + .mask = MLX5_PROF_MASK_QP_SIZE, .log_max_qp = LOG_MAX_SUPPORTED_QPS, .num_cmd_caches = MLX5_NUM_COMMAND_CACHES, - .mr_cache[0] = { - .size = 500, - .limit = 250 - }, - .mr_cache[1] = { - .size = 500, - .limit = 250 - }, - .mr_cache[2] = { - .size = 500, - .limit = 250 - }, - .mr_cache[3] = { - .size = 500, - .limit = 250 - }, - .mr_cache[4] = { - .size = 500, - .limit = 250 - }, - .mr_cache[5] = { - .size = 500, - .limit = 250 - }, - .mr_cache[6] = { - .size = 500, - .limit = 250 - }, - .mr_cache[7] = { - .size = 500, - .limit = 250 - }, - .mr_cache[8] = { - .size = 500, - .limit = 250 - }, - .mr_cache[9] = { - .size = 500, - .limit = 250 - }, - .mr_cache[10] = { - .size = 500, - .limit = 250 - }, - .mr_cache[11] = { - .size = 500, - .limit = 250 - }, - .mr_cache[12] = { - .size = 64, - .limit = 32 - }, - .mr_cache[13] = { - .size = 32, - .limit = 16 - }, - .mr_cache[14] = { - .size = 16, - .limit = 8 - }, - .mr_cache[15] = { - .size = 8, - .limit = 4 - }, }, [3] = { .mask = MLX5_PROF_MASK_QP_SIZE, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 04dcd09f7517..27d64f09683f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -705,23 +705,12 @@ struct mlx5_st; enum { MLX5_PROF_MASK_QP_SIZE = (u64)1 << 0, - MLX5_PROF_MASK_MR_CACHE = (u64)1 << 1, -}; - -enum { - MKEY_CACHE_LAST_STD_ENTRY = 20, - MLX5_IMR_KSM_CACHE_ENTRY, - MAX_MKEY_CACHE_ENTRIES }; struct mlx5_profile { u64 mask; u8 log_max_qp; u8 num_cmd_caches; - struct { - int size; - int limit; - } mr_cache[MAX_MKEY_CACHE_ENTRIES]; }; struct mlx5_hca_cap { -- cgit v1.2.3 From fe3c03b84ae69f34992a5e72cbb8384b9ebad738 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Mar 2026 16:51:54 -0800 Subject: cred: fix kernel-doc warnings in cred.h Use the correct function parameter names, function names, or kernel-doc format, and add function return comment sections to avoid kernel-doc warnings: Warning: include/linux/cred.h:43 function parameter 'gi' not described in 'get_group_info' Warning: include/linux/cred.h:43 No description found for return value of 'get_group_info' Warning: include/linux/cred.h:213 No description found for return value of 'get_cred_many' Warning: include/linux/cred.h:260 function parameter '_cred' not described in 'put_cred_many' Warning: include/linux/cred.h:260 expecting prototype for put_cred(). Prototype was for put_cred_many() instead Signed-off-by: Randy Dunlap [PM: subject tweak] Signed-off-by: Paul Moore --- include/linux/cred.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index ed1609d78cd7..c6676265a985 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -33,12 +33,14 @@ struct group_info { /** * get_group_info - Get a reference to a group info structure - * @group_info: The group info to reference + * @gi: The group info to reference * * This gets a reference to a set of supplementary groups. * * If the caller is accessing a task's credentials, they must hold the RCU read * lock when reading. + * + * Returns: @gi */ static inline struct group_info *get_group_info(struct group_info *gi) { @@ -209,6 +211,8 @@ DEFINE_CLASS(override_creds, * usage count. The purpose of this is to attempt to catch at compile time the * accidental alteration of a set of credentials that should be considered * immutable. + * + * Returns: @cred when the references are acquired, NULL otherwise. */ static inline const struct cred *get_cred_many(const struct cred *cred, int nr) { @@ -246,8 +250,8 @@ static inline const struct cred *get_cred_rcu(const struct cred *cred) } /** - * put_cred - Release a reference to a set of credentials - * @cred: The credentials to release + * put_cred_many - Release a reference to a set of credentials + * @_cred: The credentials to release * @nr: Number of references to release * * Release a reference to a set of credentials, deleting them when the last ref -- cgit v1.2.3 From 0b16e69d17d8c35c5c9d5918bf596c75a44655d3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 24 Feb 2026 17:20:36 -0800 Subject: KVM: x86: Use scratch field in MMIO fragment to hold small write values When exiting to userspace to service an emulated MMIO write, copy the to-be-written value to a scratch field in the MMIO fragment if the size of the data payload is 8 bytes or less, i.e. can fit in a single chunk, instead of pointing the fragment directly at the source value. This fixes a class of use-after-free bugs that occur when the emulator initiates a write using an on-stack, local variable as the source, the write splits a page boundary, *and* both pages are MMIO pages. Because KVM's ABI only allows for physically contiguous MMIO requests, accesses that split MMIO pages are separated into two fragments, and are sent to userspace one at a time. When KVM attempts to complete userspace MMIO in response to KVM_RUN after the first fragment, KVM will detect the second fragment and generate a second userspace exit, and reference the on-stack variable. The issue is most visible if the second KVM_RUN is performed by a separate task, in which case the stack of the initiating task can show up as truly freed data. ================================================================== BUG: KASAN: use-after-free in complete_emulated_mmio+0x305/0x420 Read of size 1 at addr ffff888009c378d1 by task syz-executor417/984 CPU: 1 PID: 984 Comm: syz-executor417 Not tainted 5.10.0-182.0.0.95.h2627.eulerosv2r13.x86_64 #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.15.0-0-g2dd4b9b3f840-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0xbe/0xfd print_address_description.constprop.0+0x19/0x170 __kasan_report.cold+0x6c/0x84 kasan_report+0x3a/0x50 check_memory_region+0xfd/0x1f0 memcpy+0x20/0x60 complete_emulated_mmio+0x305/0x420 kvm_arch_vcpu_ioctl_run+0x63f/0x6d0 kvm_vcpu_ioctl+0x413/0xb20 __se_sys_ioctl+0x111/0x160 do_syscall_64+0x30/0x40 entry_SYSCALL_64_after_hwframe+0x67/0xd1 RIP: 0033:0x42477d Code: <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007faa8e6890e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00000000004d7338 RCX: 000000000042477d RDX: 0000000000000000 RSI: 000000000000ae80 RDI: 0000000000000005 RBP: 00000000004d7330 R08: 00007fff28d546df R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000004d733c R13: 0000000000000000 R14: 000000000040a200 R15: 00007fff28d54720 The buggy address belongs to the page: page:0000000029f6a428 refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x9c37 flags: 0xfffffc0000000(node=0|zone=1|lastcpupid=0x1fffff) raw: 000fffffc0000000 0000000000000000 ffffea0000270dc8 0000000000000000 raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888009c37780: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff888009c37800: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff888009c37880: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff888009c37900: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff888009c37980: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ================================================================== The bug can also be reproduced with a targeted KVM-Unit-Test by hacking KVM to fill a large on-stack variable in complete_emulated_mmio(), i.e. by overwrite the data value with garbage. Limit the use of the scratch fields to 8-byte or smaller accesses, and to just writes, as larger accesses and reads are not affected thanks to implementation details in the emulator, but add a sanity check to ensure those details don't change in the future. Specifically, KVM never uses on-stack variables for accesses larger that 8 bytes, e.g. uses an operand in the emulator context, and *all* reads are buffered through the mem_read cache. Note! Using the scratch field for reads is not only unnecessary, it's also extremely difficult to handle correctly. As above, KVM buffers all reads through the mem_read cache, and heavily relies on that behavior when re-emulating the instruction after a userspace MMIO read exit. If a read splits a page, the first page is NOT an MMIO page, and the second page IS an MMIO page, then the MMIO fragment needs to point at _just_ the second chunk of the destination, i.e. its position in the mem_read cache. Taking the "obvious" approach of copying the fragment value into the destination when re-emulating the instruction would clobber the first chunk of the destination, i.e. would clobber the data that was read from guest memory. Fixes: f78146b0f923 ("KVM: Fix page-crossing MMIO") Suggested-by: Yashu Zhang Reported-by: Yashu Zhang Closes: https://lore.kernel.org/all/369eaaa2b3c1425c85e8477066391bc7@huawei.com Cc: stable@vger.kernel.org Tested-by: Tom Lendacky Tested-by: Rick Edgecombe Link: https://patch.msgid.link/20260225012049.920665-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 14 +++++++++++++- include/linux/kvm_host.h | 3 ++- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a03530795707..9bb9d7f078fc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8225,7 +8225,13 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS); frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; frag->gpa = gpa; - frag->data = val; + if (write && bytes <= 8u) { + frag->val = 0; + frag->data = &frag->val; + memcpy(&frag->val, val, bytes); + } else { + frag->data = val; + } frag->len = bytes; return X86EMUL_CONTINUE; } @@ -8240,6 +8246,9 @@ static int emulator_read_write(struct x86_emulate_ctxt *ctxt, gpa_t gpa; int rc; + if (WARN_ON_ONCE((bytes > 8u || !ops->write) && object_is_on_stack(val))) + return X86EMUL_UNHANDLEABLE; + if (ops->read_write_prepare && ops->read_write_prepare(vcpu, val, bytes)) return X86EMUL_CONTINUE; @@ -11846,6 +11855,9 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) frag++; vcpu->mmio_cur_fragment++; } else { + if (WARN_ON_ONCE(frag->data == &frag->val)) + return -EIO; + /* Go forward to the next mmio piece. */ frag->data += len; frag->gpa += len; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 34759a262b28..abb309372035 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -318,7 +318,8 @@ static inline bool kvm_vcpu_can_poll(ktime_t cur, ktime_t stop) struct kvm_mmio_fragment { gpa_t gpa; void *data; - unsigned len; + u64 val; + unsigned int len; }; struct kvm_vcpu { -- cgit v1.2.3 From 44a2ec96d374806ee74454ea915615536a76b152 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 27 Feb 2026 09:53:13 +0000 Subject: net: stmmac: remove plat_dat->port_node There are repeated instances of: fwnode = priv->plat->port_node; if (!fwnode) fwnode = dev_fwnode(priv->device); However, the only place that ->port_node is set is stmmac_probe_config_dt(): struct device_node *np = pdev->dev.of_node; ... /* PHYLINK automatically parses the phy-handle property */ plat->port_node = of_fwnode_handle(np); which is equivalent to dev_fwnode(&pdev->dev) and, as priv->device will be &pdev->dev, is also equivalent to dev_fwnode(priv->device). Thus, plat_dat->port_node doesn't provide any extra benefit over using dev_fwnode(priv->device) directly. There is one case where port_node is used directly, which can be found in stmmac_pcs_setup(). This may cause a change of behaviour as PCI drivers do not populate plat_dat->port_node, but dev_fwnode(priv->device) may be valid. PCI-based stmmac should be tested. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vvuX3-0000000Avme-3oej@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 13 +++---------- drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 7 ++----- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 3 --- include/linux/stmmac.h | 1 - 4 files changed, 5 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 5c144ac259af..4e788f54bbbc 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1252,10 +1252,7 @@ static int stmmac_init_phy(struct net_device *dev) xpcs_get_an_mode(priv->hw->xpcs, mode) == DW_AN_C73) return 0; - fwnode = priv->plat->port_node; - if (!fwnode) - fwnode = dev_fwnode(priv->device); - + fwnode = dev_fwnode(priv->device); if (fwnode) phy_fwnode = fwnode_get_phy_node(fwnode); else @@ -1313,7 +1310,6 @@ static int stmmac_phylink_setup(struct stmmac_priv *priv) { struct stmmac_mdio_bus_data *mdio_bus_data; struct phylink_config *config; - struct fwnode_handle *fwnode; struct phylink_pcs *pcs; struct phylink *phylink; @@ -1400,11 +1396,8 @@ static int stmmac_phylink_setup(struct stmmac_priv *priv) config->wol_mac_support |= WAKE_MAGIC; } - fwnode = priv->plat->port_node; - if (!fwnode) - fwnode = dev_fwnode(priv->device); - - phylink = phylink_create(config, fwnode, priv->plat->phy_interface, + phylink = phylink_create(config, dev_fwnode(priv->device), + priv->plat->phy_interface, &stmmac_phylink_mac_ops); if (IS_ERR(phylink)) return PTR_ERR(phylink); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c index a7c2496b39f2..485a0d790baa 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c @@ -430,7 +430,7 @@ int stmmac_pcs_setup(struct net_device *ndev) struct dw_xpcs *xpcs = NULL; int addr, ret; - devnode = priv->plat->port_node; + devnode = dev_fwnode(priv->device); if (priv->plat->pcs_init) { ret = priv->plat->pcs_init(priv); @@ -649,10 +649,7 @@ int stmmac_mdio_register(struct net_device *ndev) stmmac_xgmac2_mdio_read_c45(new_bus, 0, 0, 0); /* If fixed-link is set, skip PHY scanning */ - fwnode = priv->plat->port_node; - if (!fwnode) - fwnode = dev_fwnode(priv->device); - + fwnode = dev_fwnode(priv->device); if (fwnode) { fixed_node = fwnode_get_named_child_node(fwnode, "fixed-link"); if (fixed_node) { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 5c9fd91a1db9..c34998486293 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -446,9 +446,6 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) * they are not converted to phylink. */ plat->phy_node = of_parse_phandle(np, "phy-handle", 0); - /* PHYLINK automatically parses the phy-handle property */ - plat->port_node = of_fwnode_handle(np); - /* Get max speed of operation from device tree */ of_property_read_u32(np, "max-speed", &plat->max_speed); diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index b96ae9dadfab..77e51eaa5ec5 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -225,7 +225,6 @@ struct plat_stmmacenet_data { phy_interface_t phy_interface; struct stmmac_mdio_bus_data *mdio_bus_data; struct device_node *phy_node; - struct fwnode_handle *port_node; struct device_node *mdio_node; struct stmmac_dma_cfg *dma_cfg; struct stmmac_safety_feature_cfg *safety_feat_cfg; -- cgit v1.2.3 From 1558705afbb293549fdedd539682bc5240e1964b Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 27 Feb 2026 09:53:59 +0000 Subject: net: stmmac: make dma_cfg mixed/fixed burst boolean struct stmmac_dma_cfg mixed_burst/fixed_burst members are both boolean in nature - of_property_read_bool() are used to read these from DT, and they are only tested for non-zero values. Use bool to avoid unnecessary padding in this structure. Update dwmac-intel to initialise these using true rather than '1', and remove the '0' initialisers as the struct is already zero initialised on allocation. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vvuXn-0000000AvnX-4A1u@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 4 +--- include/linux/stmmac.h | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 92d77b0c2f54..ece2a0c38562 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -636,8 +636,6 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, plat->dma_cfg->pbl = 32; plat->dma_cfg->pblx8 = true; - plat->dma_cfg->fixed_burst = 0; - plat->dma_cfg->mixed_burst = 0; plat->dma_cfg->aal = 0; plat->dma_cfg->dche = true; @@ -1106,7 +1104,7 @@ static int quark_default_data(struct pci_dev *pdev, plat->dma_cfg->pbl = 16; plat->dma_cfg->pblx8 = true; - plat->dma_cfg->fixed_burst = 1; + plat->dma_cfg->fixed_burst = true; /* AXI (TODO) */ return 0; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 77e51eaa5ec5..2fc169c7117e 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -97,8 +97,8 @@ struct stmmac_dma_cfg { int txpbl; int rxpbl; bool pblx8; - int fixed_burst; - int mixed_burst; + bool fixed_burst; + bool mixed_burst; bool aal; bool eame; bool multi_msi_en; -- cgit v1.2.3 From 4480d5fa1f6ebe7dfc546e14371d63c8b915a82d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:31 +0000 Subject: ipmr/ip6mr: Convert net->ipv[46].ipmr_seq to atomic_t. We will no longer hold RTNL for ipmr_mfc_add() and ipmr_mfc_delete(). MFC entry can be loosely connected with VIF by its index for mrt->vif_table[] (stored in mfc_parent), but the two tables are not synchronised. i.e. Even if VIF 1 is removed, MFC for VIF 1 is not automatically removed. The only field that the MFC/VIF interfaces share is net->ipv[46].ipmr_seq, which is protected by RTNL. Adding a new mutex for both just to protect a single field is overkill. Let's convert the field to atomic_t. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-14-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 8 ++++---- include/net/netns/ipv4.h | 2 +- include/net/netns/ipv6.h | 2 +- net/ipv4/ipmr.c | 4 ++-- net/ipv6/ip6mr.c | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 0075f6e5c3da..0baa6f994da9 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -76,7 +76,7 @@ static inline int mr_call_vif_notifiers(struct net *net, struct vif_device *vif, struct net_device *vif_dev, unsigned short vif_index, u32 tb_id, - unsigned int *ipmr_seq) + atomic_t *ipmr_seq) { struct vif_entry_notifier_info info = { .info = { @@ -89,7 +89,7 @@ static inline int mr_call_vif_notifiers(struct net *net, }; ASSERT_RTNL(); - (*ipmr_seq)++; + atomic_inc(ipmr_seq); return call_fib_notifiers(net, event_type, &info.info); } @@ -198,7 +198,7 @@ static inline int mr_call_mfc_notifiers(struct net *net, unsigned short family, enum fib_event_type event_type, struct mr_mfc *mfc, u32 tb_id, - unsigned int *ipmr_seq) + atomic_t *ipmr_seq) { struct mfc_entry_notifier_info info = { .info = { @@ -209,7 +209,7 @@ static inline int mr_call_mfc_notifiers(struct net *net, }; ASSERT_RTNL(); - (*ipmr_seq)++; + atomic_inc(ipmr_seq); return call_fib_notifiers(net, event_type, &info.info); } diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 380ff34c0233..94dca64fec41 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -280,7 +280,7 @@ struct netns_ipv4 { struct fib_rules_ops *mr_rules_ops; #endif struct fib_notifier_ops *ipmr_notifier_ops; - unsigned int ipmr_seq; /* protected by rtnl_mutex */ + atomic_t ipmr_seq; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH struct sysctl_fib_multipath_hash_seed sysctl_fib_multipath_hash_seed; diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 34bdb1308e8f..499e4288170f 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -118,7 +118,7 @@ struct netns_ipv6 { struct seg6_pernet_data *seg6_data; struct fib_notifier_ops *notifier_ops; struct fib_notifier_ops *ip6mr_notifier_ops; - unsigned int ipmr_seq; /* protected by rtnl_mutex */ + atomic_t ipmr_seq; struct { struct hlist_head head; spinlock_t lock; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 07f2d4f8dcbe..6ec73796d84d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -3226,7 +3226,7 @@ static const struct net_protocol pim_protocol = { static unsigned int ipmr_seq_read(const struct net *net) { - return READ_ONCE(net->ipv4.ipmr_seq) + ipmr_rules_seq_read(net); + return atomic_read(&net->ipv4.ipmr_seq) + ipmr_rules_seq_read(net); } static int ipmr_dump(struct net *net, struct notifier_block *nb, @@ -3247,7 +3247,7 @@ static int __net_init ipmr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; - net->ipv4.ipmr_seq = 0; + atomic_set(&net->ipv4.ipmr_seq, 0); ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net); if (IS_ERR(ops)) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index e047a4680ab0..85010ff21c98 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1280,7 +1280,7 @@ static int ip6mr_device_event(struct notifier_block *this, static unsigned int ip6mr_seq_read(const struct net *net) { - return READ_ONCE(net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); + return atomic_read(&net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); } static int ip6mr_dump(struct net *net, struct notifier_block *nb, @@ -1305,7 +1305,7 @@ static int __net_init ip6mr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; - net->ipv6.ipmr_seq = 0; + atomic_set(&net->ipv6.ipmr_seq, 0); ops = fib_notifier_ops_register(&ip6mr_notifier_ops_template, net); if (IS_ERR(ops)) -- cgit v1.2.3 From bddafc06ca5ee1be4d10061f7954c6d6be5dc1d8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 28 Feb 2026 22:17:33 +0000 Subject: ipmr: Don't hold RTNL for ipmr_rtm_route(). ipmr_mfc_add() and ipmr_mfc_delete() are already protected by a dedicated mutex. rtm_to_ipmr_mfcc() calls __ipmr_get_table(), __dev_get_by_index(), amd ipmr_find_vif(). Once __dev_get_by_index() is converted to dev_get_by_index_rcu(), we can move the other two functions under that same RCU section and drop RTNL for ipmr_rtm_route(). Let's do that conversion and drop ASSERT_RTNL() in mr_call_mfc_notifiers(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260228221800.1082070-16-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 1 - net/ipv4/ipmr.c | 34 +++++++++++++++++++++------------- 2 files changed, 21 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 0baa6f994da9..cf3374580f74 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -208,7 +208,6 @@ static inline int mr_call_mfc_notifiers(struct net *net, .tb_id = tb_id }; - ASSERT_RTNL(); atomic_inc(ipmr_seq); return call_fib_notifiers(net, event_type, &info.info); } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index d4983d8a9b2a..8a08d09b4c30 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1211,7 +1211,6 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent) struct net *net = read_pnet(&mrt->net); struct mfc_cache *c; - /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, mfc->mfcc_mcastgrp.s_addr, parent); @@ -1238,7 +1237,6 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, if (mfc->mfcc_parent >= MAXVIFS) return -ENFILE; - /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, mfc->mfcc_mcastgrp.s_addr, parent); @@ -2853,10 +2851,10 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, { struct net_device *dev = NULL; u32 tblid = RT_TABLE_DEFAULT; + int ret, rem, iif = 0; struct mr_table *mrt; struct nlattr *attr; struct rtmsg *rtm; - int ret, rem; ret = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy, extack); @@ -2883,11 +2881,7 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, mfcc->mfcc_mcastgrp.s_addr = nla_get_be32(attr); break; case RTA_IIF: - dev = __dev_get_by_index(net, nla_get_u32(attr)); - if (!dev) { - ret = -ENODEV; - goto out; - } + iif = nla_get_u32(attr); break; case RTA_MULTIPATH: if (ipmr_nla_get_ttls(attr, mfcc) < 0) { @@ -2903,16 +2897,30 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, break; } } + + rcu_read_lock(); + mrt = __ipmr_get_table(net, tblid); if (!mrt) { ret = -ENOENT; - goto out; + goto unlock; } + + if (iif) { + dev = dev_get_by_index_rcu(net, iif); + if (!dev) { + ret = -ENODEV; + goto unlock; + } + + mfcc->mfcc_parent = ipmr_find_vif(mrt, dev); + } + *mrtret = mrt; *mrtsock = rtm->rtm_protocol == RTPROT_MROUTED ? 1 : 0; - if (dev) - mfcc->mfcc_parent = ipmr_find_vif(mrt, dev); +unlock: + rcu_read_unlock(); out: return ret; } @@ -3343,9 +3351,9 @@ static const struct rtnl_msg_handler ipmr_rtnl_msg_handlers[] __initconst = { {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETLINK, .dumpit = ipmr_rtm_dumplink, .flags = RTNL_FLAG_DUMP_UNLOCKED}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_NEWROUTE, - .doit = ipmr_rtm_route}, + .doit = ipmr_rtm_route, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_DELROUTE, - .doit = ipmr_rtm_route}, + .doit = ipmr_rtm_route, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETROUTE, .doit = ipmr_rtm_getroute, .dumpit = ipmr_rtm_dumproute, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, -- cgit v1.2.3 From c69855ada28656fdd7e197b6e24cd40a04fe14d3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 14:08:45 -0800 Subject: atm: atmdev: add function parameter names and description kernel-doc reports function parameters not described for parameters that are not named. Add parameter names for these functions and then describe the function parameters in kernel-doc format. Fixes these warnings: Warning: include/linux/atmdev.h:316 function parameter '' not described in 'register_atm_ioctl' Warning: include/linux/atmdev.h:321 function parameter '' not described in 'deregister_atm_ioctl' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260228220845.2978547-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/linux/atmdev.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h index 70807c679f1a..82a32526df64 100644 --- a/include/linux/atmdev.h +++ b/include/linux/atmdev.h @@ -309,17 +309,19 @@ struct atm_ioctl { /** * register_atm_ioctl - register handler for ioctl operations + * @ioctl: ioctl handler to register * * Special (non-device) handlers of ioctl's should * register here. If you're a normal device, you should * set .ioctl in your atmdev_ops instead. */ -void register_atm_ioctl(struct atm_ioctl *); +void register_atm_ioctl(struct atm_ioctl *ioctl); /** * deregister_atm_ioctl - remove the ioctl handler + * @ioctl: ioctl handler to deregister */ -void deregister_atm_ioctl(struct atm_ioctl *); +void deregister_atm_ioctl(struct atm_ioctl *ioctl); /* register_atmdevice_notifier - register atm_dev notify events -- cgit v1.2.3 From 83419c8fdbbc1dacd12fa614c5a3561e498aac5f Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Sat, 28 Feb 2026 13:47:55 -0500 Subject: bpf: Factor out program return value calculation Factor the return value range calculation logic in check_return_code out of the function in preparation for separating the return value validation logic for BPF_EXIT and bpf_throw(). No functional changes. The change made in return_retval_code's handling of PROG_TRACING program types (not error'ing on the default case) is a no-op because the match on the program attach type is exhaustive. Acked-by: Eduard Zingerman Signed-off-by: Emil Tsalapatis Link: https://lore.kernel.org/r/20260228184759.108145-2-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 226 ++++++++++++++++++++++++------------------- 2 files changed, 126 insertions(+), 101 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c1e30096ea7b..090aa26d1c98 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -265,6 +265,7 @@ struct bpf_reference_state { struct bpf_retval_range { s32 minval; s32 maxval; + bool return_32bit; }; /* state of the program: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 34f89ed29c47..89e6d2f6bdf6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2953,7 +2953,11 @@ static void init_reg_state(struct bpf_verifier_env *env, static struct bpf_retval_range retval_range(s32 minval, s32 maxval) { - return (struct bpf_retval_range){ minval, maxval }; + /* + * return_32bit is set to false by default and set explicitly + * by the caller when necessary. + */ + return (struct bpf_retval_range){ minval, maxval, false }; } #define BPF_MAIN_FUNC (-1) @@ -11175,10 +11179,9 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) return is_rbtree_lock_required_kfunc(kfunc_btf_id); } -static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg, - bool return_32bit) +static bool retval_range_within(struct bpf_retval_range range, const struct bpf_reg_state *reg) { - if (return_32bit) + if (range.return_32bit) return range.minval <= reg->s32_min_value && reg->s32_max_value <= range.maxval; else return range.minval <= reg->smin_value && reg->smax_value <= range.maxval; @@ -11222,7 +11225,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return err; /* enforce R0 return value range, and bpf_callback_t returns 64bit */ - if (!retval_range_within(callee->callback_ret_range, r0, false)) { + if (!retval_range_within(callee->callback_ret_range, r0)) { verbose_invalid_scalar(env, r0, callee->callback_ret_range, "At callback return", "R0"); return -EINVAL; @@ -17841,6 +17844,115 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } + +static bool return_retval_range(struct bpf_verifier_env *env, struct bpf_retval_range *range) +{ + enum bpf_prog_type prog_type = resolve_prog_type(env->prog); + + /* Default return value range. */ + *range = retval_range(0, 1); + + switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + switch (env->prog->expected_attach_type) { + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_UNIX_RECVMSG: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_UNIX_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: + case BPF_CGROUP_UNIX_GETSOCKNAME: + *range = retval_range(1, 1); + break; + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + *range = retval_range(0, 3); + break; + default: + break; + } + break; + case BPF_PROG_TYPE_CGROUP_SKB: + if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) + *range = retval_range(0, 3); + break; + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + break; + case BPF_PROG_TYPE_RAW_TRACEPOINT: + if (!env->prog->aux->attach_btf_id) + return false; + *range = retval_range(0, 0); + break; + case BPF_PROG_TYPE_TRACING: + switch (env->prog->expected_attach_type) { + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_TRACE_FSESSION: + *range = retval_range(0, 0); + break; + case BPF_TRACE_RAW_TP: + case BPF_MODIFY_RETURN: + return false; + case BPF_TRACE_ITER: + default: + break; + } + break; + case BPF_PROG_TYPE_KPROBE: + switch (env->prog->expected_attach_type) { + case BPF_TRACE_KPROBE_SESSION: + case BPF_TRACE_UPROBE_SESSION: + break; + default: + return false; + } + break; + case BPF_PROG_TYPE_SK_LOOKUP: + *range = retval_range(SK_DROP, SK_PASS); + break; + + case BPF_PROG_TYPE_LSM: + if (env->prog->expected_attach_type != BPF_LSM_CGROUP) { + /* no range found, any return value is allowed */ + if (!get_func_retval_range(env->prog, range)) + return false; + /* no restricted range, any return value is allowed */ + if (range->minval == S32_MIN && range->maxval == S32_MAX) + return false; + range->return_32bit = true; + } else if (!env->prog->aux->attach_func_proto->type) { + /* Make sure programs that attach to void + * hooks don't try to modify return value. + */ + *range = retval_range(1, 1); + } + break; + + case BPF_PROG_TYPE_NETFILTER: + *range = retval_range(NF_DROP, NF_ACCEPT); + break; + case BPF_PROG_TYPE_STRUCT_OPS: + *range = retval_range(0, 0); + break; + case BPF_PROG_TYPE_EXT: + /* freplace program can return anything as its return value + * depends on the to-be-replaced kernel func or bpf program. + */ + default: + return false; + } + + /* Continue calculating. */ + + return true; +} + static int check_return_code(struct bpf_verifier_env *env, int regno, const char *reg_name) { const char *exit_ctx = "At program exit"; @@ -17849,18 +17961,17 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char struct bpf_reg_state *reg = reg_state(env, regno); struct bpf_retval_range range = retval_range(0, 1); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); - int err; struct bpf_func_state *frame = env->cur_state->frame[0]; const bool is_subprog = frame->subprogno; - bool return_32bit = false; const struct btf_type *reg_type, *ret_type = NULL; + int err; /* LSM and struct_ops func-ptr's return type could be "void" */ if (!is_subprog || frame->in_exception_callback_fn) { switch (prog_type) { case BPF_PROG_TYPE_LSM: if (prog->expected_attach_type == BPF_LSM_CGROUP) - /* See below, can be 0 or 0-1 depending on hook. */ + /* See return_retval_range, can be 0 or 0-1 depending on hook. */ break; if (!prog->aux->attach_func_proto->type) return 0; @@ -17918,101 +18029,14 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char return 0; } - switch (prog_type) { - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UNIX_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || - env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || - env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETPEERNAME || - env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || - env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME || - env->prog->expected_attach_type == BPF_CGROUP_UNIX_GETSOCKNAME) - range = retval_range(1, 1); - if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND || - env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND) - range = retval_range(0, 3); - break; - case BPF_PROG_TYPE_CGROUP_SKB: - if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { - range = retval_range(0, 3); - enforce_attach_type_range = tnum_range(2, 3); - } - break; - case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_SOCK_OPS: - case BPF_PROG_TYPE_CGROUP_DEVICE: - case BPF_PROG_TYPE_CGROUP_SYSCTL: - case BPF_PROG_TYPE_CGROUP_SOCKOPT: - break; - case BPF_PROG_TYPE_RAW_TRACEPOINT: - if (!env->prog->aux->attach_btf_id) - return 0; - range = retval_range(0, 0); - break; - case BPF_PROG_TYPE_TRACING: - switch (env->prog->expected_attach_type) { - case BPF_TRACE_FENTRY: - case BPF_TRACE_FEXIT: - case BPF_TRACE_FSESSION: - range = retval_range(0, 0); - break; - case BPF_TRACE_RAW_TP: - case BPF_MODIFY_RETURN: - return 0; - case BPF_TRACE_ITER: - break; - default: - return -ENOTSUPP; - } - break; - case BPF_PROG_TYPE_KPROBE: - switch (env->prog->expected_attach_type) { - case BPF_TRACE_KPROBE_SESSION: - case BPF_TRACE_UPROBE_SESSION: - range = retval_range(0, 1); - break; - default: - return 0; - } - break; - case BPF_PROG_TYPE_SK_LOOKUP: - range = retval_range(SK_DROP, SK_PASS); - break; + if (prog_type == BPF_PROG_TYPE_STRUCT_OPS && !ret_type) + return 0; - case BPF_PROG_TYPE_LSM: - if (env->prog->expected_attach_type != BPF_LSM_CGROUP) { - /* no range found, any return value is allowed */ - if (!get_func_retval_range(env->prog, &range)) - return 0; - /* no restricted range, any return value is allowed */ - if (range.minval == S32_MIN && range.maxval == S32_MAX) - return 0; - return_32bit = true; - } else if (!env->prog->aux->attach_func_proto->type) { - /* Make sure programs that attach to void - * hooks don't try to modify return value. - */ - range = retval_range(1, 1); - } - break; + if (prog_type == BPF_PROG_TYPE_CGROUP_SKB && (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS)) + enforce_attach_type_range = tnum_range(2, 3); - case BPF_PROG_TYPE_NETFILTER: - range = retval_range(NF_DROP, NF_ACCEPT); - break; - case BPF_PROG_TYPE_STRUCT_OPS: - if (!ret_type) - return 0; - range = retval_range(0, 0); - break; - case BPF_PROG_TYPE_EXT: - /* freplace program can return anything as its return value - * depends on the to-be-replaced kernel func or bpf program. - */ - default: + if (!return_retval_range(env, &range)) return 0; - } enforce_retval: if (reg->type != SCALAR_VALUE) { @@ -18025,7 +18049,7 @@ enforce_retval: if (err) return err; - if (!retval_range_within(range, reg, return_32bit)) { + if (!retval_range_within(range, reg)) { verbose_invalid_scalar(env, reg, range, exit_ctx, reg_name); if (!is_subprog && prog->expected_attach_type == BPF_LSM_CGROUP && -- cgit v1.2.3 From 2864fb6aa947703d290b52b1b030b0b74d0a6128 Mon Sep 17 00:00:00 2001 From: André Draszik Date: Mon, 2 Mar 2026 13:32:08 +0000 Subject: power: supply: max17042: initial support for Maxim MAX77759 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Maxim MAX77759 is a companion PMIC intended for use in mobile phones and tablets. It is used on Google Pixel 6 and 6 Pro (oriole and raven). Amongst others, it contains a fuel gauge that is similar to the ones supported by this driver. The fuel gauge can measure battery charge and discharge current, battery voltage, battery temperature, and the Type C connector's temperature. The MAX77759 incorporates the Maxim ModelGauge m5 algorithm. It, as well as previous generations like m3 on max17047/max17050, requires the host to save/restore some register values across power cycles to maintain full accuracy. Extending the driver for such support is out of scope in this initial commit. Reviewed-by: Peter Griffin Signed-off-by: André Draszik Link: https://patch.msgid.link/20260302-max77759-fg-v3-9-3c5f01dbda23@linaro.org Signed-off-by: Sebastian Reichel --- drivers/power/supply/max17042_battery.c | 59 ++++++++++++++++++++++++++++++--- include/linux/power/max17042_battery.h | 24 ++++++++++++-- 2 files changed, 77 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/supply/max17042_battery.c b/drivers/power/supply/max17042_battery.c index e21d2bd7e231..b9a21cef2cc6 100644 --- a/drivers/power/supply/max17042_battery.c +++ b/drivers/power/supply/max17042_battery.c @@ -650,7 +650,8 @@ static void max17042_write_config_regs(struct max17042_chip *chip) regmap_write(map, MAX17042_RelaxCFG, config->relax_cfg); if (chip->chip_type == MAXIM_DEVICE_TYPE_MAX17047 || chip->chip_type == MAXIM_DEVICE_TYPE_MAX17050 || - chip->chip_type == MAXIM_DEVICE_TYPE_MAX17055) + chip->chip_type == MAXIM_DEVICE_TYPE_MAX17055 || + chip->chip_type == MAXIM_DEVICE_TYPE_MAX77759) regmap_write(map, MAX17047_FullSOCThr, config->full_soc_thresh); } @@ -787,7 +788,8 @@ static inline void max17042_override_por_values(struct max17042_chip *chip) if ((chip->chip_type == MAXIM_DEVICE_TYPE_MAX17042) || (chip->chip_type == MAXIM_DEVICE_TYPE_MAX17047) || - (chip->chip_type == MAXIM_DEVICE_TYPE_MAX17050)) { + (chip->chip_type == MAXIM_DEVICE_TYPE_MAX17050) || + (chip->chip_type == MAXIM_DEVICE_TYPE_MAX77759)) { max17042_override_por(map, MAX17042_IAvg_empty, config->iavg_empty); max17042_override_por(map, MAX17042_TempNom, config->temp_nom); max17042_override_por(map, MAX17042_TempLim, config->temp_lim); @@ -796,7 +798,8 @@ static inline void max17042_override_por_values(struct max17042_chip *chip) if ((chip->chip_type == MAXIM_DEVICE_TYPE_MAX17047) || (chip->chip_type == MAXIM_DEVICE_TYPE_MAX17050) || - (chip->chip_type == MAXIM_DEVICE_TYPE_MAX17055)) { + (chip->chip_type == MAXIM_DEVICE_TYPE_MAX17055) || + (chip->chip_type == MAXIM_DEVICE_TYPE_MAX77759)) { max17042_override_por(map, MAX17047_V_empty, config->vempty); } } @@ -1019,6 +1022,45 @@ static const struct regmap_config max17042_regmap_config = { .val_format_endian = REGMAP_ENDIAN_NATIVE, }; +static const struct regmap_range max77759_fg_registers[] = { + regmap_reg_range(MAX17042_STATUS, MAX77759_MixAtFull), + regmap_reg_range(MAX17042_VFSOC0Enable, MAX17042_VFSOC0Enable), + regmap_reg_range(MAX17042_MLOCKReg1, MAX17042_MLOCKReg2), + regmap_reg_range(MAX17042_MODELChrTbl, MAX17055_TimerH), + regmap_reg_range(MAX77759_IIn, MAX77759_IIn), + regmap_reg_range(MAX17055_AtQResidual, MAX17055_AtAvCap), + regmap_reg_range(MAX17042_OCVInternal, MAX17042_OCVInternal), + regmap_reg_range(MAX17042_VFSOC, MAX17042_VFSOC), +}; + +static const struct regmap_range max77759_fg_ro_registers[] = { + regmap_reg_range(MAX17042_FSTAT, MAX17042_FSTAT), + regmap_reg_range(MAX17042_OCVInternal, MAX17042_OCVInternal), + regmap_reg_range(MAX17042_VFSOC, MAX17042_VFSOC), +}; + +static const struct regmap_access_table max77759_fg_write_table = { + .yes_ranges = max77759_fg_registers, + .n_yes_ranges = ARRAY_SIZE(max77759_fg_registers), + .no_ranges = max77759_fg_ro_registers, + .n_no_ranges = ARRAY_SIZE(max77759_fg_ro_registers), +}; + +static const struct regmap_access_table max77759_fg_rd_table = { + .yes_ranges = max77759_fg_registers, + .n_yes_ranges = ARRAY_SIZE(max77759_fg_registers), +}; + +static const struct regmap_config max77759_fg_regmap_cfg = { + .reg_bits = 8, + .val_bits = 16, + .max_register = 0xff, + .wr_table = &max77759_fg_write_table, + .rd_table = &max77759_fg_rd_table, + .val_format_endian = REGMAP_ENDIAN_NATIVE, + .cache_type = REGCACHE_NONE, +}; + static const struct power_supply_desc max17042_psy_desc = { .name = "max170xx_battery", .type = POWER_SUPPLY_TYPE_BATTERY, @@ -1045,6 +1087,7 @@ static int max17042_probe(struct i2c_client *client, struct device *dev, int irq { struct i2c_adapter *adapter = client->adapter; const struct power_supply_desc *max17042_desc = &max17042_psy_desc; + const struct regmap_config *regmap_config; struct power_supply_config psy_cfg = {}; struct max17042_chip *chip; int ret; @@ -1060,7 +1103,12 @@ static int max17042_probe(struct i2c_client *client, struct device *dev, int irq chip->dev = dev; chip->chip_type = chip_type; - chip->regmap = devm_regmap_init_i2c(client, &max17042_regmap_config); + + if (chip->chip_type == MAXIM_DEVICE_TYPE_MAX77759) + regmap_config = &max77759_fg_regmap_cfg; + else + regmap_config = &max17042_regmap_config; + chip->regmap = devm_regmap_init_i2c(client, regmap_config); if (IS_ERR(chip->regmap)) return dev_err_probe(dev, PTR_ERR(chip->regmap), "Failed to initialize regmap\n"); @@ -1241,6 +1289,8 @@ static const struct of_device_id max17042_dt_match[] __used = { .data = (void *) MAXIM_DEVICE_TYPE_MAX17055 }, { .compatible = "maxim,max77705-battery", .data = (void *) MAXIM_DEVICE_TYPE_MAX17047 }, + { .compatible = "maxim,max77759-fg", + .data = (void *) MAXIM_DEVICE_TYPE_MAX77759 }, { .compatible = "maxim,max77849-battery", .data = (void *) MAXIM_DEVICE_TYPE_MAX17047 }, { }, @@ -1253,6 +1303,7 @@ static const struct i2c_device_id max17042_id[] = { { "max17047", MAXIM_DEVICE_TYPE_MAX17047 }, { "max17050", MAXIM_DEVICE_TYPE_MAX17050 }, { "max17055", MAXIM_DEVICE_TYPE_MAX17055 }, + { "max77759-fg", MAXIM_DEVICE_TYPE_MAX77759 }, { "max77849-battery", MAXIM_DEVICE_TYPE_MAX17047 }, { } }; diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h index c417abd2ab70..05097f08ea36 100644 --- a/include/linux/power/max17042_battery.h +++ b/include/linux/power/max17042_battery.h @@ -105,7 +105,7 @@ enum max17042_register { MAX17042_OCV = 0xEE, - MAX17042_OCVInternal = 0xFB, /* MAX17055 VFOCV */ + MAX17042_OCVInternal = 0xFB, /* MAX17055/77759 VFOCV */ MAX17042_VFSOC = 0xFF, }; @@ -156,7 +156,7 @@ enum max17055_register { MAX17055_AtAvCap = 0xDF, }; -/* Registers specific to max17047/50/55 */ +/* Registers specific to max17047/50/55/77759 */ enum max17047_register { MAX17047_QRTbl00 = 0x12, MAX17047_FullSOCThr = 0x13, @@ -167,12 +167,32 @@ enum max17047_register { MAX17047_QRTbl30 = 0x42, }; +enum max77759_register { + MAX77759_AvgTA0 = 0x26, + MAX77759_AtTTF = 0x33, + MAX77759_Tconvert = 0x34, + MAX77759_AvgCurrent0 = 0x3B, + MAX77759_THMHOT = 0x40, + MAX77759_CTESample = 0x41, + MAX77759_ISys = 0x43, + MAX77759_AvgVCell0 = 0x44, + MAX77759_RlxSOC = 0x47, + MAX77759_AvgISys = 0x4B, + MAX77759_QH0 = 0x4C, + MAX77759_MixAtFull = 0x4F, + MAX77759_VSys = 0xB1, + MAX77759_TAlrtTh2 = 0xB2, + MAX77759_VByp = 0xB3, + MAX77759_IIn = 0xD0, +}; + enum max170xx_chip_type { MAXIM_DEVICE_TYPE_UNKNOWN = 0, MAXIM_DEVICE_TYPE_MAX17042, MAXIM_DEVICE_TYPE_MAX17047, MAXIM_DEVICE_TYPE_MAX17050, MAXIM_DEVICE_TYPE_MAX17055, + MAXIM_DEVICE_TYPE_MAX77759, MAXIM_DEVICE_TYPE_NUM }; -- cgit v1.2.3 From 83a86e27c34d06ec2dc117fb293e80f78402df49 Mon Sep 17 00:00:00 2001 From: André Draszik Date: Mon, 2 Mar 2026 13:32:09 +0000 Subject: power: supply: max17042: consider task period (max77759) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Several (register) values reported by the fuel gauge depend on its internal task period and it needs to be taken into account when calculating results. All relevant example formulas in the data sheet assume the default task period (of 5760) and final results need to be adjusted based on the task period in effect. Update the code as and where necessary. Reviewed-by: Peter Griffin Signed-off-by: André Draszik Link: https://patch.msgid.link/20260302-max77759-fg-v3-10-3c5f01dbda23@linaro.org Signed-off-by: Sebastian Reichel --- drivers/power/supply/max17042_battery.c | 20 ++++++++++++++++++++ include/linux/power/max17042_battery.h | 1 + 2 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/drivers/power/supply/max17042_battery.c b/drivers/power/supply/max17042_battery.c index b9a21cef2cc6..bafbf8706055 100644 --- a/drivers/power/supply/max17042_battery.c +++ b/drivers/power/supply/max17042_battery.c @@ -61,6 +61,7 @@ struct max17042_chip { struct work_struct work; int init_complete; int irq; + int task_period; }; static enum power_supply_property max17042_battery_props[] = { @@ -331,6 +332,8 @@ static int max17042_get_property(struct power_supply *psy, return ret; data64 = data * 5000000ll; + data64 *= chip->task_period; + do_div(data64, MAX17042_DEFAULT_TASK_PERIOD); do_div(data64, chip->pdata->r_sns); val->intval = data64; break; @@ -340,6 +343,8 @@ static int max17042_get_property(struct power_supply *psy, return ret; data64 = data * 5000000ll; + data64 *= chip->task_period; + do_div(data64, MAX17042_DEFAULT_TASK_PERIOD); do_div(data64, chip->pdata->r_sns); val->intval = data64; break; @@ -349,6 +354,8 @@ static int max17042_get_property(struct power_supply *psy, return ret; data64 = data * 5000000ll; + data64 *= chip->task_period; + do_div(data64, MAX17042_DEFAULT_TASK_PERIOD); do_div(data64, chip->pdata->r_sns); val->intval = data64; break; @@ -358,6 +365,8 @@ static int max17042_get_property(struct power_supply *psy, return ret; data64 = sign_extend64(data, 15) * 5000000ll; + data64 *= chip->task_period; + data64 = div_s64(data64, MAX17042_DEFAULT_TASK_PERIOD); val->intval = div_s64(data64, chip->pdata->r_sns); break; case POWER_SUPPLY_PROP_TEMP: @@ -1142,6 +1151,17 @@ static int max17042_probe(struct i2c_client *client, struct device *dev, int irq regmap_write(chip->regmap, MAX17042_LearnCFG, 0x0007); } + chip->task_period = MAX17042_DEFAULT_TASK_PERIOD; + if (chip->chip_type == MAXIM_DEVICE_TYPE_MAX77759) { + ret = regmap_read(chip->regmap, MAX17042_TaskPeriod, &val); + if (ret) + return dev_err_probe(dev, ret, + "failed to read task period\n"); + chip->task_period = val; + } + dev_dbg(dev, "task period: %#.4x (%d)\n", chip->task_period, + chip->task_period); + chip->battery = devm_power_supply_register(dev, max17042_desc, &psy_cfg); if (IS_ERR(chip->battery)) diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h index 05097f08ea36..d5b08313cf11 100644 --- a/include/linux/power/max17042_battery.h +++ b/include/linux/power/max17042_battery.h @@ -17,6 +17,7 @@ #define MAX17042_DEFAULT_VMAX (4500) /* LiHV cell max */ #define MAX17042_DEFAULT_TEMP_MIN (0) /* For sys without temp sensor */ #define MAX17042_DEFAULT_TEMP_MAX (700) /* 70 degrees Celcius */ +#define MAX17042_DEFAULT_TASK_PERIOD (5760) /* Consider RepCap which is less then 10 units below FullCAP full */ #define MAX17042_FULL_THRESHOLD 10 -- cgit v1.2.3 From 48f7a50c027dd2abb9e7b8a6ecc8e531d87f2c21 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Tue, 17 Feb 2026 14:11:11 +0100 Subject: stop_machine: Fix the documentation for a NULL cpus argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A recent refactoring of the kernel-docs for stop machine changed the description of the cpus parameter from "NULL = any online cpu" to "NULL = run on each online CPU". However the callback is only executed on a single CPU, not all of them. The old wording was a bit ambiguous and could have been read both ways. Reword the documentation to be correct again and hopefully also clearer. Fixes: fc6f89dc7078 ("stop_machine: Improve kernel-doc function-header comments") Signed-off-by: Thomas Weißschuh Signed-off-by: Paul E. McKenney Reviewed-by: Sebastian Andrzej Siewior --- include/linux/stop_machine.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 72820503514c..01011113d226 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -99,7 +99,7 @@ static inline void print_stop_info(const char *log_lvl, struct task_struct *task * stop_machine: freeze the machine on all CPUs and run this function * @fn: the function to run * @data: the data ptr to pass to @fn() - * @cpus: the cpus to run @fn() on (NULL = run on each online CPU) + * @cpus: the cpus to run @fn() on (NULL = one unspecified online CPU) * * Description: This causes a thread to be scheduled on every CPU, which * will run with interrupts disabled. Each CPU specified by @cpus will @@ -133,7 +133,7 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); * stop_machine_cpuslocked: freeze the machine on all CPUs and run this function * @fn: the function to run * @data: the data ptr to pass to @fn() - * @cpus: the cpus to run @fn() on (NULL = run on each online CPU) + * @cpus: the cpus to run @fn() on (NULL = one unspecified online CPU) * * Same as above. Avoids nested calls to cpus_read_lock(). * -- cgit v1.2.3 From 1ac252ad036cdb18f5fb7f76bb6061adfed9cedf Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 3 Mar 2026 23:04:04 +0200 Subject: rculist_bl: add hlist_bl_for_each_entry_continue_rcu Change the old hlist_bl_first_rcu to hlist_bl_first_rcu_dereference to indicate that it is a RCU dereference. Add hlist_bl_next_rcu and hlist_bl_first_rcu to use RCU pointers and use them to fix sparse warnings. Add hlist_bl_for_each_entry_continue_rcu. Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal --- include/linux/rculist_bl.h | 49 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h index 0b952d06eb0b..36363b876e53 100644 --- a/include/linux/rculist_bl.h +++ b/include/linux/rculist_bl.h @@ -8,21 +8,31 @@ #include #include +/* return the first ptr or next element in an RCU protected list */ +#define hlist_bl_first_rcu(head) \ + (*((struct hlist_bl_node __rcu **)(&(head)->first))) +#define hlist_bl_next_rcu(node) \ + (*((struct hlist_bl_node __rcu **)(&(node)->next))) + static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); - rcu_assign_pointer(h->first, + rcu_assign_pointer(hlist_bl_first_rcu(h), (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK)); } -static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) -{ - return (struct hlist_bl_node *) - ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK); -} +#define hlist_bl_first_rcu_dereference(head) \ +({ \ + struct hlist_bl_head *__head = (head); \ + \ + (struct hlist_bl_node *) \ + ((unsigned long)rcu_dereference_check(hlist_bl_first_rcu(__head), \ + hlist_bl_is_locked(__head)) & \ + ~LIST_BL_LOCKMASK); \ +}) /** * hlist_bl_del_rcu - deletes entry from hash list without re-initialization @@ -73,7 +83,7 @@ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, { struct hlist_bl_node *first; - /* don't need hlist_bl_first_rcu because we're under lock */ + /* don't need hlist_bl_first_rcu* because we're under lock */ first = hlist_bl_first(h); n->next = first; @@ -93,9 +103,30 @@ static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, * */ #define hlist_bl_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = hlist_bl_first_rcu(head); \ + for (pos = hlist_bl_first_rcu_dereference(head); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ - pos = rcu_dereference_raw(pos->next)) + pos = rcu_dereference_raw(hlist_bl_next_rcu(pos))) + +/** + * hlist_bl_for_each_entry_continue_rcu - continue iteration over list of given + * type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_bl_node to use as a loop cursor. + * @member: the name of the hlist_bl_node within the struct. + * + * Continue to iterate over list of given type, continuing after + * the current position which must have been in the list when the RCU read + * lock was taken. + * This would typically require either that you obtained the node from a + * previous walk of the list in the same RCU read-side critical section, or + * that you held some sort of non-RCU reference (such as a reference count) + * to keep the node alive *and* in the list. + */ +#define hlist_bl_for_each_entry_continue_rcu(tpos, pos, member) \ + for (pos = rcu_dereference_raw(hlist_bl_next_rcu(&(tpos)->member)); \ + pos && \ + ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ + pos = rcu_dereference_raw(hlist_bl_next_rcu(pos))) #endif -- cgit v1.2.3 From 44d93cf1abb6a85d65c3b4b027c82d44263de6a5 Mon Sep 17 00:00:00 2001 From: Karthikeyan Kathirvel Date: Wed, 4 Mar 2026 14:23:42 +0530 Subject: wifi: UHR: define DPS/DBE/P-EDCA elements and fix size parsing Add UHR Operation and Capability definitions and parsing helpers: - Define ieee80211_uhr_dps_info, ieee80211_uhr_dbe_info, ieee80211_uhr_p_edca_info with masks. - Update ieee80211_uhr_oper_size_ok() to account for optional DPS/DBE/P-EDCA blocks. - Move NPCA pointer position after DPS Operation Parameter if it is present in ieee80211_uhr_oper_size_ok(). - Move NPCA pointer position after DPS info if it is present in ieee80211_uhr_npca_info(). Signed-off-by: Karthikeyan Kathirvel Link: https://patch.msgid.link/20260304085343.1093993-2-karthikeyan.kathirvel@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/linux/ieee80211-uhr.h | 271 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 265 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211-uhr.h b/include/linux/ieee80211-uhr.h index 132acced7d79..9729d23e4766 100644 --- a/include/linux/ieee80211-uhr.h +++ b/include/linux/ieee80211-uhr.h @@ -29,11 +29,216 @@ struct ieee80211_uhr_operation { #define IEEE80211_UHR_NPCA_PARAMS_MOPLEN 0x00400000 #define IEEE80211_UHR_NPCA_PARAMS_DIS_SUBCH_BMAP_PRES 0x00800000 +/** + * struct ieee80211_uhr_npca_info - npca operation information + * + * This structure is the "NPCA Operation Parameters field format" of "UHR + * Operation Element" fields as described in P802.11bn_D1.3 + * subclause 9.4.2.353. See Figure 9-aa4. + * + * Refer to IEEE80211_UHR_NPCA* + * @params: + * NPCA Primary Channel - NPCA primary channel + * NPCA_Min Duration Threshold - Minimum duration of inter-BSS activity + * NPCA Switching Delay - + * Time needed by an NPCA AP to switch from the + * BSS primary channel to the NPCA primary channel + * in the unit of 4 µs. + * NPCA Switching Back Delay - + * Time to switch from the NPCA primary channel + * to the BSS primary channel in the unit of 4 µs. + * NPCA Initial QSRC - + * Initialize the EDCAF QSRC[AC] variables + * when an NPCA STA in the BSS + * switches to NPCA operation. + * NPCA MOPLEN - + * Indicates which conditions can be used to + * initiate an NPCA operation, + * 1 -> both PHYLEN NPCA operation and MOPLEN + * NPCA operation are + * permitted in the BSS + * 0 -> only PHYLEN NPCA operation is allowed in the BSS. + * NPCA Disabled Subchannel Bitmap Present - + * Indicates whether the NPCA Disabled Subchannel + * Bitmap field is present. A 1 in this field indicates that + * the NPCA Disabled Subchannel Bitmap field is present + * @dis_subch_bmap: + * A bit in the bitmap that lies within the BSS bandwidth is set + * to 1 to indicate that the corresponding 20 MHz subchannel is + * punctured and is set to 0 to indicate that the corresponding + * 20 MHz subchannel is not punctured. A bit in the bitmap that + * falls outside of the BSS bandwidth is reserved. This field is + * present when the value of the NPCA Disabled Subchannel Bitmap + * Field Present field is equal to 1, and not present, otherwise + */ struct ieee80211_uhr_npca_info { __le32 params; __le16 dis_subch_bmap[]; } __packed; +#define IEEE80211_UHR_DPS_PADDING_DELAY 0x0000003F +#define IEEE80211_UHR_DPS_TRANSITION_DELAY 0x00003F00 +#define IEEE80211_UHR_DPS_ICF_REQUIRED 0x00010000 +#define IEEE80211_UHR_DPS_PARAMETERIZED_FLAG 0x00020000 +#define IEEE80211_UHR_DPS_LC_MODE_BW 0x001C0000 +#define IEEE80211_UHR_DPS_LC_MODE_NSS 0x01E00000 +#define IEEE80211_UHR_DPS_LC_MODE_MCS 0x1E000000 +#define IEEE80211_UHR_DPS_MOBILE_AP_DPS_STATIC_HCM 0x20000000 + +/** + * struct ieee80211_uhr_dps_info - DPS operation information + * + * This structure is the "DPS Operation Parameter field" of "UHR + * Operation Element" fields as described in P802.11bn_D1.3 + * subclause 9.4.1.87. See Figure 9-207u. + * + * Refer to IEEE80211_UHR_DPS* + * @params: + * DPS Padding Delay - + * Indicates the minimum MAC padding + * duration that is required by a DPS STA + * in an ICF to cause the STA to transition + * from the lower capability mode to the + * higher capability mode. The DPS Padding + * Delay field is in units of 4 µs. + * DPS Transition Delay - + * Indicates the amount of time required by a + * DPS STA to transition from the higher + * capability mode to the lower capability + * mode. The DPS Transition Delay field is in + * units of 4 µs. + * ICF Required - + * Indicates when the DPS assisting STA needs + * to transmit an ICF frame to the peer DPS STA + * before performing the frame exchanges with + * the peer DPS STA in a TXOP. + * 1 -> indicates that the transmission of the + * ICF frame to the peer DPS STA prior to + * any frame exchange is needed. + * 0 -> ICF transmission before the frame + * exchanges with the peer DPS STA is only + * needed if the frame exchange is performed + * in the HC mode. + * Parameterized Flag - + * 0 -> indicates that only 20 MHz, 1 SS, + * non-HT PPDU format with the data + * rate of 6, 12, and 24 Mb/s as the + * default mode are supported by the + * DPS STA in the LC mode + * 1 -> indicates that a bandwidth up to the + * bandwidth indicated in the LC Mode + * Bandwidth field, a number of spatial + * streams up to the NSS indicated in + * the LC Mode Nss field, and an MCS up + * to the MCS indicated in the LC Mode + * MCS fields are supported by the DPS + * STA in the LC mode as the + * parameterized mode. + * LC Mode Bandwidth - + * Indicates the maximum bandwidth supported + * by the STA in the LC mode. + * LC Mode NSS - + * Indicates the maximum number of the spatial + * streams supported by the STA in the LC mode. + * LC Mode MCS - + * Indicates the highest MCS supported by the STA + * in the LC mode. + * Mobile AP DPS Static HCM - + * 1 -> indicates that it will remain in the DPS high + * capability mode until the next TBTT on that + * link. + * 0 -> otherwise. + */ +struct ieee80211_uhr_dps_info { + __le32 params; +} __packed; + +#define IEEE80211_UHR_DBE_OPER_BANDWIDTH 0x07 +#define IEEE80211_UHR_DBE_OPER_DIS_SUBCHANNEL_BITMAP_PRES 0x08 + +/** + * enum ieee80211_uhr_dbe_oper_bw - DBE Operational Bandwidth + * + * Encoding for the DBE Operational Bandwidth field in the UHR Operation + * element (DBE Operation Parameters). + * + * @IEEE80211_UHR_DBE_OPER_BW_40: 40 MHz operational DBE bandwidth + * @IEEE80211_UHR_DBE_OPER_BW_80: 80 MHz operational DBE bandwidth + * @IEEE80211_UHR_DBE_OPER_BW_160: 160 MHz operational DBE bandwidth + * @IEEE80211_UHR_DBE_OPER_BW_320_1: 320-1 MHz operational DBE bandwidth + * @IEEE80211_UHR_DBE_OPER_BW_320_2: 320-2 MHz operational DBE bandwidth + */ +enum ieee80211_uhr_dbe_oper_bw { + IEEE80211_UHR_DBE_OPER_BW_40 = 1, + IEEE80211_UHR_DBE_OPER_BW_80 = 2, + IEEE80211_UHR_DBE_OPER_BW_160 = 3, + IEEE80211_UHR_DBE_OPER_BW_320_1 = 4, + IEEE80211_UHR_DBE_OPER_BW_320_2 = 5, +}; + +/** + * struct ieee80211_uhr_dbe_info - DBE operation information + * + * This structure is the "DBE Operation Parameters field" of + * "UHR Operation Element" fields as described in P802.11bn_D1.3 + * subclause 9.4.2.353. See Figure 9-aa6. + * + * Refer to IEEE80211_UHR_DBE_OPER* + * @params: + * B0-B2 - DBE Operational Bandwidth field, see + * "enum ieee80211_uhr_dbe_oper_bw" for values. + * Value 0 is reserved. + * Value 1 indicates 40 MHz operational DBE bandwidth. + * Value 2 indicates 80 MHz operational DBE bandwidth. + * Value 3 indicates 160 MHz operational DBE bandwidth. + * Value 4 indicates 320-1 MHz operational DBE bandwidth. + * Value 5 indicates 320-2 MHz operational DBE bandwidth. + * Values 6 to 7 are reserved. + * B3 - DBE Disabled Subchannel Bitmap Present. + * @dis_subch_bmap: DBE Disabled Subchannel Bitmap field is set to indicate + * disabled 20 MHz subchannels within the DBE Bandwidth. + */ +struct ieee80211_uhr_dbe_info { + u8 params; + __le16 dis_subch_bmap[]; +} __packed; + +#define IEEE80211_UHR_P_EDCA_ECWMIN 0x0F +#define IEEE80211_UHR_P_EDCA_ECWMAX 0xF0 +#define IEEE80211_UHR_P_EDCA_AIFSN 0x000F +#define IEEE80211_UHR_P_EDCA_CW_DS 0x0030 +#define IEEE80211_UHR_P_EDCA_PSRC_THRESHOLD 0x01C0 +#define IEEE80211_UHR_P_EDCA_QSRC_THRESHOLD 0x0600 + +/** + * struct ieee80211_uhr_p_edca_info - P-EDCA operation information + * + * This structure is the "P-EDCA Operation Parameters field" of + * "UHR Operation Element" fields as described in P802.11bn_D1.3 + * subclause 9.4.2.353. See Figure 9-aa5. + * + * Refer to IEEE80211_UHR_P_EDCA* + * @p_edca_ec: P-EDCA ECWmin and ECWmax. + * These fields indicate the CWmin and CWmax values used by a + * P-EDCA STA during P-EDCA contention. + * @params: AIFSN, CW DS, PSRC threshold, and QSRC threshold. + * - The AIFSN field indicates the AIFSN value used by a P-EDCA STA + * during P-EDCA contention. + * - The CW DS field indicates the value used for randomization of the + * transmission slot of the DS-CTS frame. The value 3 is reserved. + * The value 0 indicates that randomization is not enabled. + * - The P-EDCA PSRC threshold field indicates the maximum number of + * allowed consecutive DS-CTS transmissions. The value 0 and values + * greater than 4 are reserved. + * - The P-EDCA QSRC threshold field indicates the value of the + * QSRC[AC_VO] counter required to start P-EDCA contention. The + * value 0 is reserved. + */ +struct ieee80211_uhr_p_edca_info { + u8 p_edca_ec; + __le16 params; +} __packed; + static inline bool ieee80211_uhr_oper_size_ok(const u8 *data, u8 len, bool beacon) { @@ -47,19 +252,52 @@ static inline bool ieee80211_uhr_oper_size_ok(const u8 *data, u8 len, if (beacon) return true; - /* FIXME: DPS, DBE, P-EDCA (consider order, also relative to NPCA) */ + /* DPS Operation Parameters (fixed 4 bytes) */ + if (oper->params & cpu_to_le16(IEEE80211_UHR_OPER_PARAMS_DPS_ENA)) { + needed += sizeof(struct ieee80211_uhr_dps_info); + if (len < needed) + return false; + } + /* NPCA Operation Parameters (fixed 4 bytes + optional 2 bytes) */ if (oper->params & cpu_to_le16(IEEE80211_UHR_OPER_PARAMS_NPCA_ENA)) { const struct ieee80211_uhr_npca_info *npca = - (const void *)oper->variable; + (const void *)(data + needed); needed += sizeof(*npca); - if (len < needed) return false; - if (npca->params & cpu_to_le32(IEEE80211_UHR_NPCA_PARAMS_DIS_SUBCH_BMAP_PRES)) + if (npca->params & + cpu_to_le32(IEEE80211_UHR_NPCA_PARAMS_DIS_SUBCH_BMAP_PRES)) { needed += sizeof(npca->dis_subch_bmap[0]); + if (len < needed) + return false; + } + } + + /* P-EDCA Operation Parameters (fixed 3 bytes) */ + if (oper->params & cpu_to_le16(IEEE80211_UHR_OPER_PARAMS_PEDCA_ENA)) { + needed += sizeof(struct ieee80211_uhr_p_edca_info); + if (len < needed) + return false; + } + + /* DBE Operation Parameters (fixed 1 byte + optional 2 bytes) */ + if (oper->params & cpu_to_le16(IEEE80211_UHR_OPER_PARAMS_DBE_ENA)) { + const struct ieee80211_uhr_dbe_info *dbe = + (const void *)(data + needed); + + needed += sizeof(*dbe); + if (len < needed) + return false; + + if (dbe->params & + IEEE80211_UHR_DBE_OPER_DIS_SUBCHANNEL_BITMAP_PRES) { + needed += sizeof(dbe->dis_subch_bmap[0]); + if (len < needed) + return false; + } } return len >= needed; @@ -72,12 +310,15 @@ static inline bool ieee80211_uhr_oper_size_ok(const u8 *data, u8 len, static inline const struct ieee80211_uhr_npca_info * ieee80211_uhr_npca_info(const struct ieee80211_uhr_operation *oper) { + const u8 *pos = oper->variable; + if (!(oper->params & cpu_to_le16(IEEE80211_UHR_OPER_PARAMS_NPCA_ENA))) return NULL; - /* FIXME: DPS */ + if (oper->params & cpu_to_le16(IEEE80211_UHR_OPER_PARAMS_DPS_ENA)) + pos += sizeof(struct ieee80211_uhr_dps_info); - return (const void *)oper->variable; + return (const void *)pos; } static inline const __le16 * @@ -131,6 +372,24 @@ ieee80211_uhr_npca_dis_subch_bitmap(const struct ieee80211_uhr_operation *oper) #define IEEE80211_UHR_MAC_CAP_DBE_EHT_MCS_MAP_160_PRES 0x08 #define IEEE80211_UHR_MAC_CAP_DBE_EHT_MCS_MAP_320_PRES 0x10 +/** + * enum ieee80211_uhr_dbe_max_supported_bw - DBE Maximum Supported Bandwidth + * + * As per spec P802.11bn_D1.3 "Table 9-bb5—Encoding of the DBE Maximum + * Supported Bandwidth field". + * + * @IEEE80211_UHR_DBE_MAX_BW_40: Indicates 40 MHz DBE max supported bw + * @IEEE80211_UHR_DBE_MAX_BW_80: Indicates 80 MHz DBE max supported bw + * @IEEE80211_UHR_DBE_MAX_BW_160: Indicates 160 MHz DBE max supported bw + * @IEEE80211_UHR_DBE_MAX_BW_320: Indicates 320 MHz DBE max supported bw + */ +enum ieee80211_uhr_dbe_max_supported_bw { + IEEE80211_UHR_DBE_MAX_BW_40 = 1, + IEEE80211_UHR_DBE_MAX_BW_80 = 2, + IEEE80211_UHR_DBE_MAX_BW_160 = 3, + IEEE80211_UHR_DBE_MAX_BW_320 = 4, +}; + struct ieee80211_uhr_cap_mac { u8 mac_cap[5]; } __packed; -- cgit v1.2.3 From 4059172b2a78a71d15d8fcd8d3fd8ea1ba65d25b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 13 Feb 2026 17:26:47 -0800 Subject: KVM: x86: Move kvm_rebooting to x86 Move kvm_rebooting, which is only read by x86, to KVM x86 so that it can be moved again to core x86 code. Add a "shutdown" arch hook to facilate setting the flag in KVM x86, along with a pile of comments to provide more context around what KVM x86 is doing and why. Reviewed-by: Chao Gao Acked-by: Dave Hansen Tested-by: Chao Gao Reviewed-by: Dan Williams Tested-by: Sagi Shahar Link: https://patch.msgid.link/20260214012702.2368778-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 22 ++++++++++++++++++++++ arch/x86/kvm/x86.h | 1 + include/linux/kvm_host.h | 8 +++++++- virt/kvm/kvm_main.c | 14 +++++++------- 4 files changed, 37 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a03530795707..7ac3578e6ec0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -700,6 +700,9 @@ static void drop_user_return_notifiers(void) kvm_on_user_return(&msrs->urn); } +__visible bool kvm_rebooting; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_rebooting); + /* * Handle a fault on a hardware virtualization (VMX or SVM) instruction. * @@ -13177,6 +13180,25 @@ int kvm_arch_enable_virtualization_cpu(void) return 0; } +void kvm_arch_shutdown(void) +{ + /* + * Set kvm_rebooting to indicate that KVM has asynchronously disabled + * hardware virtualization, i.e. that errors and/or exceptions on SVM + * and VMX instructions are expected and should be ignored. + */ + kvm_rebooting = true; + + /* + * Ensure kvm_rebooting is visible before IPIs are sent to other CPUs + * to disable virtualization. Effectively pairs with the reception of + * the IPI (kvm_rebooting is read in task/exception context, but only + * _needs_ to be read as %true after the IPI function callback disables + * virtualization). + */ + smp_wmb(); +} + void kvm_arch_disable_virtualization_cpu(void) { kvm_x86_call(disable_virtualization_cpu)(); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 94d4f07aaaa0..b314649e5c02 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -54,6 +54,7 @@ struct kvm_host_values { u64 arch_capabilities; }; +extern bool kvm_rebooting; void kvm_spurious_fault(void); #define SIZE_OF_MEMSLOTS_HASHTABLE \ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 34759a262b28..7c4ebd5210ec 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1627,6 +1627,13 @@ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {} #endif #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +/* + * kvm_arch_shutdown() is invoked immediately prior to forcefully disabling + * hardware virtualization on all CPUs via IPI function calls (in preparation + * for shutdown or reboot), e.g. to allow arch code to prepare for disabling + * virtualization while KVM may be actively running vCPUs. + */ +void kvm_arch_shutdown(void); /* * kvm_arch_{enable,disable}_virtualization() are called on one CPU, under * kvm_usage_lock, immediately after/before 0=>1 and 1=>0 transitions of @@ -2313,7 +2320,6 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING extern bool enable_virt_at_load; -extern bool kvm_rebooting; #endif extern unsigned int halt_poll_ns; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1bc1da66b4b0..d27bf2488b12 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5578,13 +5578,15 @@ bool enable_virt_at_load = true; module_param(enable_virt_at_load, bool, 0444); EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_virt_at_load); -__visible bool kvm_rebooting; -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_rebooting); - static DEFINE_PER_CPU(bool, virtualization_enabled); static DEFINE_MUTEX(kvm_usage_lock); static int kvm_usage_count; +__weak void kvm_arch_shutdown(void) +{ + +} + __weak void kvm_arch_enable_virtualization(void) { @@ -5638,10 +5640,9 @@ static int kvm_offline_cpu(unsigned int cpu) static void kvm_shutdown(void *data) { + kvm_arch_shutdown(); + /* - * Disable hardware virtualization and set kvm_rebooting to indicate - * that KVM has asynchronously disabled hardware virtualization, i.e. - * that relevant errors and exceptions aren't entirely unexpected. * Some flavors of hardware virtualization need to be disabled before * transferring control to firmware (to perform shutdown/reboot), e.g. * on x86, virtualization can block INIT interrupts, which are used by @@ -5650,7 +5651,6 @@ static void kvm_shutdown(void *data) * 100% comprehensive. */ pr_info("kvm: exiting hardware virtualization\n"); - kvm_rebooting = true; on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1); } -- cgit v1.2.3 From d30372d0b7e637475c79a785d055f4eb8c863656 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 13 Feb 2026 17:27:01 -0800 Subject: KVM: Bury kvm_{en,dis}able_virtualization() in kvm_main.c once more Now that TDX handles doing VMXON without KVM's involvement, bury the top-level APIs to enable and disable virtualization back in kvm_main.c. No functional change intended. Reviewed-by: Dan Williams Reviewed-by: Chao Gao Tested-by: Chao Gao Tested-by: Sagi Shahar Link: https://patch.msgid.link/20260214012702.2368778-16-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 8 -------- virt/kvm/kvm_main.c | 17 +++++++++++++---- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7c4ebd5210ec..fbd549bdf052 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2613,12 +2613,4 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range); #endif -#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING -int kvm_enable_virtualization(void); -void kvm_disable_virtualization(void); -#else -static inline int kvm_enable_virtualization(void) { return 0; } -static inline void kvm_disable_virtualization(void) { } -#endif - #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d27bf2488b12..a9ccf9a1c41e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1102,6 +1102,9 @@ static inline struct kvm_io_bus *kvm_get_bus_for_destruction(struct kvm *kvm, !refcount_read(&kvm->users_count)); } +static int kvm_enable_virtualization(void); +static void kvm_disable_virtualization(void); + static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) { struct kvm *kvm = kvm_arch_alloc_vm(); @@ -5689,7 +5692,7 @@ static struct syscore kvm_syscore = { .ops = &kvm_syscore_ops, }; -int kvm_enable_virtualization(void) +static int kvm_enable_virtualization(void) { int r; @@ -5734,9 +5737,8 @@ err_cpuhp: --kvm_usage_count; return r; } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_enable_virtualization); -void kvm_disable_virtualization(void) +static void kvm_disable_virtualization(void) { guard(mutex)(&kvm_usage_lock); @@ -5747,7 +5749,6 @@ void kvm_disable_virtualization(void) cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); kvm_arch_disable_virtualization(); } -EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_disable_virtualization); static int kvm_init_virtualization(void) { @@ -5763,6 +5764,14 @@ static void kvm_uninit_virtualization(void) kvm_disable_virtualization(); } #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ +static int kvm_enable_virtualization(void) +{ + return 0; +} +static void kvm_disable_virtualization(void) +{ + +} static int kvm_init_virtualization(void) { return 0; -- cgit v1.2.3 From c66e0f453d1afa82534383c58d503238a43fa76c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Mar 2026 01:27:47 +0000 Subject: net: use ktime_t in struct scm_timestamping_internal Instead of using struct timespec64 in scm_timestamping_internal, use ktime_t, saving 24 bytes in kernel stack. This makes tcp_update_recv_tstamps() small enough to be inlined. The ktime_t -> timespec64 conversions happen after socket lock has been released in tcp_recvmsg(), and only if the application requested them. $ scripts/bloat-o-meter -t vmlinux.0 vmlinux add/remove: 0/2 grow/shrink: 5/4 up/down: 146/-277 (-131) Function old new delta tcp_zerocopy_receive 2383 2425 +42 mptcp_recvmsg 1565 1607 +42 tcp_recvmsg_locked 3797 3823 +26 put_cmsg_scm_timestamping64 131 149 +18 put_cmsg_scm_timestamping 131 149 +18 __pfx_tcp_update_recv_tstamps 16 - -16 do_tcp_getsockopt 4024 4006 -18 tcp_recv_timestamp 474 430 -44 tcp_zc_handle_leftover 417 371 -46 __sock_recv_timestamp 1087 1031 -56 tcp_update_recv_tstamps 97 - -97 Total: Before=25223788, After=25223657, chg -0.00% Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: Jason Xing Link: https://patch.msgid.link/20260304012747.881644-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 2 +- include/net/tcp.h | 11 +++++++-- net/core/scm.c | 12 ++++++---- net/ipv4/tcp.c | 61 ++++++++++++++++++-------------------------------- net/socket.c | 23 +++++++++---------- 5 files changed, 51 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index ec715ad4bf25..ec4a0a025793 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -415,7 +415,7 @@ struct __kernel_timespec; struct old_timespec32; struct scm_timestamping_internal { - struct timespec64 ts[3]; + ktime_t ts[3]; }; extern void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss); diff --git a/include/net/tcp.h b/include/net/tcp.h index 9cf8785ef0b4..fea6081cf6c7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -503,8 +503,15 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags); int tcp_set_rcvlowat(struct sock *sk, int val); int tcp_set_window_clamp(struct sock *sk, int val); -void tcp_update_recv_tstamps(struct sk_buff *skb, - struct scm_timestamping_internal *tss); + +static inline void +tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping_internal *tss) +{ + tss->ts[0] = skb->tstamp; + tss->ts[2] = skb_hwtstamps(skb)->hwtstamp; +} + void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss); void tcp_data_ready(struct sock *sk); diff --git a/net/core/scm.c b/net/core/scm.c index a29aa8fb8065..eec13f50ecaf 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -318,8 +318,10 @@ void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_int int i; for (i = 0; i < ARRAY_SIZE(tss.ts); i++) { - tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec; - tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec; + struct timespec64 tv = ktime_to_timespec64(tss_internal->ts[i]); + + tss.ts[i].tv_sec = tv.tv_sec; + tss.ts[i].tv_nsec = tv.tv_nsec; } put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_NEW, sizeof(tss), &tss); @@ -332,8 +334,10 @@ void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_inter int i; for (i = 0; i < ARRAY_SIZE(tss.ts); i++) { - tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec; - tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec; + struct timespec64 tv = ktime_to_timespec64(tss_internal->ts[i]); + + tss.ts[i].tv_sec = tv.tv_sec; + tss.ts[i].tv_nsec = tv.tv_nsec; } put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_OLD, sizeof(tss), &tss); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5997e0fb7a45..1c8be22a361e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1871,20 +1871,6 @@ int tcp_set_rcvlowat(struct sock *sk, int val) } EXPORT_IPV6_MOD(tcp_set_rcvlowat); -void tcp_update_recv_tstamps(struct sk_buff *skb, - struct scm_timestamping_internal *tss) -{ - if (skb->tstamp) - tss->ts[0] = ktime_to_timespec64(skb->tstamp); - else - tss->ts[0] = (struct timespec64) {0}; - - if (skb_hwtstamps(skb)->hwtstamp) - tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); - else - tss->ts[2] = (struct timespec64) {0}; -} - #ifdef CONFIG_MMU static const struct vm_operations_struct tcp_vm_ops = { }; @@ -2376,22 +2362,23 @@ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, { int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); u32 tsflags = READ_ONCE(sk->sk_tsflags); - bool has_timestamping = false; - if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) { + if (tss->ts[0]) { if (sock_flag(sk, SOCK_RCVTSTAMP)) { + struct timespec64 tv = ktime_to_timespec64(tss->ts[0]); + if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { if (new_tstamp) { struct __kernel_timespec kts = { - .tv_sec = tss->ts[0].tv_sec, - .tv_nsec = tss->ts[0].tv_nsec, + .tv_sec = tv.tv_sec, + .tv_nsec = tv.tv_nsec, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, sizeof(kts), &kts); } else { struct __kernel_old_timespec ts_old = { - .tv_sec = tss->ts[0].tv_sec, - .tv_nsec = tss->ts[0].tv_nsec, + .tv_sec = tv.tv_sec, + .tv_nsec = tv.tv_nsec, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, sizeof(ts_old), &ts_old); @@ -2399,41 +2386,37 @@ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, } else { if (new_tstamp) { struct __kernel_sock_timeval stv = { - .tv_sec = tss->ts[0].tv_sec, - .tv_usec = tss->ts[0].tv_nsec / 1000, + .tv_sec = tv.tv_sec, + .tv_usec = tv.tv_nsec / 1000, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, sizeof(stv), &stv); } else { - struct __kernel_old_timeval tv = { - .tv_sec = tss->ts[0].tv_sec, - .tv_usec = tss->ts[0].tv_nsec / 1000, + struct __kernel_old_timeval otv = { + .tv_sec = tv.tv_sec, + .tv_usec = tv.tv_nsec / 1000, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, - sizeof(tv), &tv); + sizeof(otv), &otv); } } } - if (tsflags & SOF_TIMESTAMPING_SOFTWARE && + if (!(tsflags & SOF_TIMESTAMPING_SOFTWARE && (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE || - !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) - has_timestamping = true; - else - tss->ts[0] = (struct timespec64) {0}; + !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER)))) + tss->ts[0] = 0; } - if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { - if (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE && + if (tss->ts[2]) { + if (!(tsflags & SOF_TIMESTAMPING_RAW_HARDWARE && (tsflags & SOF_TIMESTAMPING_RX_HARDWARE || - !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) - has_timestamping = true; - else - tss->ts[2] = (struct timespec64) {0}; + !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER)))) + tss->ts[2] = 0; } - if (has_timestamping) { - tss->ts[1] = (struct timespec64) {0}; + if (tss->ts[0] | tss->ts[2]) { + tss->ts[1] = 0; if (sock_flag(sk, SOCK_TSTAMP_NEW)) put_cmsg_scm_timestamping64(msg, tss); else diff --git a/net/socket.c b/net/socket.c index 05952188127f..68829d09bcf1 100644 --- a/net/socket.c +++ b/net/socket.c @@ -912,11 +912,10 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, { int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); - struct scm_timestamping_internal tss; - int empty = 1, false_tstamp = 0; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); - int if_index; + struct scm_timestamping_internal tss; + int if_index, false_tstamp = 0; ktime_t hwtstamp; u32 tsflags; @@ -961,12 +960,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, memset(&tss, 0, sizeof(tss)); tsflags = READ_ONCE(sk->sk_tsflags); - if ((tsflags & SOF_TIMESTAMPING_SOFTWARE && - (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE || - skb_is_err_queue(skb) || - !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) && - ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0)) - empty = 0; + if (tsflags & SOF_TIMESTAMPING_SOFTWARE && + (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE || + skb_is_err_queue(skb) || + !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER))) + tss.ts[0] = skb->tstamp; + if (shhwtstamps && (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE && (tsflags & SOF_TIMESTAMPING_RX_HARDWARE || @@ -983,15 +982,15 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, hwtstamp = ptp_convert_timestamp(&hwtstamp, READ_ONCE(sk->sk_bind_phc)); - if (ktime_to_timespec64_cond(hwtstamp, tss.ts + 2)) { - empty = 0; + if (hwtstamp) { + tss.ts[2] = hwtstamp; if ((tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && !skb_is_err_queue(skb)) put_ts_pktinfo(msg, skb, if_index); } } - if (!empty) { + if (tss.ts[0] | tss.ts[2]) { if (sock_flag(sk, SOCK_TSTAMP_NEW)) put_cmsg_scm_timestamping64(msg, &tss); else -- cgit v1.2.3 From 01b7768578a68abe597cfb36ebe0fc47c9305f88 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Wed, 25 Feb 2026 16:19:31 +0200 Subject: net/mlx5: Add TLP emulation device capabilities Introduce the hardware structures and definitions needed for the driver support of TLP emulation in mlx5_ifc. Signed-off-by: Maher Sanalla Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 775cb0c56865..a3948b36820d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1389,6 +1389,26 @@ struct mlx5_ifc_virtio_emulation_cap_bits { u8 reserved_at_1c0[0x640]; }; +struct mlx5_ifc_tlp_dev_emu_capabilities_bits { + u8 reserved_at_0[0x20]; + + u8 reserved_at_20[0x13]; + u8 log_tlp_rsp_gw_page_stride[0x5]; + u8 reserved_at_38[0x8]; + + u8 reserved_at_40[0xc0]; + + u8 reserved_at_100[0xc]; + u8 tlp_rsp_gw_num_pages[0x4]; + u8 reserved_at_110[0x10]; + + u8 reserved_at_120[0xa0]; + + u8 tlp_rsp_gw_pages_bar_offset[0x40]; + + u8 reserved_at_200[0x600]; +}; + enum { MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_1_BYTE = 0x0, MLX5_ATOMIC_CAPS_ATOMIC_SIZE_QP_2_BYTES = 0x2, @@ -1961,7 +1981,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_rqt[0x5]; u8 reserved_at_390[0x3]; u8 log_max_rqt_size[0x5]; - u8 reserved_at_398[0x1]; + u8 tlp_device_emulation_manager[0x1]; u8 vnic_env_cnt_bar_uar_access[0x1]; u8 vnic_env_cnt_odp_page_fault[0x1]; u8 log_max_tis_per_sq[0x5]; @@ -3830,6 +3850,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_tls_cap_bits tls_cap; struct mlx5_ifc_device_mem_cap_bits device_mem_cap; struct mlx5_ifc_virtio_emulation_cap_bits virtio_emulation_cap; + struct mlx5_ifc_tlp_dev_emu_capabilities_bits tlp_dev_emu_capabilities; struct mlx5_ifc_macsec_cap_bits macsec_cap; struct mlx5_ifc_crypto_cap_bits crypto_cap; struct mlx5_ifc_ipsec_cap_bits ipsec_cap; -- cgit v1.2.3 From 385a06f74ff7a03e3fb0b15fb87cfeb052d75073 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Wed, 25 Feb 2026 16:19:32 +0200 Subject: net/mlx5: Expose TLP emulation capabilities Expose and query TLP device emulation caps on driver load. Signed-off-by: Maher Sanalla Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 6 ++++++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 1 + include/linux/mlx5/device.h | 9 +++++++++ 3 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index eeb4437975f2..55249f405841 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -255,6 +255,12 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) return err; } + if (MLX5_CAP_GEN(dev, tlp_device_emulation_manager)) { + err = mlx5_core_get_caps_mode(dev, MLX5_CAP_TLP_EMULATION, HCA_CAP_OPMOD_GET_CUR); + if (err) + return err; + } + if (MLX5_CAP_GEN(dev, ipsec_offload)) { err = mlx5_core_get_caps_mode(dev, MLX5_CAP_IPSEC, HCA_CAP_OPMOD_GET_CUR); if (err) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index fdc3ba20912e..b0bc4a7d4a93 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1772,6 +1772,7 @@ static const int types[] = { MLX5_CAP_CRYPTO, MLX5_CAP_SHAMPO, MLX5_CAP_ADV_RDMA, + MLX5_CAP_TLP_EMULATION, }; static void mlx5_hca_caps_free(struct mlx5_core_dev *dev) diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index b37fe39cef27..25c6b42140b2 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1259,6 +1259,7 @@ enum mlx5_cap_type { MLX5_CAP_PORT_SELECTION = 0x25, MLX5_CAP_ADV_VIRTUALIZATION = 0x26, MLX5_CAP_ADV_RDMA = 0x28, + MLX5_CAP_TLP_EMULATION = 0x2a, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1481,6 +1482,14 @@ enum mlx5_qcam_feature_groups { MLX5_GET64(virtio_emulation_cap, \ (mdev)->caps.hca[MLX5_CAP_VDPA_EMULATION]->cur, cap) +#define MLX5_CAP_DEV_TLP_EMULATION(mdev, cap)\ + MLX5_GET(tlp_dev_emu_capabilities, \ + (mdev)->caps.hca[MLX5_CAP_TLP_EMULATION]->cur, cap) + +#define MLX5_CAP64_DEV_TLP_EMULATION(mdev, cap)\ + MLX5_GET64(tlp_dev_emu_capabilities, \ + (mdev)->caps.hca[MLX5_CAP_TLP_EMULATION]->cur, cap) + #define MLX5_CAP_IPSEC(mdev, cap)\ MLX5_GET(ipsec_cap, (mdev)->caps.hca[MLX5_CAP_IPSEC]->cur, cap) -- cgit v1.2.3 From 90503f9ffee927c3abdc94a4862d13ae71ea9442 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:39 -0800 Subject: powercap: intel_rapl: Use unit conversion macros from units.h Replace hardcoded numeric constants with standard unit conversion macros from linux/units.h for better code clarity and self-documentation. Add MICROJOULE_PER_JOULE and NANOJOULE_PER_JOULE to units.h to support energy unit conversions, following the existing pattern for power units. No functional changes. Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-8-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 19 ++++++++++--------- include/linux/units.h | 3 +++ 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 8dde3b0eb454..380893baf987 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -964,13 +965,13 @@ static int rapl_check_unit_core(struct rapl_domain *rd) } value = (ra.value & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; - rd->energy_unit = (ENERGY_UNIT_SCALE * 1000000) >> value; + rd->energy_unit = (ENERGY_UNIT_SCALE * MICROJOULE_PER_JOULE) >> value; value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; - rd->power_unit = 1000000 >> value; + rd->power_unit = MICROWATT_PER_WATT >> value; value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; - rd->time_unit = 1000000 >> value; + rd->time_unit = USEC_PER_SEC >> value; pr_debug("Core CPU %s:%s energy=%dpJ, time=%dus, power=%duW\n", rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit); @@ -995,10 +996,10 @@ static int rapl_check_unit_atom(struct rapl_domain *rd) rd->energy_unit = ENERGY_UNIT_SCALE * (1ULL << value); value = (ra.value & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; - rd->power_unit = (1ULL << value) * 1000; + rd->power_unit = (1ULL << value) * MILLIWATT_PER_WATT; value = (ra.value & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; - rd->time_unit = 1000000 >> value; + rd->time_unit = USEC_PER_SEC >> value; pr_debug("Atom %s:%s energy=%dpJ, time=%dus, power=%duW\n", rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit); @@ -1169,13 +1170,13 @@ static int rapl_check_unit_tpmi(struct rapl_domain *rd) } value = (ra.value & TPMI_ENERGY_UNIT_MASK) >> TPMI_ENERGY_UNIT_OFFSET; - rd->energy_unit = (ENERGY_UNIT_SCALE * 1000000) >> value; + rd->energy_unit = (ENERGY_UNIT_SCALE * MICROJOULE_PER_JOULE) >> value; value = (ra.value & TPMI_POWER_UNIT_MASK) >> TPMI_POWER_UNIT_OFFSET; - rd->power_unit = 1000000 >> value; + rd->power_unit = MICROWATT_PER_WATT >> value; value = (ra.value & TPMI_TIME_UNIT_MASK) >> TPMI_TIME_UNIT_OFFSET; - rd->time_unit = 1000000 >> value; + rd->time_unit = USEC_PER_SEC >> value; pr_debug("Core CPU %s:%s energy=%dpJ, time=%dus, power=%duW\n", rd->rp->name, rd->name, rd->energy_unit, rd->time_unit, rd->power_unit); @@ -1208,7 +1209,7 @@ static const struct rapl_defaults rapl_defaults_spr_server = { .check_unit = rapl_check_unit_core, .set_floor_freq = set_floor_freq_default, .compute_time_window = rapl_compute_time_window_core, - .psys_domain_energy_unit = 1000000000, + .psys_domain_energy_unit = NANOJOULE_PER_JOULE, .spr_psys_bits = true, }; diff --git a/include/linux/units.h b/include/linux/units.h index 80d57c50b9e3..c6d78988613a 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -57,6 +57,9 @@ #define MICROWATT_PER_MILLIWATT 1000UL #define MICROWATT_PER_WATT 1000000UL +#define MICROJOULE_PER_JOULE 1000000UL +#define NANOJOULE_PER_JOULE 1000000000UL + #define BYTES_PER_KBIT (KILO / BITS_PER_BYTE) #define BYTES_PER_MBIT (MEGA / BITS_PER_BYTE) #define BYTES_PER_GBIT (GIGA / BITS_PER_BYTE) -- cgit v1.2.3 From d7ca7d1488cc916dbf0a6a594abbda81d4eaeee9 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 12 Feb 2026 15:30:40 -0800 Subject: powercap: intel_rapl: Allow interface drivers to configure rapl_defaults RAPL default settings vary across different RAPL interfaces (MSR, TPMI, MMIO). Currently, these defaults are stored in the common RAPL driver, which requires interface-specific handling logic and makes the common layer unnecessarily complex. There is no strong reason for the common code to own these defaults, since they are inherently interface-specific. To prepare for moving default configuration into the individual interface drivers, 1. Move struct rapl_defaults into a shared header so that interface drivers can directly populate their own default settings. 2. Change the @defaults field in struct rapl_if_priv from void * to const struct rapl_defaults * to improve type safety and readability and update the common driver to use the typed defaults structure. 3. Update all internal getter functions and local pointers to use const struct rapl_defaults * to maintain const-correctness. 4. Rename and export the common helper functions (check_unit, set_floor_freq, compute_time_window) so interface drivers may reuse or override them as appropriate. No functional changes. This is a preparatory refactoring to allow interface drivers to supply their own RAPL default settings. Co-developed-by: Zhang Rui Signed-off-by: Zhang Rui Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/20260212233044.329790-9-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 64 ++++++++++++++++-------------------- include/linux/intel_rapl.h | 17 ++++++++-- 2 files changed, 43 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 380893baf987..7c95eb658c16 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -221,20 +221,10 @@ static int get_pl_prim(struct rapl_domain *rd, int pl, enum pl_prims prim) #define power_zone_to_rapl_domain(_zone) \ container_of(_zone, struct rapl_domain, power_zone) -struct rapl_defaults { - u8 floor_freq_reg_addr; - int (*check_unit)(struct rapl_domain *rd); - void (*set_floor_freq)(struct rapl_domain *rd, bool mode); - u64 (*compute_time_window)(struct rapl_domain *rd, u64 val, - bool to_raw); - unsigned int dram_domain_energy_unit; - unsigned int psys_domain_energy_unit; - bool spr_psys_bits; -}; -static struct rapl_defaults *defaults_msr; +static const struct rapl_defaults *defaults_msr; static const struct rapl_defaults defaults_tpmi; -static struct rapl_defaults *get_defaults(struct rapl_package *rp) +static const struct rapl_defaults *get_defaults(struct rapl_package *rp) { return rp->priv->defaults; } @@ -351,7 +341,7 @@ static int find_nr_power_limit(struct rapl_domain *rd) static int set_domain_enable(struct powercap_zone *power_zone, bool mode) { struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); u64 val; int ret; @@ -640,7 +630,7 @@ static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type, u64 value, int to_raw) { u64 units = 1; - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); u64 scale = 1; switch (type) { @@ -785,11 +775,11 @@ static int rapl_config(struct rapl_package *rp) /* MMIO I/F shares the same register layout as MSR registers */ case RAPL_IF_MMIO: case RAPL_IF_MSR: - rp->priv->defaults = (void *)defaults_msr; + rp->priv->defaults = defaults_msr; rp->priv->rpi = (void *)rpi_msr; break; case RAPL_IF_TPMI: - rp->priv->defaults = (void *)&defaults_tpmi; + rp->priv->defaults = &defaults_tpmi; rp->priv->rpi = (void *)rpi_tpmi; break; default: @@ -806,7 +796,7 @@ static int rapl_config(struct rapl_package *rp) static enum rapl_primitives prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim) { - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); if (!defaults->spr_psys_bits) return prim; @@ -951,7 +941,7 @@ static int rapl_write_pl_data(struct rapl_domain *rd, int pl, * power unit : microWatts : Represented in milliWatts by default * time unit : microseconds: Represented in seconds by default */ -static int rapl_check_unit_core(struct rapl_domain *rd) +int rapl_default_check_unit(struct rapl_domain *rd) { struct reg_action ra; u32 value; @@ -978,6 +968,7 @@ static int rapl_check_unit_core(struct rapl_domain *rd) return 0; } +EXPORT_SYMBOL_NS_GPL(rapl_default_check_unit, "INTEL_RAPL"); static int rapl_check_unit_atom(struct rapl_domain *rd) { @@ -1071,7 +1062,7 @@ static void package_power_limit_irq_restore(struct rapl_package *rp) wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); } -static void set_floor_freq_default(struct rapl_domain *rd, bool mode) +void rapl_default_set_floor_freq(struct rapl_domain *rd, bool mode) { int i; @@ -1085,11 +1076,12 @@ static void set_floor_freq_default(struct rapl_domain *rd, bool mode) rapl_write_pl_data(rd, i, PL_CLAMP, mode); } } +EXPORT_SYMBOL_NS_GPL(rapl_default_set_floor_freq, "INTEL_RAPL"); static void set_floor_freq_atom(struct rapl_domain *rd, bool enable) { static u32 power_ctrl_orig_val; - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); u32 mdata; if (!defaults->floor_freq_reg_addr) { @@ -1110,8 +1102,7 @@ static void set_floor_freq_atom(struct rapl_domain *rd, bool enable) defaults->floor_freq_reg_addr, mdata); } -static u64 rapl_compute_time_window_core(struct rapl_domain *rd, u64 value, - bool to_raw) +u64 rapl_default_compute_time_window(struct rapl_domain *rd, u64 value, bool to_raw) { u64 f, y; /* fraction and exp. used for time unit */ @@ -1142,6 +1133,7 @@ static u64 rapl_compute_time_window_core(struct rapl_domain *rd, u64 value, } return value; } +EXPORT_SYMBOL_NS_GPL(rapl_default_compute_time_window, "INTEL_RAPL"); static u64 rapl_compute_time_window_atom(struct rapl_domain *rd, u64 value, bool to_raw) @@ -1187,28 +1179,28 @@ static int rapl_check_unit_tpmi(struct rapl_domain *rd) static const struct rapl_defaults defaults_tpmi = { .check_unit = rapl_check_unit_tpmi, /* Reuse existing logic, ignore the PL_CLAMP failures and enable all Power Limits */ - .set_floor_freq = set_floor_freq_default, - .compute_time_window = rapl_compute_time_window_core, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, }; static const struct rapl_defaults rapl_defaults_core = { .floor_freq_reg_addr = 0, - .check_unit = rapl_check_unit_core, - .set_floor_freq = set_floor_freq_default, - .compute_time_window = rapl_compute_time_window_core, + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, }; static const struct rapl_defaults rapl_defaults_hsw_server = { - .check_unit = rapl_check_unit_core, - .set_floor_freq = set_floor_freq_default, - .compute_time_window = rapl_compute_time_window_core, + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, .dram_domain_energy_unit = 15300, }; static const struct rapl_defaults rapl_defaults_spr_server = { - .check_unit = rapl_check_unit_core, - .set_floor_freq = set_floor_freq_default, - .compute_time_window = rapl_compute_time_window_core, + .check_unit = rapl_default_check_unit, + .set_floor_freq = rapl_default_set_floor_freq, + .compute_time_window = rapl_default_compute_time_window, .psys_domain_energy_unit = NANOJOULE_PER_JOULE, .spr_psys_bits = true, }; @@ -1242,7 +1234,7 @@ static const struct rapl_defaults rapl_defaults_cht = { }; static const struct rapl_defaults rapl_defaults_amd = { - .check_unit = rapl_check_unit_core, + .check_unit = rapl_default_check_unit, }; static const struct x86_cpu_id rapl_ids[] __initconst = { @@ -1448,7 +1440,7 @@ static int rapl_check_domain(int domain, struct rapl_package *rp) */ static int rapl_get_domain_unit(struct rapl_domain *rd) { - struct rapl_defaults *defaults = get_defaults(rd->rp); + const struct rapl_defaults *defaults = get_defaults(rd->rp); int ret; if (!rd->regs[RAPL_DOMAIN_REG_UNIT].val) { @@ -2341,7 +2333,7 @@ static int __init rapl_init(void) id = x86_match_cpu(rapl_ids); if (id) { - defaults_msr = (struct rapl_defaults *)id->driver_data; + defaults_msr = (const struct rapl_defaults *)id->driver_data; rapl_msr_platdev = platform_device_alloc("intel_rapl_msr", 0); if (!rapl_msr_platdev) diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index fa1f328d6712..6d694099a3ad 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -128,6 +128,16 @@ struct reg_action { int err; }; +struct rapl_defaults { + u8 floor_freq_reg_addr; + int (*check_unit)(struct rapl_domain *rd); + void (*set_floor_freq)(struct rapl_domain *rd, bool mode); + u64 (*compute_time_window)(struct rapl_domain *rd, u64 val, bool to_raw); + unsigned int dram_domain_energy_unit; + unsigned int psys_domain_energy_unit; + bool spr_psys_bits; +}; + /** * struct rapl_if_priv: private data for different RAPL interfaces * @control_type: Each RAPL interface must have its own powercap @@ -142,7 +152,7 @@ struct reg_action { * registers. * @write_raw: Callback for writing RAPL interface specific * registers. - * @defaults: internal pointer to interface default settings + * @defaults: pointer to default settings * @rpi: internal pointer to interface primitive info */ struct rapl_if_priv { @@ -154,7 +164,7 @@ struct rapl_if_priv { int limits[RAPL_DOMAIN_MAX]; int (*read_raw)(int id, struct reg_action *ra, bool pmu_ctx); int (*write_raw)(int id, struct reg_action *ra); - void *defaults; + const struct rapl_defaults *defaults; void *rpi; }; @@ -211,6 +221,9 @@ void rapl_remove_package_cpuslocked(struct rapl_package *rp); struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, bool id_is_cpu); struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu); void rapl_remove_package(struct rapl_package *rp); +int rapl_default_check_unit(struct rapl_domain *rd); +void rapl_default_set_floor_freq(struct rapl_domain *rd, bool mode); +u64 rapl_default_compute_time_window(struct rapl_domain *rd, u64 value, bool to_raw); #ifdef CONFIG_PERF_EVENTS int rapl_package_add_pmu(struct rapl_package *rp); -- cgit v1.2.3 From cc39325f927850473d3a84b029ae6f9b508e9bd1 Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Mon, 2 Mar 2026 15:01:45 -0800 Subject: net: ethtool: Track pause storm events With TX pause enabled, if a device is unable to pass packets up to the stack (e.g., CPU is hanged), the device can cause pause storm. Given that devices can have native support to protect the neighbor from such flooding, such events need some tracking. This support is to track TX pause storm events for better observability. Reviewed-by: Oleksij Rempel Signed-off-by: Jakub Kicinski Signed-off-by: Mohsin Bashir Link: https://patch.msgid.link/20260302230149.1580195-2-mohsin.bashr@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ethtool.yaml | 13 +++++++++++++ include/linux/ethtool.h | 2 ++ include/uapi/linux/ethtool_netlink_generated.h | 1 + net/ethtool/pause.c | 4 +++- 4 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 0a2d2343f79a..4707063af3b4 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -879,6 +879,19 @@ attribute-sets: - name: rx-frames type: u64 + - + name: tx-pause-storm-events + type: u64 + doc: >- + TX pause storm event count. Increments each time device + detects that its pause assertion condition has been true + for too long for normal operation. As a result, the device + has temporarily disabled its own Pause TX function to + protect the network from itself. + This counter should never increment under normal overload + conditions; it indicates catastrophic failure like an OS + crash. The rate of incrementing is implementation specific. + - name: pause attr-cnt-name: __ethtool-a-pause-cnt diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 798abec67a1b..83c375840835 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -512,12 +512,14 @@ struct ethtool_eth_ctrl_stats { * * Equivalent to `30.3.4.3 aPAUSEMACCtrlFramesReceived` * from the standard. + * @tx_pause_storm_events: TX pause storm event count (see ethtool.yaml). */ struct ethtool_pause_stats { enum ethtool_mac_stats_src src; struct_group(stats, u64 tx_pause_frames; u64 rx_pause_frames; + u64 tx_pause_storm_events; ); }; diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 556a0c834df5..114b83017297 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -381,6 +381,7 @@ enum { ETHTOOL_A_PAUSE_STAT_PAD, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, ETHTOOL_A_PAUSE_STAT_RX_FRAMES, + ETHTOOL_A_PAUSE_STAT_TX_PAUSE_STORM_EVENTS, __ETHTOOL_A_PAUSE_STAT_CNT, ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index 0f9af1e66548..5d28f642764c 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -130,7 +130,9 @@ static int pause_put_stats(struct sk_buff *skb, if (ethtool_put_stat(skb, pause_stats->tx_pause_frames, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, pad) || ethtool_put_stat(skb, pause_stats->rx_pause_frames, - ETHTOOL_A_PAUSE_STAT_RX_FRAMES, pad)) + ETHTOOL_A_PAUSE_STAT_RX_FRAMES, pad) || + ethtool_put_stat(skb, pause_stats->tx_pause_storm_events, + ETHTOOL_A_PAUSE_STAT_TX_PAUSE_STORM_EVENTS, pad)) goto err_cancel; nla_nest_end(skb, nest); -- cgit v1.2.3 From 31a6a07eefeb4c84bd6730fbe9e95fd9221712cf Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Fri, 13 Feb 2026 09:28:46 +0800 Subject: integrity: Make arch_ima_get_secureboot integrity-wide EVM and other LSMs need the ability to query the secure boot status of the system, without directly calling the IMA arch_ima_get_secureboot function. Refactor the secure boot status check into a general function named arch_get_secureboot. Reported-and-suggested-by: Mimi Zohar Suggested-by: Roberto Sassu Signed-off-by: Coiby Xu Acked-by: Ard Biesheuvel Signed-off-by: Mimi Zohar --- MAINTAINERS | 1 + arch/powerpc/kernel/ima_arch.c | 5 --- arch/powerpc/kernel/secure_boot.c | 6 +++ arch/s390/kernel/ima_arch.c | 6 --- arch/s390/kernel/ipl.c | 5 +++ arch/x86/include/asm/efi.h | 4 +- arch/x86/platform/efi/efi.c | 2 +- include/linux/ima.h | 7 +--- include/linux/secure_boot.h | 19 +++++++++ security/integrity/Makefile | 3 +- security/integrity/efi_secureboot.c | 56 +++++++++++++++++++++++++++ security/integrity/ima/ima_appraise.c | 2 +- security/integrity/ima/ima_efi.c | 47 +--------------------- security/integrity/ima/ima_main.c | 3 +- security/integrity/integrity.h | 1 + security/integrity/platform_certs/load_uefi.c | 2 +- security/integrity/secure_boot.c | 16 ++++++++ 17 files changed, 115 insertions(+), 70 deletions(-) create mode 100644 include/linux/secure_boot.h create mode 100644 security/integrity/efi_secureboot.c create mode 100644 security/integrity/secure_boot.c (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 61bf550fd37c..04823afa8b74 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12668,6 +12668,7 @@ R: Eric Snowberg L: linux-integrity@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/zohar/linux-integrity.git +F: include/linux/secure_boot.h F: security/integrity/ F: security/integrity/ima/ diff --git a/arch/powerpc/kernel/ima_arch.c b/arch/powerpc/kernel/ima_arch.c index b7029beed847..0d8892a03526 100644 --- a/arch/powerpc/kernel/ima_arch.c +++ b/arch/powerpc/kernel/ima_arch.c @@ -7,11 +7,6 @@ #include #include -bool arch_ima_get_secureboot(void) -{ - return is_ppc_secureboot_enabled(); -} - /* * The "secure_rules" are enabled only on "secureboot" enabled systems. * These rules verify the file signatures against known good values. diff --git a/arch/powerpc/kernel/secure_boot.c b/arch/powerpc/kernel/secure_boot.c index 3a28795b4ed8..28436c1599e0 100644 --- a/arch/powerpc/kernel/secure_boot.c +++ b/arch/powerpc/kernel/secure_boot.c @@ -5,6 +5,7 @@ */ #include #include +#include #include #include @@ -44,6 +45,11 @@ out: return enabled; } +bool arch_get_secureboot(void) +{ + return is_ppc_secureboot_enabled(); +} + bool is_ppc_trustedboot_enabled(void) { struct device_node *node; diff --git a/arch/s390/kernel/ima_arch.c b/arch/s390/kernel/ima_arch.c index f3c3e6e1c5d3..6ccbe34ce408 100644 --- a/arch/s390/kernel/ima_arch.c +++ b/arch/s390/kernel/ima_arch.c @@ -1,12 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include - -bool arch_ima_get_secureboot(void) -{ - return ipl_secure_flag; -} const char * const *arch_get_ima_policy(void) { diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 049c557c452f..bdbbedf52580 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -2504,6 +2504,11 @@ out: return buf; } +bool arch_get_secureboot(void) +{ + return ipl_secure_flag; +} + int ipl_report_free(struct ipl_report *report) { struct ipl_report_component *comp, *ncomp; diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index f227a70ac91f..ee382b56dd7b 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -401,9 +401,9 @@ extern int __init efi_memmap_split_count(efi_memory_desc_t *md, extern void __init efi_memmap_insert(struct efi_memory_map *old_memmap, void *buf, struct efi_mem_range *mem); -extern enum efi_secureboot_mode __x86_ima_efi_boot_mode(void); +enum efi_secureboot_mode __x86_efi_boot_mode(void); -#define arch_ima_efi_boot_mode __x86_ima_efi_boot_mode() +#define arch_efi_boot_mode __x86_efi_boot_mode() #ifdef CONFIG_EFI_RUNTIME_MAP int efi_get_runtime_map_size(void); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index d00c6de7f3b7..74032f3ab9b0 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -920,7 +920,7 @@ umode_t efi_attr_is_visible(struct kobject *kobj, struct attribute *attr, int n) return attr->mode; } -enum efi_secureboot_mode __x86_ima_efi_boot_mode(void) +enum efi_secureboot_mode __x86_efi_boot_mode(void) { return boot_params.secure_boot; } diff --git a/include/linux/ima.h b/include/linux/ima.h index abf8923f8fc5..8e08baf16c2f 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -11,6 +11,7 @@ #include #include #include +#include #include struct linux_binprm; @@ -73,14 +74,8 @@ int ima_validate_range(phys_addr_t phys, size_t size); #endif #ifdef CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT -extern bool arch_ima_get_secureboot(void); extern const char * const *arch_get_ima_policy(void); #else -static inline bool arch_ima_get_secureboot(void) -{ - return false; -} - static inline const char * const *arch_get_ima_policy(void) { return NULL; diff --git a/include/linux/secure_boot.h b/include/linux/secure_boot.h new file mode 100644 index 000000000000..3ded3f03655c --- /dev/null +++ b/include/linux/secure_boot.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2026 Red Hat, Inc. All Rights Reserved. + * + * Author: Coiby Xu + */ + +#ifndef _LINUX_SECURE_BOOT_H +#define _LINUX_SECURE_BOOT_H + +#include + +/* + * Returns true if the platform secure boot is enabled. + * Returns false if disabled or not supported. + */ +bool arch_get_secureboot(void); + +#endif /* _LINUX_SECURE_BOOT_H */ diff --git a/security/integrity/Makefile b/security/integrity/Makefile index 92b63039c654..548665e2b702 100644 --- a/security/integrity/Makefile +++ b/security/integrity/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_INTEGRITY) += integrity.o -integrity-y := iint.o +integrity-y := iint.o secure_boot.o integrity-$(CONFIG_INTEGRITY_AUDIT) += integrity_audit.o integrity-$(CONFIG_INTEGRITY_SIGNATURE) += digsig.o integrity-$(CONFIG_INTEGRITY_ASYMMETRIC_KEYS) += digsig_asymmetric.o @@ -18,6 +18,7 @@ integrity-$(CONFIG_LOAD_IPL_KEYS) += platform_certs/load_ipl_s390.o integrity-$(CONFIG_LOAD_PPC_KEYS) += platform_certs/efi_parser.o \ platform_certs/load_powerpc.o \ platform_certs/keyring_handler.o +integrity-$(CONFIG_EFI) += efi_secureboot.o # The relative order of the 'ima' and 'evm' LSMs depends on the order below. obj-$(CONFIG_IMA) += ima/ obj-$(CONFIG_EVM) += evm/ diff --git a/security/integrity/efi_secureboot.c b/security/integrity/efi_secureboot.c new file mode 100644 index 000000000000..bfd4260a83a3 --- /dev/null +++ b/security/integrity/efi_secureboot.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-1.0+ +/* + * Copyright (C) 2018 IBM Corporation + */ +#include +#include +#include + +#ifndef arch_efi_boot_mode +#define arch_efi_boot_mode efi_secureboot_mode_unset +#endif + +static enum efi_secureboot_mode get_sb_mode(void) +{ + enum efi_secureboot_mode mode; + + if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) { + pr_info("integrity: secureboot mode unknown, no efi\n"); + return efi_secureboot_mode_unknown; + } + + mode = efi_get_secureboot_mode(efi.get_variable); + if (mode == efi_secureboot_mode_disabled) + pr_info("integrity: secureboot mode disabled\n"); + else if (mode == efi_secureboot_mode_unknown) + pr_info("integrity: secureboot mode unknown\n"); + else + pr_info("integrity: secureboot mode enabled\n"); + return mode; +} + +/* + * Query secure boot status + * + * Note don't call this function too early e.g. in __setup hook otherwise the + * kernel may hang when calling efi_get_secureboot_mode. + * + */ +bool arch_get_secureboot(void) +{ + static enum efi_secureboot_mode sb_mode; + static bool initialized; + + if (!initialized && efi_enabled(EFI_BOOT)) { + sb_mode = arch_efi_boot_mode; + + if (sb_mode == efi_secureboot_mode_unset) + sb_mode = get_sb_mode(); + initialized = true; + } + + if (sb_mode == efi_secureboot_mode_enabled) + return true; + else + return false; +} diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index 16c20c578ea8..ee2e0891febc 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -27,7 +27,7 @@ core_param(ima_appraise, ima_appraise_cmdline_default, charp, 0); void __init ima_appraise_parse_cmdline(void) { const char *str = ima_appraise_cmdline_default; - bool sb_state = arch_ima_get_secureboot(); + bool sb_state = arch_get_secureboot(); int appraisal_state = ima_appraise; if (!str) diff --git a/security/integrity/ima/ima_efi.c b/security/integrity/ima/ima_efi.c index 138029bfcce1..78191879dd98 100644 --- a/security/integrity/ima/ima_efi.c +++ b/security/integrity/ima/ima_efi.c @@ -2,52 +2,9 @@ /* * Copyright (C) 2018 IBM Corporation */ -#include #include #include -#include - -#ifndef arch_ima_efi_boot_mode -#define arch_ima_efi_boot_mode efi_secureboot_mode_unset -#endif - -static enum efi_secureboot_mode get_sb_mode(void) -{ - enum efi_secureboot_mode mode; - - if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) { - pr_info("ima: secureboot mode unknown, no efi\n"); - return efi_secureboot_mode_unknown; - } - - mode = efi_get_secureboot_mode(efi.get_variable); - if (mode == efi_secureboot_mode_disabled) - pr_info("ima: secureboot mode disabled\n"); - else if (mode == efi_secureboot_mode_unknown) - pr_info("ima: secureboot mode unknown\n"); - else - pr_info("ima: secureboot mode enabled\n"); - return mode; -} - -bool arch_ima_get_secureboot(void) -{ - static enum efi_secureboot_mode sb_mode; - static bool initialized; - - if (!initialized && efi_enabled(EFI_BOOT)) { - sb_mode = arch_ima_efi_boot_mode; - - if (sb_mode == efi_secureboot_mode_unset) - sb_mode = get_sb_mode(); - initialized = true; - } - - if (sb_mode == efi_secureboot_mode_enabled) - return true; - else - return false; -} +#include /* secureboot arch rules */ static const char * const sb_arch_rules[] = { @@ -67,7 +24,7 @@ static const char * const sb_arch_rules[] = { const char * const *arch_get_ima_policy(void) { - if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot()) { + if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_get_secureboot()) { if (IS_ENABLED(CONFIG_MODULE_SIG)) set_module_sig_enforced(); if (IS_ENABLED(CONFIG_KEXEC_SIG)) diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 1d6229b156fb..5808b52c8426 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -953,8 +953,7 @@ static int ima_load_data(enum kernel_load_data_id id, bool contents) switch (id) { case LOADING_KEXEC_IMAGE: - if (IS_ENABLED(CONFIG_KEXEC_SIG) - && arch_ima_get_secureboot()) { + if (IS_ENABLED(CONFIG_KEXEC_SIG) && arch_get_secureboot()) { pr_err("impossible to appraise a kernel image without a file descriptor; try using kexec_file_load syscall.\n"); return -EACCES; } diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h index 7b388b66cf80..4636629533af 100644 --- a/security/integrity/integrity.h +++ b/security/integrity/integrity.h @@ -14,6 +14,7 @@ #include #include +#include #include #include #include diff --git a/security/integrity/platform_certs/load_uefi.c b/security/integrity/platform_certs/load_uefi.c index d1fdd113450a..c0d6948446c3 100644 --- a/security/integrity/platform_certs/load_uefi.c +++ b/security/integrity/platform_certs/load_uefi.c @@ -212,7 +212,7 @@ static int __init load_uefi_certs(void) } /* the MOK/MOKx can not be trusted when secure boot is disabled */ - if (!arch_ima_get_secureboot()) + if (!arch_get_secureboot()) return 0; mokx = get_cert_list(L"MokListXRT", &mok_var, &mokxsize, &status); diff --git a/security/integrity/secure_boot.c b/security/integrity/secure_boot.c new file mode 100644 index 000000000000..fc2693c286f8 --- /dev/null +++ b/security/integrity/secure_boot.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2026 Red Hat, Inc. All Rights Reserved. + * + * Author: Coiby Xu + */ +#include + +/* + * Default weak implementation. + * Architectures that support secure boot must override this. + */ +__weak bool arch_get_secureboot(void) +{ + return false; +} -- cgit v1.2.3 From 0ec959cf4b5a609d7f27bf84064ef5372e30ab80 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Tue, 30 Sep 2025 10:26:56 +0800 Subject: evm: fix security.evm for a file with IMA signature When both IMA and EVM fix modes are enabled, accessing a file with IMA signature but missing EVM HMAC won't cause security.evm to be fixed. Add a function evm_fix_hmac which will be explicitly called to fix EVM HMAC for this case. Suggested-by: Mimi Zohar Signed-off-by: Coiby Xu Signed-off-by: Mimi Zohar --- include/linux/evm.h | 8 ++++++++ security/integrity/evm/evm_main.c | 28 ++++++++++++++++++++++++++++ security/integrity/ima/ima_appraise.c | 5 +++++ 3 files changed, 41 insertions(+) (limited to 'include/linux') diff --git a/include/linux/evm.h b/include/linux/evm.h index ddece4a6b25d..913f4573b203 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -18,6 +18,8 @@ extern enum integrity_status evm_verifyxattr(struct dentry *dentry, const char *xattr_name, void *xattr_value, size_t xattr_value_len); +int evm_fix_hmac(struct dentry *dentry, const char *xattr_name, + const char *xattr_value, size_t xattr_value_len); int evm_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, struct xattr *xattrs, int *xattr_count); @@ -51,6 +53,12 @@ static inline enum integrity_status evm_verifyxattr(struct dentry *dentry, { return INTEGRITY_UNKNOWN; } + +static inline int evm_fix_hmac(struct dentry *dentry, const char *xattr_name, + const char *xattr_value, size_t xattr_value_len) +{ + return -EOPNOTSUPP; +} #endif static inline int evm_inode_init_security(struct inode *inode, struct inode *dir, diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index cfc3531cf53f..1b0089b4b796 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -795,6 +795,34 @@ bool evm_revalidate_status(const char *xattr_name) return true; } +/** + * evm_fix_hmac - Calculate the HMAC and add it to security.evm for fix mode + * @dentry: pointer to the affected dentry which doesn't yet have security.evm + * xattr + * @xattr_name: pointer to the affected extended attribute name + * @xattr_value: pointer to the new extended attribute value + * @xattr_value_len: pointer to the new extended attribute value length + * + * Expects to be called with i_mutex locked. + * + * Return: 0 on success, -EPERM/-ENOMEM/-EOPNOTSUPP on failure + */ +int evm_fix_hmac(struct dentry *dentry, const char *xattr_name, + const char *xattr_value, size_t xattr_value_len) + +{ + if (!evm_fixmode || !evm_revalidate_status((xattr_name))) + return -EPERM; + + if (!(evm_initialized & EVM_INIT_HMAC)) + return -EPERM; + + if (is_unsupported_hmac_fs(dentry)) + return -EOPNOTSUPP; + + return evm_update_evmxattr(dentry, xattr_name, xattr_value, xattr_value_len); +} + /** * evm_inode_post_setxattr - update 'security.evm' to reflect the changes * @dentry: pointer to the affected dentry diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index ee2e0891febc..0d41d102626a 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -591,6 +591,11 @@ out: xattr_value->type != EVM_IMA_XATTR_DIGSIG)) { if (!ima_fix_xattr(dentry, iint)) status = INTEGRITY_PASS; + } else if (status == INTEGRITY_NOLABEL) { + if (!evm_fix_hmac(dentry, XATTR_NAME_IMA, + (const char *)xattr_value, + xattr_len)) + status = INTEGRITY_PASS; } /* -- cgit v1.2.3 From fab0c75d500fd23de6ea1b30e44635418a6dae65 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 4 Mar 2026 10:10:22 -0800 Subject: x86/cpu: Add platform ID to CPU matching structure The existing x86_match_cpu() infrastructure can be used to match a bunch of attributes of a CPU: vendor, family, model, steppings and CPU features. But, there's one more attribute that's missing and unable to be matched against: the platform ID, enumerated on Intel CPUs in MSR_IA32_PLATFORM_ID. It is a little more obscure and is only queried during microcode loading. This is because Intel sometimes has CPUs with identical family/model/stepping but which need different microcode. These CPUs are differentiated with the platform ID. Add a field in 'struct x86_cpu_id' for the platform ID. Similar to the stepping field, make the new field a mask of platform IDs. Some examples: 0x01: matches only platform ID 0x0 0x02: matches only platform ID 0x1 0x03: matches platform IDs 0x0 or 0x1 0x80: matches only platform ID 0x7 0xff: matches all 8 possible platform IDs Since the mask is only a byte wide, it nestles in next to another u8 and does not even increase the size of 'struct x86_cpu_id'. Reserve the all 0's value as the wildcard (X86_PLATFORM_ANY). This avoids forcing changes to existing 'struct x86_cpu_id' users. They can just continue to fill the field with 0's and their matching will work exactly as before. Note: If someone is ever looking for space in 'struct x86_cpu_id', this new field could probably get stuck over in ->driver_data for the one user that there is. Signed-off-by: Dave Hansen Reviewed-by: Sohil Mehta Link: https://patch.msgid.link/20260304181022.058DF07C@davehans-spike.ostc.intel.com --- arch/x86/kernel/cpu/match.c | 3 +++ include/linux/mod_devicetable.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index 6af1e8baeb0f..4604802692da 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -76,6 +76,9 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) if (m->steppings != X86_STEPPING_ANY && !(BIT(c->x86_stepping) & m->steppings)) continue; + if (m->platform_mask != X86_PLATFORM_ANY && + !(BIT(c->intel_platform_id) & m->platform_mask)) + continue; if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) continue; if (!x86_match_vendor_cpu_type(c, m)) diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 5b1725fe9707..23ff24080dfd 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -691,6 +691,7 @@ struct x86_cpu_id { __u16 feature; /* bit index */ /* Solely for kernel-internal use: DO NOT EXPORT to userspace! */ __u16 flags; + __u8 platform_mask; __u8 type; kernel_ulong_t driver_data; }; @@ -702,6 +703,7 @@ struct x86_cpu_id { #define X86_STEPPING_ANY 0 #define X86_STEP_MIN 0 #define X86_STEP_MAX 0xf +#define X86_PLATFORM_ANY 0x0 #define X86_FEATURE_ANY 0 /* Same as FPU, you can't test for that */ #define X86_CPU_TYPE_ANY 0 -- cgit v1.2.3 From de70eef32e10883fe74f6df635c616785b24b867 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 25 Feb 2026 21:13:09 -0800 Subject: ARM: omap: fix all kernel-doc warnings Use the correct struct member names to avoid kernel-doc warnings: Warning: include/linux/platform_data/voltage-omap.h:27 struct member 'volt_nominal' not described in 'omap_volt_data' Warning: include/linux/platform_data/voltage-omap.h:27 struct member 'vp_errgain' not described in 'omap_volt_data' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260226051309.556228-1-rdunlap@infradead.org Signed-off-by: Kevin Hilman --- include/linux/platform_data/voltage-omap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/voltage-omap.h b/include/linux/platform_data/voltage-omap.h index 6d74e507dbd2..2b48f2b0135d 100644 --- a/include/linux/platform_data/voltage-omap.h +++ b/include/linux/platform_data/voltage-omap.h @@ -10,14 +10,14 @@ /** * struct omap_volt_data - Omap voltage specific data. - * @voltage_nominal: The possible voltage value in uV + * @volt_nominal: The possible voltage value in uV * @sr_efuse_offs: The offset of the efuse register(from system * control module base address) from where to read * the n-target value for the smartreflex module. * @sr_errminlimit: Error min limit value for smartreflex. This value * differs at differnet opp and thus is linked * with voltage. - * @vp_errorgain: Error gain value for the voltage processor. This + * @vp_errgain: Error gain value for the voltage processor. This * field also differs according to the voltage/opp. */ struct omap_volt_data { -- cgit v1.2.3 From d4d8c6e6fd2a1c5144339884ca5f66e654ad54a5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 3 Mar 2026 23:54:16 +0000 Subject: tcp: Initialise ehash secrets during connect() and listen(). inet_ehashfn() and inet6_ehashfn() initialise random secrets on the first call by net_get_random_once(). While the init part is patched out using static keys, with CONFIG_STACKPROTECTOR_STRONG=y, this causes a compiler to generate a stack canary due to an automatic variable, unsigned long ___flags, in the DO_ONCE() macro being passed to __do_once_start(). With FDO, this is visible in __inet_lookup_established() and __inet6_lookup_established() too. Let's initialise the secrets by get_random_sleepable_once() in the slow paths: inet_hash() for listen(), and inet_hash_connect() and inet6_hash_connect() for connect(). Note that IPv6 listener will initialise both IPv4 & IPv6 secrets in inet_hash() for IPv4-mapped IPv6 address. With the patch, the stack size is reduced by 16 bytes (___flags + a stack canary) and NOPs for the static key go away. Before: __inet6_lookup_established() ... push %rbx sub $0x38,%rsp # stack is 56 bytes mov %edx,%ebx # sport mov %gs:0x299419f(%rip),%rax # load stack canary mov %rax,0x30(%rsp) and store it onto stack mov 0x440(%rdi),%r15 # net->ipv4.tcp_death_row.hashinfo nop 32: mov %r8d,%ebp # hnum shl $0x10,%ebp # hnum << 16 nop 3d: mov 0x70(%rsp),%r14d # sdif or %ebx,%ebp # INET_COMBINED_PORTS(sport, hnum) mov 0x11a8382(%rip),%eax # inet6_ehashfn() ... After: __inet6_lookup_established() ... push %rbx sub $0x28,%rsp # stack is 40 bytes mov 0x60(%rsp),%ebp # sdif mov %r8d,%r14d # hnum shl $0x10,%r14d # hnum << 16 or %edx,%r14d # INET_COMBINED_PORTS(sport, hnum) mov 0x440(%rdi),%rax # net->ipv4.tcp_death_row.hashinfo mov 0x1194f09(%rip),%r10d # inet6_ehashfn() ... Suggested-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260303235424.3877267-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/net.h | 2 ++ include/net/inet6_hashtables.h | 2 ++ net/ipv4/inet_hashtables.c | 17 +++++++++++++++-- net/ipv6/inet6_hashtables.c | 13 ++++++++++--- 4 files changed, 29 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index f58b38ab37f8..a8e818de95b3 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -304,6 +304,8 @@ do { \ #define net_get_random_once(buf, nbytes) \ get_random_once((buf), (nbytes)) +#define net_get_random_sleepable_once(buf, nbytes) \ + get_random_sleepable_once((buf), (nbytes)) /* * E.g. XFS meta- & log-data is in slab pages, or bcache meta diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index c16de5b7963f..2cc5d416bbb5 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -24,6 +24,8 @@ struct inet_hashinfo; +void inet6_init_ehash_secret(void); + static inline unsigned int __inet6_ehashfn(const u32 lhash, const u16 lport, const u32 fhash, diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 61e654b395be..ac7b67c603b5 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -30,12 +30,16 @@ #include #include +static void inet_init_ehash_secret(void) +{ + net_get_random_sleepable_once(&inet_ehash_secret, + sizeof(inet_ehash_secret)); +} + u32 inet_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport) { - net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); - return lport + __inet_ehashfn(laddr, 0, faddr, fport, inet_ehash_secret + net_hash_mix(net)); } @@ -793,6 +797,13 @@ int inet_hash(struct sock *sk) local_bh_enable(); return 0; } + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + inet6_init_ehash_secret(); +#endif + inet_init_ehash_secret(); + WARN_ON(!sk_unhashed(sk)); ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); @@ -1239,6 +1250,8 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, if (!inet_sk(sk)->inet_num) port_offset = inet_sk_port_offset(sk); + inet_init_ehash_secret(); + hash_port0 = inet_ehashfn(net, inet->inet_rcv_saddr, 0, inet->inet_daddr, inet->inet_dport); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 182d38e6d6d8..72bc68fef48a 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -23,15 +23,20 @@ #include #include +void inet6_init_ehash_secret(void) +{ + net_get_random_sleepable_once(&inet6_ehash_secret, + sizeof(inet6_ehash_secret)); + net_get_random_sleepable_once(&tcp_ipv6_hash_secret, + sizeof(tcp_ipv6_hash_secret)); +} + u32 inet6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, const struct in6_addr *faddr, const __be16 fport) { u32 lhash, fhash; - net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret)); - net_get_random_once(&tcp_ipv6_hash_secret, sizeof(tcp_ipv6_hash_secret)); - lhash = (__force u32)laddr->s6_addr32[3]; fhash = __ipv6_addr_jhash(faddr, tcp_ipv6_hash_secret); @@ -363,6 +368,8 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row, if (!inet_sk(sk)->inet_num) port_offset = inet6_sk_port_offset(sk); + inet6_init_ehash_secret(); + hash_port0 = inet6_ehashfn(net, daddr, 0, saddr, inet->inet_dport); return __inet_hash_connect(death_row, sk, port_offset, hash_port0, -- cgit v1.2.3 From 5b30afc20b3fea29b9beb83c6415c4ff06f774aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 4 Mar 2026 11:26:47 -1000 Subject: cgroup: Expose some cgroup helpers Expose the following through cgroup.h: - cgroup_on_dfl() - cgroup_is_dead() - cgroup_for_each_live_child() - cgroup_for_each_live_descendant_pre() - cgroup_for_each_live_descendant_post() Until now, these didn't need to be exposed because controllers only cared about the css hierarchy. The planned sched_ext hierarchical scheduler support will be based on the default cgroup hierarchy, which is in line with the existing BPF cgroup support, and thus needs these exposed. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 65 +++++++++++++++++++++++++++++++++++++++-- kernel/cgroup/cgroup-internal.h | 6 ---- kernel/cgroup/cgroup.c | 55 ---------------------------------- 3 files changed, 63 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index bc892e3b37ee..e52160e85af4 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -42,6 +42,14 @@ struct kernel_clone_args; #ifdef CONFIG_CGROUPS +/* + * To avoid confusing the compiler (and generating warnings) with code + * that attempts to access what would be a 0-element array (i.e. sized + * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this + * constant expression can be added. + */ +#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) + enum css_task_iter_flags { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ @@ -76,6 +84,7 @@ enum cgroup_lifetime_events { extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; +extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; extern struct blocking_notifier_head cgroup_lifetime_notifier; @@ -103,6 +112,8 @@ extern struct blocking_notifier_head cgroup_lifetime_notifier; #define cgroup_subsys_on_dfl(ss) \ static_branch_likely(&ss ## _on_dfl_key) +bool cgroup_on_dfl(const struct cgroup *cgrp); + bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, @@ -274,6 +285,32 @@ void css_task_iter_end(struct css_task_iter *it); for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) +/* iterate over child cgrps, lock should be held throughout iteration */ +#define cgroup_for_each_live_child(child, cgrp) \ + list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + cgroup_is_dead(child); })) \ + ; \ + else + +/* walk live descendants in pre order */ +#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ + css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + +/* walk live descendants in postorder */ +#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ + css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor @@ -336,6 +373,27 @@ static inline u64 cgroup_id(const struct cgroup *cgrp) return cgrp->kn->id; } +/** + * cgroup_css - obtain a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @ss: the subsystem of interest (%NULL returns @cgrp->self) + * + * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This + * function must be called either under cgroup_mutex or rcu_read_lock() and + * the caller is responsible for pinning the returned css if it wants to + * keep accessing it outside the said locks. This function may return + * %NULL if @cgrp doesn't have @subsys_id enabled. + */ +static inline struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + if (CGROUP_HAS_SUBSYS_CONFIG && ss) + return rcu_dereference_check(cgrp->subsys[ss->id], + lockdep_is_held(&cgroup_mutex)); + else + return &cgrp->self; +} + /** * css_is_dying - test whether the specified css is dying * @css: target css @@ -372,6 +430,11 @@ static inline bool css_is_self(struct cgroup_subsys_state *css) return false; } +static inline bool cgroup_is_dead(const struct cgroup *cgrp) +{ + return !(cgrp->self.flags & CSS_ONLINE); +} + static inline void cgroup_get(struct cgroup *cgrp) { css_get(&cgrp->self); @@ -387,8 +450,6 @@ static inline void cgroup_put(struct cgroup *cgrp) css_put(&cgrp->self); } -extern struct mutex cgroup_mutex; - static inline void cgroup_lock(void) { mutex_lock(&cgroup_mutex); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 3bfe37693d68..58797123b752 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -184,11 +184,6 @@ extern bool cgrp_dfl_visible; for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) -static inline bool cgroup_is_dead(const struct cgroup *cgrp) -{ - return !(cgrp->self.flags & CSS_ONLINE); -} - static inline bool notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -222,7 +217,6 @@ static inline void get_css_set(struct css_set *cset) } bool cgroup_ssid_enabled(int ssid); -bool cgroup_on_dfl(const struct cgroup *cgrp); struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root); struct cgroup *task_cgroup_from_root(struct task_struct *task, diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index be1d71dda317..cdc63be63f2c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -68,14 +68,6 @@ /* let's not notify more than 100 times per second */ #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) -/* - * To avoid confusing the compiler (and generating warnings) with code - * that attempts to access what would be a 0-element array (i.e. sized - * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this - * constant expression can be added. - */ -#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0) - /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. @@ -509,27 +501,6 @@ static u32 cgroup_ss_mask(struct cgroup *cgrp) return cgrp->root->subsys_mask; } -/** - * cgroup_css - obtain a cgroup's css for the specified subsystem - * @cgrp: the cgroup of interest - * @ss: the subsystem of interest (%NULL returns @cgrp->self) - * - * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This - * function must be called either under cgroup_mutex or rcu_read_lock() and - * the caller is responsible for pinning the returned css if it wants to - * keep accessing it outside the said locks. This function may return - * %NULL if @cgrp doesn't have @subsys_id enabled. - */ -static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, - struct cgroup_subsys *ss) -{ - if (CGROUP_HAS_SUBSYS_CONFIG && ss) - return rcu_dereference_check(cgrp->subsys[ss->id], - lockdep_is_held(&cgroup_mutex)); - else - return &cgrp->self; -} - /** * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss * @cgrp: the cgroup of interest @@ -741,32 +712,6 @@ EXPORT_SYMBOL_GPL(of_css); } \ } while (false) -/* iterate over child cgrps, lock should be held throughout iteration */ -#define cgroup_for_each_live_child(child, cgrp) \ - list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - cgroup_is_dead(child); })) \ - ; \ - else - -/* walk live descendants in pre order */ -#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ - css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - (dsct) = (d_css)->cgroup; \ - cgroup_is_dead(dsct); })) \ - ; \ - else - -/* walk live descendants in postorder */ -#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ - css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - (dsct) = (d_css)->cgroup; \ - cgroup_is_dead(dsct); })) \ - ; \ - else - /* * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state -- cgit v1.2.3 From 336faf5d9115ca6982b82cf122e68738ea8c4173 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 25 Feb 2026 09:16:53 +1100 Subject: VFS: make lookup_one_qstr_excl() static. lookup_one_qstr_excl() is no longer used outside of namei.c, so make it static. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20260224222542.3458677-9-neilb@ownmail.net Signed-off-by: Christian Brauner --- Documentation/filesystems/porting.rst | 7 +++++++ fs/namei.c | 5 ++--- include/linux/namei.h | 3 --- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 52ff1d19405b..1dd31ab417a2 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1361,3 +1361,10 @@ to match what strlen() would return if it was ran on the string. However, if the string is freely accessible for the duration of inode's lifetime, consider using inode_set_cached_link() instead. + +--- + +**mandatory** + +lookup_one_qstr_excl() is no longer exported - use start_creating() or +similar. diff --git a/fs/namei.c b/fs/namei.c index 11c9a4a6c396..a5daa62399d7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1782,8 +1782,8 @@ static struct dentry *lookup_dcache(const struct qstr *name, * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. */ -struct dentry *lookup_one_qstr_excl(const struct qstr *name, - struct dentry *base, unsigned int flags) +static struct dentry *lookup_one_qstr_excl(const struct qstr *name, + struct dentry *base, unsigned int flags) { struct dentry *dentry; struct dentry *old; @@ -1820,7 +1820,6 @@ found: } return dentry; } -EXPORT_SYMBOL(lookup_one_qstr_excl); /** * lookup_fast - do fast lockless (but racy) lookup of a dentry diff --git a/include/linux/namei.h b/include/linux/namei.h index 58600cf234bc..c7a7288cdd25 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -54,9 +54,6 @@ extern int path_pts(struct path *path); extern int user_path_at(int, const char __user *, unsigned, struct path *); -struct dentry *lookup_one_qstr_excl(const struct qstr *name, - struct dentry *base, - unsigned int flags); extern int kern_path(const char *, unsigned, struct path *); struct dentry *kern_path_parent(const char *name, struct path *parent); -- cgit v1.2.3 From 08e6183ed2568e733e05e7e1c9de737d91c21155 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 26 Feb 2026 18:36:07 +0100 Subject: wifi: move action code from per-type frame structs The action code actually serves to identify the type of action frame, so it really isn't part of the per-type structure. Pull it out and have it in the general action frame format. In theory, whether or not the action code is present in this way is up to each category, but all categories that are defined right now all have that value. While at it, and since this change requires changing all users, remove the 'u' and make it an anonymous union in this case, so that all code using this changes. Change IEEE80211_MIN_ACTION_SIZE to take an argument which says how much of the frame is needed, e.g. category, action_code or the specific frame type that's defined in the union. Again this also ensures that all code is updated. In some cases, fix bugs where the SKB length was checked after having accessed beyond the checked length, in particular in FTM code, e.g. ieee80211_is_ftm(). Link: https://patch.msgid.link/20260226183607.67e71846b59e.I9a24328e3ffcaae179466a935f1c3345029f9961@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath11k/mac.c | 4 +- drivers/net/wireless/ath/ath12k/mac.c | 4 +- drivers/net/wireless/ath/ath12k/wifi7/hw.c | 2 +- drivers/net/wireless/intel/iwlwifi/mld/time_sync.c | 6 +- .../net/wireless/intel/iwlwifi/mvm/ftm-initiator.c | 7 +- drivers/net/wireless/intel/iwlwifi/mvm/time-sync.c | 6 +- drivers/net/wireless/marvell/mwifiex/tdls.c | 12 +- drivers/net/wireless/marvell/mwl8k.c | 4 +- .../net/wireless/mediatek/mt76/mt76_connac_mac.c | 6 +- drivers/net/wireless/mediatek/mt76/mt7925/mac.c | 4 +- drivers/net/wireless/mediatek/mt76/mt7996/mac.c | 4 +- drivers/net/wireless/realtek/rtl8xxxu/core.c | 14 +-- drivers/net/wireless/realtek/rtlwifi/base.c | 28 ++--- drivers/net/wireless/realtek/rtlwifi/pci.c | 2 +- drivers/net/wireless/silabs/wfx/data_rx.c | 8 +- include/linux/ieee80211.h | 83 +++++--------- net/mac80211/agg-rx.c | 27 ++--- net/mac80211/agg-tx.c | 28 ++--- net/mac80211/eht.c | 21 ++-- net/mac80211/ht.c | 31 +++--- net/mac80211/ibss.c | 18 +-- net/mac80211/iface.c | 18 +-- net/mac80211/mesh.c | 14 +-- net/mac80211/mesh_hwmp.c | 20 ++-- net/mac80211/mesh_plink.c | 21 ++-- net/mac80211/mlme.c | 82 ++++++-------- net/mac80211/rx.c | 123 +++++++++------------ net/mac80211/s1g.c | 28 ++--- net/mac80211/spectmgmt.c | 31 +++--- net/mac80211/tdls.c | 29 ++--- net/mac80211/util.c | 5 +- net/mac80211/vht.c | 10 +- 32 files changed, 308 insertions(+), 392 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index d1c121e943cb..a48b6bf1f29a 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -6288,10 +6288,10 @@ static int ath11k_mac_mgmt_action_frame_fill_elem_data(struct ath11k_vif *arvif, lockdep_assert_held(&ar->conf_mutex); /* make sure category field is present */ - if (skb->len < IEEE80211_MIN_ACTION_SIZE) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(category)) return -EINVAL; - remaining_len = skb->len - IEEE80211_MIN_ACTION_SIZE; + remaining_len = skb->len - IEEE80211_MIN_ACTION_SIZE(category); has_protected = ieee80211_has_protected(hdr->frame_control); /* In case of SW crypto and hdr protected (PMF), packet will already be encrypted, diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index af7590639dbf..a03881c73d68 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -9119,10 +9119,10 @@ static int ath12k_mac_mgmt_action_frame_fill_elem_data(struct ath12k_link_vif *a lockdep_assert_wiphy(wiphy); /* make sure category field is present */ - if (skb->len < IEEE80211_MIN_ACTION_SIZE) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(category)) return -EINVAL; - remaining_len = skb->len - IEEE80211_MIN_ACTION_SIZE; + remaining_len = skb->len - IEEE80211_MIN_ACTION_SIZE(category); has_protected = ieee80211_has_protected(hdr->frame_control); /* In case of SW crypto and hdr protected (PMF), packet will already be encrypted, diff --git a/drivers/net/wireless/ath/ath12k/wifi7/hw.c b/drivers/net/wireless/ath/ath12k/wifi7/hw.c index 27acdfc35459..ec6dba96640b 100644 --- a/drivers/net/wireless/ath/ath12k/wifi7/hw.c +++ b/drivers/net/wireless/ath/ath12k/wifi7/hw.c @@ -104,7 +104,7 @@ static bool ath12k_is_addba_resp_action_code(struct ieee80211_mgmt *mgmt) if (mgmt->u.action.category != WLAN_CATEGORY_BACK) return false; - if (mgmt->u.action.u.addba_resp.action_code != WLAN_ACTION_ADDBA_RESP) + if (mgmt->u.action.action_code != WLAN_ACTION_ADDBA_RESP) return false; return true; diff --git a/drivers/net/wireless/intel/iwlwifi/mld/time_sync.c b/drivers/net/wireless/intel/iwlwifi/mld/time_sync.c index 897ab65b71aa..474dd555e70b 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/time_sync.c +++ b/drivers/net/wireless/intel/iwlwifi/mld/time_sync.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (C) 2025 Intel Corporation + * Copyright (C) 2025-2026 Intel Corporation */ #include "mld.h" @@ -116,9 +116,9 @@ static bool iwl_mld_is_skb_match(struct sk_buff *skb, u8 *addr, u8 dialog_token) u8 skb_dialog_token; if (ieee80211_is_timing_measurement(skb)) - skb_dialog_token = mgmt->u.action.u.wnm_timing_msr.dialog_token; + skb_dialog_token = mgmt->u.action.wnm_timing_msr.dialog_token; else - skb_dialog_token = mgmt->u.action.u.ftm.dialog_token; + skb_dialog_token = mgmt->u.action.ftm.dialog_token; if ((ether_addr_equal(mgmt->sa, addr) || ether_addr_equal(mgmt->da, addr)) && diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c index ebc569e94f55..1b67836b1fac 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ftm-initiator.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include #include @@ -1409,8 +1409,7 @@ void iwl_mvm_ftm_lc_notif(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb) struct iwl_mvm_loc_entry *entry; const u8 *ies, *lci, *civic, *msr_ie; size_t ies_len, lci_len = 0, civic_len = 0; - size_t baselen = IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.ftm); + size_t baselen = IEEE80211_MIN_ACTION_SIZE(ftm); static const u8 rprt_type_lci = IEEE80211_SPCT_MSR_RPRT_TYPE_LCI; static const u8 rprt_type_civic = IEEE80211_SPCT_MSR_RPRT_TYPE_CIVIC; @@ -1419,7 +1418,7 @@ void iwl_mvm_ftm_lc_notif(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb) lockdep_assert_held(&mvm->mutex); - ies = mgmt->u.action.u.ftm.variable; + ies = mgmt->u.action.ftm.variable; ies_len = len - baselen; msr_ie = cfg80211_find_ie_match(WLAN_EID_MEASURE_REPORT, ies, ies_len, diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/time-sync.c b/drivers/net/wireless/intel/iwlwifi/mvm/time-sync.c index edae3e24192b..039b4daac73f 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/time-sync.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/time-sync.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* - * Copyright (C) 2022 Intel Corporation + * Copyright (C) 2022, 2026 Intel Corporation */ #include "mvm.h" @@ -18,9 +18,9 @@ static bool iwl_mvm_is_skb_match(struct sk_buff *skb, u8 *addr, u8 dialog_token) u8 skb_dialog_token; if (ieee80211_is_timing_measurement(skb)) - skb_dialog_token = mgmt->u.action.u.wnm_timing_msr.dialog_token; + skb_dialog_token = mgmt->u.action.wnm_timing_msr.dialog_token; else - skb_dialog_token = mgmt->u.action.u.ftm.dialog_token; + skb_dialog_token = mgmt->u.action.ftm.dialog_token; if ((ether_addr_equal(mgmt->sa, addr) || ether_addr_equal(mgmt->da, addr)) && diff --git a/drivers/net/wireless/marvell/mwifiex/tdls.c b/drivers/net/wireless/marvell/mwifiex/tdls.c index a4cf323e704b..845f2a22e071 100644 --- a/drivers/net/wireless/marvell/mwifiex/tdls.c +++ b/drivers/net/wireless/marvell/mwifiex/tdls.c @@ -755,16 +755,12 @@ mwifiex_construct_tdls_action_frame(struct mwifiex_private *priv, switch (action_code) { case WLAN_PUB_ACTION_TDLS_DISCOVER_RES: /* See the layout of 'struct ieee80211_mgmt'. */ - extra = sizeof(mgmt->u.action.u.tdls_discover_resp) + - sizeof(mgmt->u.action.category); + extra = IEEE80211_MIN_ACTION_SIZE(tdls_discover_resp) - 24; skb_put(skb, extra); mgmt->u.action.category = WLAN_CATEGORY_PUBLIC; - mgmt->u.action.u.tdls_discover_resp.action_code = - WLAN_PUB_ACTION_TDLS_DISCOVER_RES; - mgmt->u.action.u.tdls_discover_resp.dialog_token = - dialog_token; - mgmt->u.action.u.tdls_discover_resp.capability = - cpu_to_le16(capab); + mgmt->u.action.action_code = WLAN_PUB_ACTION_TDLS_DISCOVER_RES; + mgmt->u.action.tdls_discover_resp.dialog_token = dialog_token; + mgmt->u.action.tdls_discover_resp.capability = cpu_to_le16(capab); /* move back for addr4 */ memmove(pos + ETH_ALEN, &mgmt->u.action, extra); /* init address 4 */ diff --git a/drivers/net/wireless/marvell/mwl8k.c b/drivers/net/wireless/marvell/mwl8k.c index 99321d180f34..b1af02180341 100644 --- a/drivers/net/wireless/marvell/mwl8k.c +++ b/drivers/net/wireless/marvell/mwl8k.c @@ -1985,9 +1985,9 @@ mwl8k_txq_xmit(struct ieee80211_hw *hw, */ if (unlikely(ieee80211_is_action(wh->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_BACK && - mgmt->u.action.u.addba_req.action_code == WLAN_ACTION_ADDBA_REQ && + mgmt->u.action.action_code == WLAN_ACTION_ADDBA_REQ && priv->ap_fw)) { - u16 capab = le16_to_cpu(mgmt->u.action.u.addba_req.capab); + u16 capab = le16_to_cpu(mgmt->u.action.addba_req.capab); tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; index = mwl8k_tid_queue_mapping(tid); } diff --git a/drivers/net/wireless/mediatek/mt76/mt76_connac_mac.c b/drivers/net/wireless/mediatek/mt76/mt76_connac_mac.c index b41ca1410da9..f946ddc20a47 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76_connac_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt76_connac_mac.c @@ -413,10 +413,10 @@ mt76_connac2_mac_write_txwi_80211(struct mt76_dev *dev, __le32 *txwi, u32 val; if (ieee80211_is_action(fc) && - skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 + 1 + 2 && + skb->len >= IEEE80211_MIN_ACTION_SIZE(addba_req.capab) && mgmt->u.action.category == WLAN_CATEGORY_BACK && - mgmt->u.action.u.addba_req.action_code == WLAN_ACTION_ADDBA_REQ) { - u16 capab = le16_to_cpu(mgmt->u.action.u.addba_req.capab); + mgmt->u.action.action_code == WLAN_ACTION_ADDBA_REQ) { + u16 capab = le16_to_cpu(mgmt->u.action.addba_req.capab); txwi[5] |= cpu_to_le32(MT_TXD5_ADD_BA); tid = (capab >> 2) & IEEE80211_QOS_CTL_TID_MASK; diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/mac.c b/drivers/net/wireless/mediatek/mt76/mt7925/mac.c index 0d9435900423..caaf71c31480 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7925/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7925/mac.c @@ -668,9 +668,9 @@ mt7925_mac_write_txwi_80211(struct mt76_dev *dev, __le32 *txwi, u32 val; if (ieee80211_is_action(fc) && - skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 && + skb->len >= IEEE80211_MIN_ACTION_SIZE(action_code) && mgmt->u.action.category == WLAN_CATEGORY_BACK && - mgmt->u.action.u.addba_req.action_code == WLAN_ACTION_ADDBA_REQ) + mgmt->u.action.action_code == WLAN_ACTION_ADDBA_REQ) tid = MT_TX_ADDBA; else if (ieee80211_is_mgmt(hdr->frame_control)) tid = MT_TX_NORMAL; diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/mac.c b/drivers/net/wireless/mediatek/mt76/mt7996/mac.c index d4f3ee943b47..84cbf36b493c 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7996/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7996/mac.c @@ -800,9 +800,9 @@ mt7996_mac_write_txwi_80211(struct mt7996_dev *dev, __le32 *txwi, u32 val; if (ieee80211_is_action(fc) && - skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 && + skb->len >= IEEE80211_MIN_ACTION_SIZE(action_code) && mgmt->u.action.category == WLAN_CATEGORY_BACK && - mgmt->u.action.u.addba_req.action_code == WLAN_ACTION_ADDBA_REQ) { + mgmt->u.action.action_code == WLAN_ACTION_ADDBA_REQ) { if (is_mt7990(&dev->mt76)) txwi[6] |= cpu_to_le32(FIELD_PREP(MT_TXD6_TID_ADDBA, tid)); else diff --git a/drivers/net/wireless/realtek/rtl8xxxu/core.c b/drivers/net/wireless/realtek/rtl8xxxu/core.c index 794187d28caa..804fc604e5f8 100644 --- a/drivers/net/wireless/realtek/rtl8xxxu/core.c +++ b/drivers/net/wireless/realtek/rtl8xxxu/core.c @@ -5146,10 +5146,10 @@ static void rtl8xxxu_dump_action(struct device *dev, if (!(rtl8xxxu_debug & RTL8XXXU_DEBUG_ACTION)) return; - switch (mgmt->u.action.u.addba_resp.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_ACTION_ADDBA_RESP: - cap = le16_to_cpu(mgmt->u.action.u.addba_resp.capab); - timeout = le16_to_cpu(mgmt->u.action.u.addba_resp.timeout); + cap = le16_to_cpu(mgmt->u.action.addba_resp.capab); + timeout = le16_to_cpu(mgmt->u.action.addba_resp.timeout); dev_info(dev, "WLAN_ACTION_ADDBA_RESP: " "timeout %i, tid %02x, buf_size %02x, policy %02x, " "status %02x\n", @@ -5157,11 +5157,11 @@ static void rtl8xxxu_dump_action(struct device *dev, (cap & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2, (cap & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6, (cap >> 1) & 0x1, - le16_to_cpu(mgmt->u.action.u.addba_resp.status)); + le16_to_cpu(mgmt->u.action.addba_resp.status)); break; case WLAN_ACTION_ADDBA_REQ: - cap = le16_to_cpu(mgmt->u.action.u.addba_req.capab); - timeout = le16_to_cpu(mgmt->u.action.u.addba_req.timeout); + cap = le16_to_cpu(mgmt->u.action.addba_req.capab); + timeout = le16_to_cpu(mgmt->u.action.addba_req.timeout); dev_info(dev, "WLAN_ACTION_ADDBA_REQ: " "timeout %i, tid %02x, buf_size %02x, policy %02x\n", timeout, @@ -5171,7 +5171,7 @@ static void rtl8xxxu_dump_action(struct device *dev, break; default: dev_info(dev, "action frame %02x\n", - mgmt->u.action.u.addba_resp.action_code); + mgmt->u.action.action_code); break; } } diff --git a/drivers/net/wireless/realtek/rtlwifi/base.c b/drivers/net/wireless/realtek/rtlwifi/base.c index 0ac9cf0937aa..aad377864e73 100644 --- a/drivers/net/wireless/realtek/rtlwifi/base.c +++ b/drivers/net/wireless/realtek/rtlwifi/base.c @@ -1409,7 +1409,7 @@ bool rtl_action_proc(struct ieee80211_hw *hw, struct sk_buff *skb, u8 is_tx) sta_entry = (struct rtl_sta_info *)sta->drv_priv; capab = - le16_to_cpu(mgmt->u.action.u.addba_req.capab); + le16_to_cpu(mgmt->u.action.addba_req.capab); tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; if (tid >= MAX_TID_COUNT) { @@ -2392,35 +2392,35 @@ static struct sk_buff *rtl_make_smps_action(struct ieee80211_hw *hw, struct sk_buff *skb; struct ieee80211_mgmt *action_frame; - /* 27 = header + category + action + smps mode */ - skb = dev_alloc_skb(27 + hw->extra_tx_headroom); + skb = dev_alloc_skb(IEEE80211_MIN_ACTION_SIZE(ht_smps) + + hw->extra_tx_headroom); if (!skb) return NULL; skb_reserve(skb, hw->extra_tx_headroom); - action_frame = skb_put_zero(skb, 27); + action_frame = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE(ht_smps)); memcpy(action_frame->da, da, ETH_ALEN); memcpy(action_frame->sa, rtlefuse->dev_addr, ETH_ALEN); memcpy(action_frame->bssid, bssid, ETH_ALEN); action_frame->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); action_frame->u.action.category = WLAN_CATEGORY_HT; - action_frame->u.action.u.ht_smps.action = WLAN_HT_ACTION_SMPS; + action_frame->u.action.action_code = WLAN_HT_ACTION_SMPS; switch (smps) { case IEEE80211_SMPS_AUTOMATIC:/* 0 */ case IEEE80211_SMPS_NUM_MODES:/* 4 */ WARN_ON(1); fallthrough; case IEEE80211_SMPS_OFF:/* 1 */ /*MIMO_PS_NOLIMIT*/ - action_frame->u.action.u.ht_smps.smps_control = + action_frame->u.action.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_DISABLED;/* 0 */ break; case IEEE80211_SMPS_STATIC:/* 2 */ /*MIMO_PS_STATIC*/ - action_frame->u.action.u.ht_smps.smps_control = + action_frame->u.action.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_STATIC;/* 1 */ break; case IEEE80211_SMPS_DYNAMIC:/* 3 */ /*MIMO_PS_DYNAMIC*/ - action_frame->u.action.u.ht_smps.smps_control = + action_frame->u.action.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_DYNAMIC;/* 3 */ break; } @@ -2519,25 +2519,25 @@ struct sk_buff *rtl_make_del_ba(struct ieee80211_hw *hw, struct ieee80211_mgmt *action_frame; u16 params; - /* 27 = header + category + action + smps mode */ - skb = dev_alloc_skb(34 + hw->extra_tx_headroom); + skb = dev_alloc_skb(IEEE80211_MIN_ACTION_SIZE(delba) + + hw->extra_tx_headroom); if (!skb) return NULL; skb_reserve(skb, hw->extra_tx_headroom); - action_frame = skb_put_zero(skb, 34); + action_frame = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE(delba)); memcpy(action_frame->sa, sa, ETH_ALEN); memcpy(action_frame->da, rtlefuse->dev_addr, ETH_ALEN); memcpy(action_frame->bssid, bssid, ETH_ALEN); action_frame->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); action_frame->u.action.category = WLAN_CATEGORY_BACK; - action_frame->u.action.u.delba.action_code = WLAN_ACTION_DELBA; + action_frame->u.action.action_code = WLAN_ACTION_DELBA; params = (u16)(1 << 11); /* bit 11 initiator */ params |= (u16)(tid << 12); /* bit 15:12 TID number */ - action_frame->u.action.u.delba.params = cpu_to_le16(params); - action_frame->u.action.u.delba.reason_code = + action_frame->u.action.delba.params = cpu_to_le16(params); + action_frame->u.action.delba.reason_code = cpu_to_le16(WLAN_REASON_QSTA_TIMEOUT); return skb; diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c index d080469264cf..19e2ff62d9f1 100644 --- a/drivers/net/wireless/realtek/rtlwifi/pci.c +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c @@ -507,7 +507,7 @@ static void _rtl_pci_tx_isr(struct ieee80211_hw *hw, int prio) if (ieee80211_is_action(fc)) { struct ieee80211_mgmt *action_frame = (struct ieee80211_mgmt *)skb->data; - if (action_frame->u.action.u.ht_smps.action == + if (action_frame->u.action.action_code == WLAN_HT_ACTION_SMPS) { dev_kfree_skb(skb); goto tx_status_ok; diff --git a/drivers/net/wireless/silabs/wfx/data_rx.c b/drivers/net/wireless/silabs/wfx/data_rx.c index e099a9e65bae..15c06b2b8633 100644 --- a/drivers/net/wireless/silabs/wfx/data_rx.c +++ b/drivers/net/wireless/silabs/wfx/data_rx.c @@ -21,14 +21,14 @@ static void wfx_rx_handle_ba(struct wfx_vif *wvif, struct ieee80211_mgmt *mgmt) if (wfx_api_older_than(wvif->wdev, 3, 6)) return; - switch (mgmt->u.action.u.addba_req.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_ACTION_ADDBA_REQ: - params = le16_to_cpu(mgmt->u.action.u.addba_req.capab); + params = le16_to_cpu(mgmt->u.action.addba_req.capab); tid = (params & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; ieee80211_start_rx_ba_session_offl(vif, mgmt->sa, tid); break; case WLAN_ACTION_DELBA: - params = le16_to_cpu(mgmt->u.action.u.delba.params); + params = le16_to_cpu(mgmt->u.action.delba.params); tid = (params & IEEE80211_DELBA_PARAM_TID_MASK) >> 12; ieee80211_stop_rx_ba_session_offl(vif, mgmt->sa, tid); break; @@ -80,7 +80,7 @@ void wfx_rx_cb(struct wfx_vif *wvif, const struct wfx_hif_ind_rx *arg, struct sk */ if (ieee80211_is_action(frame->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_BACK && - skb->len > IEEE80211_MIN_ACTION_SIZE) { + skb->len > IEEE80211_MIN_ACTION_SIZE(action_code)) { wfx_rx_handle_ba(wvif, mgmt); goto drop; } diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 3651b2e6c518..aea360e90cb1 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1046,31 +1046,28 @@ struct ieee80211_mgmt { } __packed probe_resp; struct { u8 category; + u8 action_code; union { struct { - u8 action_code; u8 dialog_token; u8 status_code; u8 variable[]; } __packed wme_action; struct{ - u8 action_code; + u8 no_fixed_fields[0]; u8 variable[]; } __packed chan_switch; struct{ - u8 action_code; struct ieee80211_ext_chansw_ie data; u8 variable[]; } __packed ext_chan_switch; struct{ - u8 action_code; u8 dialog_token; u8 element_id; u8 length; struct ieee80211_msrment_ie msr_elem; } __packed measurement; struct{ - u8 action_code; u8 dialog_token; __le16 capab; __le16 timeout; @@ -1079,7 +1076,6 @@ struct ieee80211_mgmt { u8 variable[]; } __packed addba_req; struct{ - u8 action_code; u8 dialog_token; __le16 status; __le16 capab; @@ -1088,54 +1084,45 @@ struct ieee80211_mgmt { u8 variable[]; } __packed addba_resp; struct{ - u8 action_code; __le16 params; __le16 reason_code; } __packed delba; struct { - u8 action_code; + u8 no_fixed_fields[0]; u8 variable[]; } __packed self_prot; struct{ - u8 action_code; + u8 no_fixed_fields[0]; u8 variable[]; } __packed mesh_action; struct { - u8 action; u8 trans_id[WLAN_SA_QUERY_TR_ID_LEN]; } __packed sa_query; struct { - u8 action; u8 smps_control; } __packed ht_smps; struct { - u8 action_code; u8 chanwidth; } __packed ht_notify_cw; struct { - u8 action_code; u8 dialog_token; __le16 capability; u8 variable[]; } __packed tdls_discover_resp; struct { - u8 action_code; u8 operating_mode; } __packed vht_opmode_notif; struct { - u8 action_code; u8 membership[WLAN_MEMBERSHIP_LEN]; u8 position[WLAN_USER_POSITION_LEN]; } __packed vht_group_notif; struct { - u8 action_code; u8 dialog_token; u8 tpc_elem_id; u8 tpc_elem_length; struct ieee80211_tpc_report_ie tpc; } __packed tpc_report; struct { - u8 action_code; u8 dialog_token; u8 follow_up; u8 tod[6]; @@ -1145,11 +1132,10 @@ struct ieee80211_mgmt { u8 variable[]; } __packed ftm; struct { - u8 action_code; + u8 no_fixed_fields[0]; u8 variable[]; } __packed s1g; struct { - u8 action_code; u8 dialog_token; u8 follow_up; u32 tod; @@ -1158,41 +1144,37 @@ struct ieee80211_mgmt { u8 max_toa_error; } __packed wnm_timing_msr; struct { - u8 action_code; u8 dialog_token; u8 variable[]; } __packed ttlm_req; struct { - u8 action_code; u8 dialog_token; __le16 status_code; u8 variable[]; } __packed ttlm_res; struct { - u8 action_code; + u8 no_fixed_fields[0]; + /* no variable fields either */ } __packed ttlm_tear_down; struct { - u8 action_code; u8 dialog_token; u8 variable[]; } __packed ml_reconf_req; struct { - u8 action_code; u8 dialog_token; u8 count; u8 variable[]; } __packed ml_reconf_resp; struct { - u8 action_code; + u8 no_fixed_fields[0]; u8 variable[]; } __packed epcs; struct { - u8 action_code; u8 dialog_token; u8 control; u8 variable[]; } __packed eml_omn; - } u; + }; } __packed action; DECLARE_FLEX_ARRAY(u8, body); /* Generic frame body */ } u; @@ -1210,8 +1192,7 @@ struct ieee80211_mgmt { #define BSS_MEMBERSHIP_SELECTOR_MIN BSS_MEMBERSHIP_SELECTOR_UHR_PHY -/* mgmt header + 1 byte category code */ -#define IEEE80211_MIN_ACTION_SIZE offsetof(struct ieee80211_mgmt, u.action.u) +#define IEEE80211_MIN_ACTION_SIZE(type) offsetofend(struct ieee80211_mgmt, u.action.type) /* Management MIC information element (IEEE 802.11w) for CMAC */ @@ -2391,7 +2372,7 @@ static inline bool ieee80211_is_bufferable_mmpdu(struct sk_buff *skb) if (!ieee80211_is_action(fc)) return false; - if (skb->len < offsetofend(typeof(*mgmt), u.action.u.ftm.action_code)) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(action_code)) return true; /* action frame - additionally check for non-bufferable FTM */ @@ -2400,8 +2381,8 @@ static inline bool ieee80211_is_bufferable_mmpdu(struct sk_buff *skb) mgmt->u.action.category != WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION) return true; - if (mgmt->u.action.u.ftm.action_code == WLAN_PUB_ACTION_FTM_REQUEST || - mgmt->u.action.u.ftm.action_code == WLAN_PUB_ACTION_FTM_RESPONSE) + if (mgmt->u.action.action_code == WLAN_PUB_ACTION_FTM_REQUEST || + mgmt->u.action.action_code == WLAN_PUB_ACTION_FTM_RESPONSE) return false; return true; @@ -2451,7 +2432,7 @@ static inline bool _ieee80211_is_robust_mgmt_frame(struct ieee80211_hdr *hdr) */ static inline bool ieee80211_is_robust_mgmt_frame(struct sk_buff *skb) { - if (skb->len < IEEE80211_MIN_ACTION_SIZE) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(category)) return false; return _ieee80211_is_robust_mgmt_frame((void *)skb->data); } @@ -2467,7 +2448,7 @@ static inline bool ieee80211_is_public_action(struct ieee80211_hdr *hdr, { struct ieee80211_mgmt *mgmt = (void *)hdr; - if (len < IEEE80211_MIN_ACTION_SIZE) + if (len < IEEE80211_MIN_ACTION_SIZE(category)) return false; if (!ieee80211_is_action(hdr->frame_control)) return false; @@ -2485,13 +2466,14 @@ static inline bool ieee80211_is_public_action(struct ieee80211_hdr *hdr, static inline bool ieee80211_is_protected_dual_of_public_action(struct sk_buff *skb) { + struct ieee80211_mgmt *mgmt = (void *)skb->data; u8 action; if (!ieee80211_is_public_action((void *)skb->data, skb->len) || - skb->len < IEEE80211_MIN_ACTION_SIZE + 1) + skb->len < IEEE80211_MIN_ACTION_SIZE(action_code)) return false; - action = *(u8 *)(skb->data + IEEE80211_MIN_ACTION_SIZE); + action = mgmt->u.action.action_code; return action != WLAN_PUB_ACTION_20_40_BSS_COEX && action != WLAN_PUB_ACTION_DSE_REG_LOC_ANN && @@ -2530,7 +2512,7 @@ static inline bool _ieee80211_is_group_privacy_action(struct ieee80211_hdr *hdr) */ static inline bool ieee80211_is_group_privacy_action(struct sk_buff *skb) { - if (skb->len < IEEE80211_MIN_ACTION_SIZE) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(category)) return false; return _ieee80211_is_group_privacy_action((void *)skb->data); } @@ -2626,8 +2608,7 @@ static inline bool ieee80211_action_contains_tpc(struct sk_buff *skb) if (!ieee80211_is_action(mgmt->frame_control)) return false; - if (skb->len < IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.tpc_report)) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(tpc_report)) return false; /* @@ -2646,12 +2627,11 @@ static inline bool ieee80211_action_contains_tpc(struct sk_buff *skb) return false; /* both spectrum mgmt and link measurement have same action code */ - if (mgmt->u.action.u.tpc_report.action_code != - WLAN_ACTION_SPCT_TPC_RPRT) + if (mgmt->u.action.action_code != WLAN_ACTION_SPCT_TPC_RPRT) return false; - if (mgmt->u.action.u.tpc_report.tpc_elem_id != WLAN_EID_TPC_REPORT || - mgmt->u.action.u.tpc_report.tpc_elem_length != + if (mgmt->u.action.tpc_report.tpc_elem_id != WLAN_EID_TPC_REPORT || + mgmt->u.action.tpc_report.tpc_elem_length != sizeof(struct ieee80211_tpc_report_ie)) return false; @@ -2667,16 +2647,15 @@ static inline bool ieee80211_is_timing_measurement(struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (void *)skb->data; - if (skb->len < IEEE80211_MIN_ACTION_SIZE) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(wnm_timing_msr)) return false; if (!ieee80211_is_action(mgmt->frame_control)) return false; if (mgmt->u.action.category == WLAN_CATEGORY_WNM_UNPROTECTED && - mgmt->u.action.u.wnm_timing_msr.action_code == - WLAN_UNPROTECTED_WNM_ACTION_TIMING_MEASUREMENT_RESPONSE && - skb->len >= offsetofend(typeof(*mgmt), u.action.u.wnm_timing_msr)) + mgmt->u.action.action_code == + WLAN_UNPROTECTED_WNM_ACTION_TIMING_MEASUREMENT_RESPONSE) return true; return false; @@ -2691,15 +2670,13 @@ static inline bool ieee80211_is_ftm(struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (void *)skb->data; - if (!ieee80211_is_public_action((void *)mgmt, skb->len)) + if (skb->len < IEEE80211_MIN_ACTION_SIZE(ftm)) return false; - if (mgmt->u.action.u.ftm.action_code == - WLAN_PUB_ACTION_FTM_RESPONSE && - skb->len >= offsetofend(typeof(*mgmt), u.action.u.ftm)) - return true; + if (!ieee80211_is_public_action((void *)mgmt, skb->len)) + return false; - return false; + return mgmt->u.action.action_code == WLAN_PUB_ACTION_FTM_RESPONSE; } struct element { diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index f301a8622bee..0a2be8cb600f 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ /** @@ -251,19 +251,20 @@ static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid, skb_reserve(skb, local->hw.extra_tx_headroom); mgmt = ieee80211_mgmt_ba(skb, da, sdata); - skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_resp)); + skb_put(skb, 2 + sizeof(mgmt->u.action.addba_resp)); mgmt->u.action.category = WLAN_CATEGORY_BACK; - mgmt->u.action.u.addba_resp.action_code = WLAN_ACTION_ADDBA_RESP; - mgmt->u.action.u.addba_resp.dialog_token = dialog_token; + mgmt->u.action.action_code = WLAN_ACTION_ADDBA_RESP; + + mgmt->u.action.addba_resp.dialog_token = dialog_token; capab = u16_encode_bits(amsdu, IEEE80211_ADDBA_PARAM_AMSDU_MASK); capab |= u16_encode_bits(policy, IEEE80211_ADDBA_PARAM_POLICY_MASK); capab |= u16_encode_bits(tid, IEEE80211_ADDBA_PARAM_TID_MASK); capab |= u16_encode_bits(buf_size, IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK); - mgmt->u.action.u.addba_resp.capab = cpu_to_le16(capab); - mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout); - mgmt->u.action.u.addba_resp.status = cpu_to_le16(status); + mgmt->u.action.addba_resp.capab = cpu_to_le16(capab); + mgmt->u.action.addba_resp.timeout = cpu_to_le16(timeout); + mgmt->u.action.addba_resp.status = cpu_to_le16(status); if (sta->sta.valid_links || sta->sta.deflink.he_cap.has_he) ieee80211_add_addbaext(skb, req_addba_ext_data, buf_size); @@ -477,22 +478,22 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, u8 dialog_token, addba_ext_data; /* extract session parameters from addba request frame */ - dialog_token = mgmt->u.action.u.addba_req.dialog_token; - timeout = le16_to_cpu(mgmt->u.action.u.addba_req.timeout); + dialog_token = mgmt->u.action.addba_req.dialog_token; + timeout = le16_to_cpu(mgmt->u.action.addba_req.timeout); start_seq_num = - le16_to_cpu(mgmt->u.action.u.addba_req.start_seq_num) >> 4; + le16_to_cpu(mgmt->u.action.addba_req.start_seq_num) >> 4; - capab = le16_to_cpu(mgmt->u.action.u.addba_req.capab); + capab = le16_to_cpu(mgmt->u.action.addba_req.capab); ba_policy = (capab & IEEE80211_ADDBA_PARAM_POLICY_MASK) >> 1; tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2; buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6; addba_ext_data = ieee80211_retrieve_addba_ext_data(sta, - mgmt->u.action.u.addba_req.variable, + mgmt->u.action.addba_req.variable, len - offsetof(typeof(*mgmt), - u.action.u.addba_req.variable), + u.action.addba_req.variable), &buf_size); __ieee80211_start_rx_ba_session(sta, dialog_token, timeout, diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 93b47a7ba9c4..d5a62b8d5a80 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2024 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include @@ -68,7 +68,7 @@ static void ieee80211_send_addba_request(struct sta_info *sta, u16 tid, struct ieee80211_mgmt *mgmt; u16 capab; - skb = dev_alloc_skb(sizeof(*mgmt) + + skb = dev_alloc_skb(IEEE80211_MIN_ACTION_SIZE(addba_req) + 2 + sizeof(struct ieee80211_addba_ext_ie) + local->hw.extra_tx_headroom); if (!skb) @@ -77,21 +77,21 @@ static void ieee80211_send_addba_request(struct sta_info *sta, u16 tid, skb_reserve(skb, local->hw.extra_tx_headroom); mgmt = ieee80211_mgmt_ba(skb, sta->sta.addr, sdata); - skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_req)); + skb_put(skb, 2 + sizeof(mgmt->u.action.addba_req)); mgmt->u.action.category = WLAN_CATEGORY_BACK; - mgmt->u.action.u.addba_req.action_code = WLAN_ACTION_ADDBA_REQ; + mgmt->u.action.action_code = WLAN_ACTION_ADDBA_REQ; - mgmt->u.action.u.addba_req.dialog_token = dialog_token; + mgmt->u.action.addba_req.dialog_token = dialog_token; capab = IEEE80211_ADDBA_PARAM_AMSDU_MASK; capab |= IEEE80211_ADDBA_PARAM_POLICY_MASK; capab |= u16_encode_bits(tid, IEEE80211_ADDBA_PARAM_TID_MASK); capab |= u16_encode_bits(agg_size, IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK); - mgmt->u.action.u.addba_req.capab = cpu_to_le16(capab); + mgmt->u.action.addba_req.capab = cpu_to_le16(capab); - mgmt->u.action.u.addba_req.timeout = cpu_to_le16(timeout); - mgmt->u.action.u.addba_req.start_seq_num = + mgmt->u.action.addba_req.timeout = cpu_to_le16(timeout); + mgmt->u.action.addba_req.start_seq_num = cpu_to_le16(start_seq_num << 4); if (sta->sta.deflink.he_cap.has_he) @@ -978,15 +978,15 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, lockdep_assert_wiphy(sta->local->hw.wiphy); - capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab); + capab = le16_to_cpu(mgmt->u.action.addba_resp.capab); amsdu = capab & IEEE80211_ADDBA_PARAM_AMSDU_MASK; tid = u16_get_bits(capab, IEEE80211_ADDBA_PARAM_TID_MASK); buf_size = u16_get_bits(capab, IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK); ieee80211_retrieve_addba_ext_data(sta, - mgmt->u.action.u.addba_resp.variable, + mgmt->u.action.addba_resp.variable, len - offsetof(typeof(*mgmt), - u.action.u.addba_resp.variable), + u.action.addba_resp.variable), &buf_size); buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes); @@ -999,7 +999,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, if (!tid_tx) return; - if (mgmt->u.action.u.addba_resp.dialog_token != tid_tx->dialog_token) { + if (mgmt->u.action.addba_resp.dialog_token != tid_tx->dialog_token) { ht_dbg(sta->sdata, "wrong addBA response token, %pM tid %d\n", sta->sta.addr, tid); return; @@ -1029,7 +1029,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, * is set to 0, the Buffer Size subfield is set to a value * of at least 1. */ - if (le16_to_cpu(mgmt->u.action.u.addba_resp.status) + if (le16_to_cpu(mgmt->u.action.addba_resp.status) == WLAN_STATUS_SUCCESS && buf_size) { if (test_and_set_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) { @@ -1046,7 +1046,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, sta->ampdu_mlme.addba_req_num[tid] = 0; tid_tx->timeout = - le16_to_cpu(mgmt->u.action.u.addba_resp.timeout); + le16_to_cpu(mgmt->u.action.addba_resp.timeout); if (tid_tx->timeout) { mod_timer(&tid_tx->session_timer, diff --git a/net/mac80211/eht.c b/net/mac80211/eht.c index 078e1e23d8d1..768bfc4e737d 100644 --- a/net/mac80211/eht.c +++ b/net/mac80211/eht.c @@ -108,7 +108,7 @@ static void ieee80211_send_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *req, int opt_len) { - int len = offsetofend(struct ieee80211_mgmt, u.action.u.eml_omn); + int len = IEEE80211_MIN_ACTION_SIZE(eml_omn); struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; @@ -127,16 +127,15 @@ ieee80211_send_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata, memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; - mgmt->u.action.u.eml_omn.action_code = - WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF; - mgmt->u.action.u.eml_omn.dialog_token = - req->u.action.u.eml_omn.dialog_token; - mgmt->u.action.u.eml_omn.control = req->u.action.u.eml_omn.control & + mgmt->u.action.action_code = WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF; + mgmt->u.action.eml_omn.dialog_token = + req->u.action.eml_omn.dialog_token; + mgmt->u.action.eml_omn.control = req->u.action.eml_omn.control & ~(IEEE80211_EML_CTRL_EMLSR_PARAM_UPDATE | IEEE80211_EML_CTRL_INDEV_COEX_ACT); /* Copy optional fields from the received notification frame */ - memcpy(mgmt->u.action.u.eml_omn.variable, - req->u.action.u.eml_omn.variable, opt_len); + memcpy(mgmt->u.action.eml_omn.variable, + req->u.action.eml_omn.variable, opt_len); ieee80211_tx_skb(sdata, skb); } @@ -144,14 +143,14 @@ ieee80211_send_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata, void ieee80211_rx_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { - int len = offsetofend(struct ieee80211_mgmt, u.action.u.eml_omn); + int len = IEEE80211_MIN_ACTION_SIZE(eml_omn); enum nl80211_iftype type = ieee80211_vif_type_p2p(&sdata->vif); struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); const struct wiphy_iftype_ext_capab *ift_ext_capa; struct ieee80211_mgmt *mgmt = (void *)skb->data; struct ieee80211_local *local = sdata->local; - u8 control = mgmt->u.action.u.eml_omn.control; - u8 *ptr = mgmt->u.action.u.eml_omn.variable; + u8 control = mgmt->u.action.eml_omn.control; + u8 *ptr = mgmt->u.action.eml_omn.variable; struct ieee80211_eml_params eml_params = { .link_id = status->link_id, .control = control, diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 1c82a28b03de..9e2469a8ce64 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2010, Intel Corporation * Copyright 2017 Intel Deutschland GmbH - * Copyright(c) 2020-2025 Intel Corporation + * Copyright(c) 2020-2026 Intel Corporation */ #include @@ -462,22 +462,23 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt; u16 params; - skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom); + skb = dev_alloc_skb(IEEE80211_MIN_ACTION_SIZE(delba) + + local->hw.extra_tx_headroom); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); mgmt = ieee80211_mgmt_ba(skb, da, sdata); - skb_put(skb, 1 + sizeof(mgmt->u.action.u.delba)); + skb_put(skb, 2 + sizeof(mgmt->u.action.delba)); mgmt->u.action.category = WLAN_CATEGORY_BACK; - mgmt->u.action.u.delba.action_code = WLAN_ACTION_DELBA; + mgmt->u.action.action_code = WLAN_ACTION_DELBA; params = (u16)(initiator << 11); /* bit 11 initiator */ params |= (u16)(tid << 12); /* bit 15:12 TID number */ - mgmt->u.action.u.delba.params = cpu_to_le16(params); - mgmt->u.action.u.delba.reason_code = cpu_to_le16(reason_code); + mgmt->u.action.delba.params = cpu_to_le16(params); + mgmt->u.action.delba.reason_code = cpu_to_le16(reason_code); ieee80211_tx_skb(sdata, skb); } @@ -489,14 +490,14 @@ void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, u16 tid, params; u16 initiator; - params = le16_to_cpu(mgmt->u.action.u.delba.params); + params = le16_to_cpu(mgmt->u.action.delba.params); tid = (params & IEEE80211_DELBA_PARAM_TID_MASK) >> 12; initiator = (params & IEEE80211_DELBA_PARAM_INITIATOR_MASK) >> 11; ht_dbg_ratelimited(sdata, "delba from %pM (%s) tid %d reason code %d\n", mgmt->sa, initiator ? "initiator" : "recipient", tid, - le16_to_cpu(mgmt->u.action.u.delba.reason_code)); + le16_to_cpu(mgmt->u.action.delba.reason_code)); if (initiator == WLAN_BACK_INITIATOR) __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_INITIATOR, 0, @@ -530,20 +531,20 @@ int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_info *info; u8 status_link_id = link_id < 0 ? 0 : link_id; - /* 27 = header + category + action + smps mode */ - skb = dev_alloc_skb(27 + local->hw.extra_tx_headroom); + skb = dev_alloc_skb(IEEE80211_MIN_ACTION_SIZE(ht_smps) + + local->hw.extra_tx_headroom); if (!skb) return -ENOMEM; skb_reserve(skb, local->hw.extra_tx_headroom); - action_frame = skb_put(skb, 27); + action_frame = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE(ht_smps)); memcpy(action_frame->da, da, ETH_ALEN); memcpy(action_frame->sa, sdata->dev->dev_addr, ETH_ALEN); memcpy(action_frame->bssid, bssid, ETH_ALEN); action_frame->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); action_frame->u.action.category = WLAN_CATEGORY_HT; - action_frame->u.action.u.ht_smps.action = WLAN_HT_ACTION_SMPS; + action_frame->u.action.action_code = WLAN_HT_ACTION_SMPS; switch (smps) { case IEEE80211_SMPS_AUTOMATIC: case IEEE80211_SMPS_NUM_MODES: @@ -551,15 +552,15 @@ int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, smps = IEEE80211_SMPS_OFF; fallthrough; case IEEE80211_SMPS_OFF: - action_frame->u.action.u.ht_smps.smps_control = + action_frame->u.action.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_DISABLED; break; case IEEE80211_SMPS_STATIC: - action_frame->u.action.u.ht_smps.smps_control = + action_frame->u.action.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_STATIC; break; case IEEE80211_SMPS_DYNAMIC: - action_frame->u.action.u.ht_smps.smps_control = + action_frame->u.action.ht_smps.smps_control = WLAN_HT_SMPS_CONTROL_DYNAMIC; break; } diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 168f84a1353b..0298272c37ec 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -9,7 +9,7 @@ * Copyright 2009, Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH - * Copyright(c) 2018-2025 Intel Corporation + * Copyright(c) 2018-2026 Intel Corporation */ #include @@ -888,19 +888,11 @@ ieee80211_rx_mgmt_spectrum_mgmt(struct ieee80211_sub_if_data *sdata, struct ieee80211_rx_status *rx_status, struct ieee802_11_elems *elems) { - int required_len; - - if (len < IEEE80211_MIN_ACTION_SIZE + 1) + if (len < IEEE80211_MIN_ACTION_SIZE(chan_switch)) return; /* CSA is the only action we handle for now */ - if (mgmt->u.action.u.measurement.action_code != - WLAN_ACTION_SPCT_CHL_SWITCH) - return; - - required_len = IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.chan_switch); - if (len < required_len) + if (mgmt->u.action.action_code != WLAN_ACTION_SPCT_CHL_SWITCH) return; if (!sdata->vif.bss_conf.csa_active) @@ -1613,12 +1605,12 @@ void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, case WLAN_CATEGORY_SPECTRUM_MGMT: ies_len = skb->len - offsetof(struct ieee80211_mgmt, - u.action.u.chan_switch.variable); + u.action.chan_switch.variable); if (ies_len < 0) break; - elems = ieee802_11_parse_elems(mgmt->u.action.u.chan_switch.variable, + elems = ieee802_11_parse_elems(mgmt->u.action.chan_switch.variable, ies_len, IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 676b2a43c9f2..2e391cec73a0 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1579,7 +1579,7 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, sta = sta_info_get_bss(sdata, mgmt->sa); if (sta) { - switch (mgmt->u.action.u.addba_req.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_ACTION_ADDBA_REQ: ieee80211_process_addba_request(local, sta, mgmt, len); @@ -1599,9 +1599,9 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, } } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_HT) { - switch (mgmt->u.action.u.ht_smps.action) { + switch (mgmt->u.action.action_code) { case WLAN_HT_ACTION_NOTIFY_CHANWIDTH: { - u8 chanwidth = mgmt->u.action.u.ht_notify_cw.chanwidth; + u8 chanwidth = mgmt->u.action.ht_notify_cw.chanwidth; struct ieee80211_rx_status *status; struct link_sta_info *link_sta; struct sta_info *sta; @@ -1628,7 +1628,7 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, } } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_VHT) { - switch (mgmt->u.action.u.vht_group_notif.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_VHT_ACTION_OPMODE_NOTIF: { struct ieee80211_rx_status *status; enum nl80211_band band; @@ -1637,7 +1637,7 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, status = IEEE80211_SKB_RXCB(skb); band = status->band; - opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode; + opmode = mgmt->u.action.vht_opmode_notif.operating_mode; sta = sta_info_get_bss(sdata, mgmt->sa); @@ -1658,7 +1658,7 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, } } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_S1G) { - switch (mgmt->u.action.u.s1g.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_S1G_TWT_TEARDOWN: case WLAN_S1G_TWT_SETUP: ieee80211_s1g_rx_twt_action(sdata, skb); @@ -1669,7 +1669,7 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_PROTECTED_EHT) { if (sdata->vif.type == NL80211_IFTYPE_AP) { - switch (mgmt->u.action.u.eml_omn.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF: ieee80211_rx_eml_op_mode_notif(sdata, skb); break; @@ -1677,7 +1677,7 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, break; } } else if (sdata->vif.type == NL80211_IFTYPE_STATION) { - switch (mgmt->u.action.u.ttlm_req.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_PROTECTED_EHT_ACTION_TTLM_REQ: ieee80211_process_neg_ttlm_req(sdata, mgmt, skb->len); @@ -1765,7 +1765,7 @@ static void ieee80211_iface_process_status(struct ieee80211_sub_if_data *sdata, if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_S1G) { - switch (mgmt->u.action.u.s1g.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_S1G_TWT_TEARDOWN: case WLAN_S1G_TWT_SETUP: ieee80211_s1g_status_twt_action(sdata, skb); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 28624e57aa49..6696c611dfa4 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2018 - 2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation * Authors: Luis Carlos Cobo * Javier Cardona */ @@ -19,8 +19,7 @@ static struct kmem_cache *rm_cache; bool mesh_action_is_path_sel(struct ieee80211_mgmt *mgmt) { - return (mgmt->u.action.u.mesh_action.action_code == - WLAN_MESH_ACTION_HWMP_PATH_SELECTION); + return mgmt->u.action.action_code == WLAN_MESH_ACTION_HWMP_PATH_SELECTION; } void ieee80211s_init(void) @@ -1618,13 +1617,12 @@ static void mesh_rx_csa_frame(struct ieee80211_sub_if_data *sdata, size_t baselen; u8 *pos; - if (mgmt->u.action.u.measurement.action_code != - WLAN_ACTION_SPCT_CHL_SWITCH) + if (mgmt->u.action.action_code != WLAN_ACTION_SPCT_CHL_SWITCH) return; - pos = mgmt->u.action.u.chan_switch.variable; + pos = mgmt->u.action.chan_switch.variable; baselen = offsetof(struct ieee80211_mgmt, - u.action.u.chan_switch.variable); + u.action.chan_switch.variable); elems = ieee802_11_parse_elems(pos, len - baselen, IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION, @@ -1670,7 +1668,7 @@ static void ieee80211_mesh_rx_mgmt_action(struct ieee80211_sub_if_data *sdata, { switch (mgmt->u.action.category) { case WLAN_CATEGORY_SELF_PROTECTED: - switch (mgmt->u.action.u.self_prot.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_SP_MESH_PEERING_OPEN: case WLAN_SP_MESH_PEERING_CLOSE: case WLAN_SP_MESH_PEERING_CONFIRM: diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 98d5aaa36d00..9d89ebcce1c1 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2019, 2021-2023, 2025 Intel Corporation + * Copyright (C) 2019, 2021-2023, 2025-2026 Intel Corporation * Author: Luis Carlos Cobo */ @@ -105,12 +105,11 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags, u32 lifetime, u32 metric, u32 preq_id, struct ieee80211_sub_if_data *sdata) { + int hdr_len = IEEE80211_MIN_ACTION_SIZE(mesh_action); struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_mgmt *mgmt; u8 *pos, ie_len; - int hdr_len = offsetofend(struct ieee80211_mgmt, - u.action.u.mesh_action); skb = dev_alloc_skb(local->tx_headroom + hdr_len + @@ -127,8 +126,7 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags, /* BSSID == SA */ memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_MESH_ACTION; - mgmt->u.action.u.mesh_action.action_code = - WLAN_MESH_ACTION_HWMP_PATH_SELECTION; + mgmt->u.action.action_code = WLAN_MESH_ACTION_HWMP_PATH_SELECTION; switch (action) { case MPATH_PREQ: @@ -237,13 +235,12 @@ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata, u8 ttl, const u8 *target, u32 target_sn, u16 target_rcode, const u8 *ra) { + int hdr_len = IEEE80211_MIN_ACTION_SIZE(mesh_action); struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct ieee80211_mgmt *mgmt; u8 *pos, ie_len; - int hdr_len = offsetofend(struct ieee80211_mgmt, - u.action.u.mesh_action); if (time_before(jiffies, ifmsh->next_perr)) return -EAGAIN; @@ -265,8 +262,7 @@ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata, /* BSSID == SA */ memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_MESH_ACTION; - mgmt->u.action.u.mesh_action.action_code = - WLAN_MESH_ACTION_HWMP_PATH_SELECTION; + mgmt->u.action.action_code = WLAN_MESH_ACTION_HWMP_PATH_SELECTION; ie_len = 15; pos = skb_put(skb, 2 + ie_len); *pos++ = WLAN_EID_PERR; @@ -938,7 +934,7 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, struct sta_info *sta; /* need action_code */ - if (len < IEEE80211_MIN_ACTION_SIZE + 1) + if (len < IEEE80211_MIN_ACTION_SIZE(mesh_action)) return; rcu_read_lock(); @@ -949,8 +945,8 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, } rcu_read_unlock(); - baselen = (u8 *) mgmt->u.action.u.mesh_action.variable - (u8 *) mgmt; - elems = ieee802_11_parse_elems(mgmt->u.action.u.mesh_action.variable, + baselen = mgmt->u.action.mesh_action.variable - (u8 *)mgmt; + elems = ieee802_11_parse_elems(mgmt->u.action.mesh_action.variable, len - baselen, IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION, diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 04c931cd2063..7d823a55636f 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2019, 2021-2025 Intel Corporation + * Copyright (C) 2019, 2021-2026 Intel Corporation * Author: Luis Carlos Cobo */ #include @@ -13,7 +13,7 @@ #include "rate.h" #include "mesh.h" -#define PLINK_CNF_AID(mgmt) ((mgmt)->u.action.u.self_prot.variable + 2) +#define PLINK_CNF_AID(mgmt) ((mgmt)->u.action.self_prot.variable + 2) #define PLINK_GET_LLID(p) (p + 2) #define PLINK_GET_PLID(p) (p + 4) @@ -215,6 +215,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, enum ieee80211_self_protected_actioncode action, u8 *da, u16 llid, u16 plid, u16 reason) { + int hdr_len = IEEE80211_MIN_ACTION_SIZE(self_prot); struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ieee80211_tx_info *info; @@ -223,7 +224,6 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, u16 peering_proto = 0; u8 *pos, ie_len = 4; u8 ie_len_he_cap, ie_len_eht_cap; - int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.self_prot); int err = -ENOMEM; ie_len_he_cap = ieee80211_ie_len_he_cap(sdata); @@ -260,7 +260,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_SELF_PROTECTED; - mgmt->u.action.u.self_prot.action_code = action; + mgmt->u.action.action_code = action; if (action != WLAN_SP_MESH_PEERING_CLOSE) { struct ieee80211_supported_band *sband; @@ -1141,7 +1141,7 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, return; } - ftype = mgmt->u.action.u.self_prot.action_code; + ftype = mgmt->u.action.action_code; if ((ftype == WLAN_SP_MESH_PEERING_OPEN && ie_len != 4) || (ftype == WLAN_SP_MESH_PEERING_CONFIRM && ie_len != 6) || (ftype == WLAN_SP_MESH_PEERING_CLOSE && ie_len != 6 @@ -1224,8 +1224,8 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, size_t baselen; u8 *baseaddr; - /* need action_code, aux */ - if (len < IEEE80211_MIN_ACTION_SIZE + 3) + /* need aux */ + if (len < IEEE80211_MIN_ACTION_SIZE(self_prot) + 1) return; if (sdata->u.mesh.user_mpm) @@ -1238,10 +1238,9 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, return; } - baseaddr = mgmt->u.action.u.self_prot.variable; - baselen = (u8 *) mgmt->u.action.u.self_prot.variable - (u8 *) mgmt; - if (mgmt->u.action.u.self_prot.action_code == - WLAN_SP_MESH_PEERING_CONFIRM) { + baseaddr = mgmt->u.action.self_prot.variable; + baselen = mgmt->u.action.self_prot.variable - (u8 *)mgmt; + if (mgmt->u.action.action_code == WLAN_SP_MESH_PEERING_CONFIRM) { baseaddr += 4; baselen += 4; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 170330d924a3..da79df92994d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -7957,7 +7957,7 @@ ieee80211_send_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; - int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_req); + int hdr_len = IEEE80211_MIN_ACTION_SIZE(ttlm_req); int ttlm_max_len = 2 + 1 + sizeof(struct ieee80211_ttlm_elem) + 1 + 2 * 2 * IEEE80211_TTLM_NUM_TIDS; @@ -7974,9 +7974,8 @@ ieee80211_send_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; - mgmt->u.action.u.ttlm_req.action_code = - WLAN_PROTECTED_EHT_ACTION_TTLM_REQ; - mgmt->u.action.u.ttlm_req.dialog_token = dialog_token; + mgmt->u.action.action_code = WLAN_PROTECTED_EHT_ACTION_TTLM_REQ; + mgmt->u.action.ttlm_req.dialog_token = dialog_token; ieee80211_neg_ttlm_add_suggested_map(skb, neg_ttlm); ieee80211_tx_skb(sdata, skb); } @@ -8026,7 +8025,7 @@ ieee80211_send_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; - int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_res); + int hdr_len = IEEE80211_MIN_ACTION_SIZE(ttlm_res); int ttlm_max_len = 2 + 1 + sizeof(struct ieee80211_ttlm_elem) + 1 + 2 * 2 * IEEE80211_TTLM_NUM_TIDS; u16 status_code; @@ -8044,9 +8043,8 @@ ieee80211_send_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; - mgmt->u.action.u.ttlm_res.action_code = - WLAN_PROTECTED_EHT_ACTION_TTLM_RES; - mgmt->u.action.u.ttlm_res.dialog_token = dialog_token; + mgmt->u.action.action_code = WLAN_PROTECTED_EHT_ACTION_TTLM_RES; + mgmt->u.action.ttlm_res.dialog_token = dialog_token; switch (ttlm_res) { default: WARN_ON(1); @@ -8063,7 +8061,7 @@ ieee80211_send_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, break; } - mgmt->u.action.u.ttlm_res.status_code = cpu_to_le16(status_code); + mgmt->u.action.ttlm_res.status_code = cpu_to_le16(status_code); ieee80211_tx_skb(sdata, skb); } @@ -8163,10 +8161,9 @@ void ieee80211_process_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, if (!ieee80211_vif_is_mld(&sdata->vif)) return; - dialog_token = mgmt->u.action.u.ttlm_req.dialog_token; - ies_len = len - offsetof(struct ieee80211_mgmt, - u.action.u.ttlm_req.variable); - elems = ieee802_11_parse_elems(mgmt->u.action.u.ttlm_req.variable, + dialog_token = mgmt->u.action.ttlm_req.dialog_token; + ies_len = len - IEEE80211_MIN_ACTION_SIZE(ttlm_req); + elems = ieee802_11_parse_elems(mgmt->u.action.ttlm_req.variable, ies_len, IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION, @@ -8217,8 +8214,7 @@ void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { if (!ieee80211_vif_is_mld(&sdata->vif) || - mgmt->u.action.u.ttlm_req.dialog_token != - sdata->u.mgd.dialog_token_alloc) + mgmt->u.action.ttlm_res.dialog_token != sdata->u.mgd.dialog_token_alloc) return; wiphy_delayed_work_cancel(sdata->local->hw.wiphy, @@ -8232,7 +8228,7 @@ void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, * This can be better implemented in the future, to handle request * rejections. */ - if (le16_to_cpu(mgmt->u.action.u.ttlm_res.status_code) != WLAN_STATUS_SUCCESS) + if (le16_to_cpu(mgmt->u.action.ttlm_res.status_code) != WLAN_STATUS_SUCCESS) __ieee80211_disconnect(sdata); } @@ -8265,12 +8261,11 @@ static void ieee80211_teardown_ttlm_work(struct wiphy *wiphy, void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif) { + int frame_len = IEEE80211_MIN_ACTION_SIZE(ttlm_tear_down); struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; - int frame_len = offsetofend(struct ieee80211_mgmt, - u.action.u.ttlm_tear_down); struct ieee80211_tx_info *info; skb = dev_alloc_skb(local->hw.extra_tx_headroom + frame_len); @@ -8286,8 +8281,7 @@ void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif) memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; - mgmt->u.action.u.ttlm_tear_down.action_code = - WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN; + mgmt->u.action.action_code = WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN; info = IEEE80211_SKB_CB(skb); info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; @@ -8370,13 +8364,13 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, case WLAN_CATEGORY_SPECTRUM_MGMT: ies_len = skb->len - offsetof(struct ieee80211_mgmt, - u.action.u.chan_switch.variable); + u.action.chan_switch.variable); if (ies_len < 0) break; /* CSA IE cannot be overridden, no need for BSSID */ - elems = ieee802_11_parse_elems(mgmt->u.action.u.chan_switch.variable, + elems = ieee802_11_parse_elems(mgmt->u.action.chan_switch.variable, ies_len, IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION, @@ -8398,7 +8392,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, case WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION: ies_len = skb->len - offsetof(struct ieee80211_mgmt, - u.action.u.ext_chan_switch.variable); + u.action.ext_chan_switch.variable); if (ies_len < 0) break; @@ -8407,7 +8401,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, * extended CSA IE can't be overridden, no need for * BSSID */ - elems = ieee802_11_parse_elems(mgmt->u.action.u.ext_chan_switch.variable, + elems = ieee802_11_parse_elems(mgmt->u.action.ext_chan_switch.variable, ies_len, IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION, @@ -8424,7 +8418,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, /* for the handling code pretend it was an IE */ elems->ext_chansw_ie = - &mgmt->u.action.u.ext_chan_switch.data; + &mgmt->u.action.ext_chan_switch.data; ieee80211_sta_process_chanswitch(link, rx_status->mactime, @@ -10426,25 +10420,25 @@ void ieee80211_process_ml_reconf_resp(struct ieee80211_sub_if_data *sdata, u8 *pos; if (!ieee80211_vif_is_mld(&sdata->vif) || - len < offsetofend(typeof(*mgmt), u.action.u.ml_reconf_resp) || - mgmt->u.action.u.ml_reconf_resp.dialog_token != - sdata->u.mgd.reconf.dialog_token || + len < IEEE80211_MIN_ACTION_SIZE(ml_reconf_resp) || + mgmt->u.action.ml_reconf_resp.dialog_token != + sdata->u.mgd.reconf.dialog_token || !sta_changed_links) return; - pos = mgmt->u.action.u.ml_reconf_resp.variable; - len -= offsetofend(typeof(*mgmt), u.action.u.ml_reconf_resp); + pos = mgmt->u.action.ml_reconf_resp.variable; + len -= offsetofend(typeof(*mgmt), u.action.ml_reconf_resp); /* each status duple is 3 octets */ - if (len < mgmt->u.action.u.ml_reconf_resp.count * 3) { + if (len < mgmt->u.action.ml_reconf_resp.count * 3) { sdata_info(sdata, "mlo: reconf: unexpected len=%zu, count=%u\n", - len, mgmt->u.action.u.ml_reconf_resp.count); + len, mgmt->u.action.ml_reconf_resp.count); goto disconnect; } link_mask = sta_changed_links; - for (i = 0; i < mgmt->u.action.u.ml_reconf_resp.count; i++) { + for (i = 0; i < mgmt->u.action.ml_reconf_resp.count; i++) { u16 status = get_unaligned_le16(pos + 1); link_id = *pos; @@ -10729,8 +10723,7 @@ ieee80211_build_ml_reconf_req(struct ieee80211_sub_if_data *sdata, return NULL; skb_reserve(skb, local->hw.extra_tx_headroom); - mgmt = skb_put_zero(skb, offsetofend(struct ieee80211_mgmt, - u.action.u.ml_reconf_req)); + mgmt = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE(ml_reconf_req)); /* Add the MAC header */ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | @@ -10741,12 +10734,11 @@ ieee80211_build_ml_reconf_req(struct ieee80211_sub_if_data *sdata, /* Add the action frame fixed fields */ mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; - mgmt->u.action.u.ml_reconf_req.action_code = - WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_REQ; + mgmt->u.action.action_code = WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_REQ; /* allocate a dialog token and store it */ sdata->u.mgd.reconf.dialog_token = ++sdata->u.mgd.dialog_token_alloc; - mgmt->u.action.u.ml_reconf_req.dialog_token = + mgmt->u.action.ml_reconf_req.dialog_token = sdata->u.mgd.reconf.dialog_token; /* Add the ML reconfiguration element and the common information */ @@ -11116,11 +11108,10 @@ static bool ieee80211_mgd_epcs_supp(struct ieee80211_sub_if_data *sdata) int ieee80211_mgd_set_epcs(struct ieee80211_sub_if_data *sdata, bool enable) { + int frame_len = IEEE80211_MIN_ACTION_SIZE(epcs) + (enable ? 1 : 0); struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; - int frame_len = offsetofend(struct ieee80211_mgmt, - u.action.u.epcs) + (enable ? 1 : 0); if (!ieee80211_mgd_epcs_supp(sdata)) return -EINVAL; @@ -11149,15 +11140,15 @@ int ieee80211_mgd_set_epcs(struct ieee80211_sub_if_data *sdata, bool enable) mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; if (enable) { - u8 *pos = mgmt->u.action.u.epcs.variable; + u8 *pos = mgmt->u.action.epcs.variable; - mgmt->u.action.u.epcs.action_code = + mgmt->u.action.action_code = WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_REQ; *pos = ++sdata->u.mgd.dialog_token_alloc; sdata->u.mgd.epcs.dialog_token = *pos; } else { - mgmt->u.action.u.epcs.action_code = + mgmt->u.action.action_code = WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN; ieee80211_epcs_teardown(sdata); @@ -11246,7 +11237,7 @@ void ieee80211_process_epcs_ena_resp(struct ieee80211_sub_if_data *sdata, return; /* Handle dialog token and status code */ - pos = mgmt->u.action.u.epcs.variable; + pos = mgmt->u.action.epcs.variable; dialog_token = *pos; status_code = get_unaligned_le16(pos + 1); @@ -11268,8 +11259,7 @@ void ieee80211_process_epcs_ena_resp(struct ieee80211_sub_if_data *sdata, return; pos += IEEE80211_EPCS_ENA_RESP_BODY_LEN; - ies_len = len - offsetof(struct ieee80211_mgmt, - u.action.u.epcs.variable) - + ies_len = len - IEEE80211_MIN_ACTION_SIZE(epcs) - IEEE80211_EPCS_ENA_RESP_BODY_LEN; elems = ieee802_11_parse_elems(pos, ies_len, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 6c4b549444c6..3bd046bebf9e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -274,7 +274,7 @@ static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata, if (!sdata) return; - BUILD_BUG_ON(sizeof(action) != IEEE80211_MIN_ACTION_SIZE + 1); + BUILD_BUG_ON(sizeof(action) != IEEE80211_MIN_ACTION_SIZE(action_code)); if (skb->len < rtap_space + sizeof(action) + VHT_MUMIMO_GROUPS_DATA_LEN) @@ -1162,7 +1162,7 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) u8 category; /* make sure category field is present */ - if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE) + if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE(category)) return RX_DROP_U_RUNT_ACTION; mgmt = (struct ieee80211_mgmt *)hdr; @@ -3422,7 +3422,7 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata, return; } - if (len < 24 + 1 + sizeof(resp->u.action.u.sa_query)) { + if (len < IEEE80211_MIN_ACTION_SIZE(sa_query)) { /* Too short SA Query request frame */ return; } @@ -3432,17 +3432,16 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata, return; skb_reserve(skb, local->hw.extra_tx_headroom); - resp = skb_put_zero(skb, 24); + resp = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE(sa_query)); memcpy(resp->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(resp->sa, sdata->vif.addr, ETH_ALEN); memcpy(resp->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); resp->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); - skb_put(skb, 1 + sizeof(resp->u.action.u.sa_query)); resp->u.action.category = WLAN_CATEGORY_SA_QUERY; - resp->u.action.u.sa_query.action = WLAN_ACTION_SA_QUERY_RESPONSE; - memcpy(resp->u.action.u.sa_query.trans_id, - mgmt->u.action.u.sa_query.trans_id, + resp->u.action.action_code = WLAN_ACTION_SA_QUERY_RESPONSE; + memcpy(resp->u.action.sa_query.trans_id, + mgmt->u.action.sa_query.trans_id, WLAN_SA_QUERY_TR_ID_LEN); ieee80211_tx_skb(sdata, skb); @@ -3516,7 +3515,7 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) /* drop too small action frames */ if (ieee80211_is_action(mgmt->frame_control) && - rx->skb->len < IEEE80211_MIN_ACTION_SIZE) + rx->skb->len < IEEE80211_MIN_ACTION_SIZE(category)) return RX_DROP_U_RUNT_ACTION; /* Drop non-broadcast Beacon frames */ @@ -3565,29 +3564,28 @@ ieee80211_process_rx_twt_action(struct ieee80211_rx_data *rx) if (!rx->sta) return false; - switch (mgmt->u.action.u.s1g.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_S1G_TWT_SETUP: { struct ieee80211_twt_setup *twt; - if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE + - 1 + /* action code */ + if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE(action_code) + sizeof(struct ieee80211_twt_setup) + 2 /* TWT req_type agrt */) break; - twt = (void *)mgmt->u.action.u.s1g.variable; + twt = (void *)mgmt->u.action.s1g.variable; if (twt->element_id != WLAN_EID_S1G_TWT) break; - if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE + - 4 + /* action code + token + tlv */ + if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE(action_code) + + 3 + /* token + tlv */ twt->length) break; return true; /* queue the frame */ } case WLAN_S1G_TWT_TEARDOWN: - if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE + 2) + if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE(action_code) + 1) break; return true; /* queue the frame */ @@ -3632,10 +3630,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) break; /* verify action & smps_control/chanwidth are present */ - if (len < IEEE80211_MIN_ACTION_SIZE + 2) + if (len < IEEE80211_MIN_ACTION_SIZE(ht_smps)) goto invalid; - switch (mgmt->u.action.u.ht_smps.action) { + switch (mgmt->u.action.action_code) { case WLAN_HT_ACTION_SMPS: { struct ieee80211_supported_band *sband; enum ieee80211_smps_mode smps_mode; @@ -3646,7 +3644,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) goto handled; /* convert to HT capability */ - switch (mgmt->u.action.u.ht_smps.smps_control) { + switch (mgmt->u.action.ht_smps.smps_control) { case WLAN_HT_SMPS_CONTROL_DISABLED: smps_mode = IEEE80211_SMPS_OFF; break; @@ -3679,7 +3677,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) goto handled; } case WLAN_HT_ACTION_NOTIFY_CHANWIDTH: { - u8 chanwidth = mgmt->u.action.u.ht_notify_cw.chanwidth; + u8 chanwidth = mgmt->u.action.ht_notify_cw.chanwidth; if (chanwidth != IEEE80211_HT_CHANWIDTH_20MHZ && chanwidth != IEEE80211_HT_CHANWIDTH_ANY) @@ -3699,7 +3697,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) break; case WLAN_CATEGORY_PUBLIC: case WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION: - if (len < IEEE80211_MIN_ACTION_SIZE + 1) + if (len < IEEE80211_MIN_ACTION_SIZE(action_code)) goto invalid; if (sdata->vif.type != NL80211_IFTYPE_STATION) break; @@ -3707,11 +3705,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) break; if (!ether_addr_equal(mgmt->bssid, sdata->deflink.u.mgd.bssid)) break; - if (mgmt->u.action.u.ext_chan_switch.action_code != + if (mgmt->u.action.action_code != WLAN_PUB_ACTION_EXT_CHANSW_ANN) break; - if (len < offsetof(struct ieee80211_mgmt, - u.action.u.ext_chan_switch.variable)) + if (len < IEEE80211_MIN_ACTION_SIZE(ext_chan_switch)) goto invalid; goto queue; case WLAN_CATEGORY_VHT: @@ -3723,18 +3720,18 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) break; /* verify action code is present */ - if (len < IEEE80211_MIN_ACTION_SIZE + 1) + if (len < IEEE80211_MIN_ACTION_SIZE(action_code)) goto invalid; - switch (mgmt->u.action.u.vht_opmode_notif.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_VHT_ACTION_OPMODE_NOTIF: { /* verify opmode is present */ - if (len < IEEE80211_MIN_ACTION_SIZE + 2) + if (len < IEEE80211_MIN_ACTION_SIZE(vht_opmode_notif)) goto invalid; goto queue; } case WLAN_VHT_ACTION_GROUPID_MGMT: { - if (len < IEEE80211_MIN_ACTION_SIZE + 25) + if (len < IEEE80211_MIN_ACTION_SIZE(vht_group_notif)) goto invalid; goto queue; } @@ -3751,23 +3748,20 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) break; /* verify action_code is present */ - if (len < IEEE80211_MIN_ACTION_SIZE + 1) + if (len < IEEE80211_MIN_ACTION_SIZE(action_code)) break; - switch (mgmt->u.action.u.addba_req.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_ACTION_ADDBA_REQ: - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.addba_req))) + if (len < IEEE80211_MIN_ACTION_SIZE(addba_req)) goto invalid; break; case WLAN_ACTION_ADDBA_RESP: - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.addba_resp))) + if (len < IEEE80211_MIN_ACTION_SIZE(addba_resp)) goto invalid; break; case WLAN_ACTION_DELBA: - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.delba))) + if (len < IEEE80211_MIN_ACTION_SIZE(delba)) goto invalid; break; default: @@ -3777,16 +3771,15 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) goto queue; case WLAN_CATEGORY_SPECTRUM_MGMT: /* verify action_code is present */ - if (len < IEEE80211_MIN_ACTION_SIZE + 1) + if (len < IEEE80211_MIN_ACTION_SIZE(action_code)) break; - switch (mgmt->u.action.u.measurement.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_ACTION_SPCT_MSR_REQ: if (status->band != NL80211_BAND_5GHZ) break; - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.measurement))) + if (len < IEEE80211_MIN_ACTION_SIZE(measurement)) break; if (sdata->vif.type != NL80211_IFTYPE_STATION) @@ -3796,8 +3789,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) goto handled; case WLAN_ACTION_SPCT_CHL_SWITCH: { u8 *bssid; - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.chan_switch))) + if (len < IEEE80211_MIN_ACTION_SIZE(chan_switch)) break; if (sdata->vif.type != NL80211_IFTYPE_STATION && @@ -3822,11 +3814,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) } break; case WLAN_CATEGORY_SELF_PROTECTED: - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.self_prot.action_code))) + if (len < IEEE80211_MIN_ACTION_SIZE(self_prot)) break; - switch (mgmt->u.action.u.self_prot.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_SP_MESH_PEERING_OPEN: case WLAN_SP_MESH_PEERING_CLOSE: case WLAN_SP_MESH_PEERING_CONFIRM: @@ -3844,8 +3835,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) } break; case WLAN_CATEGORY_MESH_ACTION: - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.mesh_action.action_code))) + if (len < IEEE80211_MIN_ACTION_SIZE(action_code)) break; if (!ieee80211_vif_is_mesh(&sdata->vif)) @@ -3855,11 +3845,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) break; goto queue; case WLAN_CATEGORY_S1G: - if (len < offsetofend(typeof(*mgmt), - u.action.u.s1g.action_code)) + if (len < IEEE80211_MIN_ACTION_SIZE(action_code)) break; - switch (mgmt->u.action.u.s1g.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_S1G_TWT_SETUP: case WLAN_S1G_TWT_TEARDOWN: if (ieee80211_process_rx_twt_action(rx)) @@ -3870,33 +3859,29 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) } break; case WLAN_CATEGORY_PROTECTED_EHT: - if (len < offsetofend(typeof(*mgmt), - u.action.u.ttlm_req.action_code)) + if (len < IEEE80211_MIN_ACTION_SIZE(action_code)) break; - switch (mgmt->u.action.u.ttlm_req.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_PROTECTED_EHT_ACTION_TTLM_REQ: if (sdata->vif.type != NL80211_IFTYPE_STATION) break; - if (len < offsetofend(typeof(*mgmt), - u.action.u.ttlm_req)) + if (len < IEEE80211_MIN_ACTION_SIZE(ttlm_req)) goto invalid; goto queue; case WLAN_PROTECTED_EHT_ACTION_TTLM_RES: if (sdata->vif.type != NL80211_IFTYPE_STATION) break; - if (len < offsetofend(typeof(*mgmt), - u.action.u.ttlm_res)) + if (len < IEEE80211_MIN_ACTION_SIZE(ttlm_res)) goto invalid; goto queue; case WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN: if (sdata->vif.type != NL80211_IFTYPE_STATION) break; - if (len < offsetofend(typeof(*mgmt), - u.action.u.ttlm_tear_down)) + if (len < IEEE80211_MIN_ACTION_SIZE(ttlm_tear_down)) goto invalid; goto queue; case WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_RESP: @@ -3906,34 +3891,29 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) /* The reconfiguration response action frame must * least one 'Status Duple' entry (3 octets) */ - if (len < - offsetofend(typeof(*mgmt), - u.action.u.ml_reconf_resp) + 3) + if (len < IEEE80211_MIN_ACTION_SIZE(ml_reconf_resp) + 3) goto invalid; goto queue; case WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_RESP: if (sdata->vif.type != NL80211_IFTYPE_STATION) break; - if (len < offsetofend(typeof(*mgmt), - u.action.u.epcs) + - IEEE80211_EPCS_ENA_RESP_BODY_LEN) + if (len < IEEE80211_MIN_ACTION_SIZE(epcs) + + IEEE80211_EPCS_ENA_RESP_BODY_LEN) goto invalid; goto queue; case WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN: if (sdata->vif.type != NL80211_IFTYPE_STATION) break; - if (len < offsetofend(typeof(*mgmt), - u.action.u.epcs)) + if (len < IEEE80211_MIN_ACTION_SIZE(epcs)) goto invalid; goto queue; case WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF: if (sdata->vif.type != NL80211_IFTYPE_AP) break; - if (len < offsetofend(typeof(*mgmt), - u.action.u.eml_omn)) + if (len < IEEE80211_MIN_ACTION_SIZE(eml_omn)) goto invalid; goto queue; default: @@ -4015,11 +3995,10 @@ ieee80211_rx_h_action_post_userspace(struct ieee80211_rx_data *rx) switch (mgmt->u.action.category) { case WLAN_CATEGORY_SA_QUERY: - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.sa_query))) + if (len < IEEE80211_MIN_ACTION_SIZE(sa_query)) break; - switch (mgmt->u.action.u.sa_query.action) { + switch (mgmt->u.action.action_code) { case WLAN_ACTION_SA_QUERY_REQUEST: if (sdata->vif.type != NL80211_IFTYPE_STATION) break; diff --git a/net/mac80211/s1g.c b/net/mac80211/s1g.c index 1f68df6e8067..297abaa6fecf 100644 --- a/net/mac80211/s1g.c +++ b/net/mac80211/s1g.c @@ -2,7 +2,7 @@ /* * S1G handling * Copyright(c) 2020 Adapt-IP - * Copyright (C) 2023 Intel Corporation + * Copyright (C) 2023, 2026 Intel Corporation */ #include #include @@ -27,14 +27,14 @@ bool ieee80211_s1g_is_twt_setup(struct sk_buff *skb) if (likely(mgmt->u.action.category != WLAN_CATEGORY_S1G)) return false; - return mgmt->u.action.u.s1g.action_code == WLAN_S1G_TWT_SETUP; + return mgmt->u.action.action_code == WLAN_S1G_TWT_SETUP; } static void ieee80211_s1g_send_twt_setup(struct ieee80211_sub_if_data *sdata, const u8 *da, const u8 *bssid, struct ieee80211_twt_setup *twt) { - int len = IEEE80211_MIN_ACTION_SIZE + 4 + twt->length; + int len = IEEE80211_MIN_ACTION_SIZE(s1g) + 3 + twt->length; struct ieee80211_local *local = sdata->local; struct ieee80211_mgmt *mgmt; struct sk_buff *skb; @@ -52,8 +52,8 @@ ieee80211_s1g_send_twt_setup(struct ieee80211_sub_if_data *sdata, const u8 *da, memcpy(mgmt->bssid, bssid, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_S1G; - mgmt->u.action.u.s1g.action_code = WLAN_S1G_TWT_SETUP; - memcpy(mgmt->u.action.u.s1g.variable, twt, 3 + twt->length); + mgmt->u.action.action_code = WLAN_S1G_TWT_SETUP; + memcpy(mgmt->u.action.s1g.variable, twt, 3 + twt->length); IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | IEEE80211_TX_INTFL_MLME_CONN_TX | @@ -71,12 +71,12 @@ ieee80211_s1g_send_twt_teardown(struct ieee80211_sub_if_data *sdata, u8 *id; skb = dev_alloc_skb(local->hw.extra_tx_headroom + - IEEE80211_MIN_ACTION_SIZE + 2); + IEEE80211_MIN_ACTION_SIZE(s1g) + 1); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); - mgmt = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE + 2); + mgmt = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE(s1g) + 1); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); memcpy(mgmt->da, da, ETH_ALEN); @@ -84,8 +84,8 @@ ieee80211_s1g_send_twt_teardown(struct ieee80211_sub_if_data *sdata, memcpy(mgmt->bssid, bssid, ETH_ALEN); mgmt->u.action.category = WLAN_CATEGORY_S1G; - mgmt->u.action.u.s1g.action_code = WLAN_S1G_TWT_TEARDOWN; - id = (u8 *)mgmt->u.action.u.s1g.variable; + mgmt->u.action.action_code = WLAN_S1G_TWT_TEARDOWN; + id = (u8 *)mgmt->u.action.s1g.variable; *id = flowid; IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | @@ -98,7 +98,7 @@ ieee80211_s1g_rx_twt_setup(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (void *)skb->data; - struct ieee80211_twt_setup *twt = (void *)mgmt->u.action.u.s1g.variable; + struct ieee80211_twt_setup *twt = (void *)mgmt->u.action.s1g.variable; struct ieee80211_twt_params *twt_agrt = (void *)twt->params; twt_agrt->req_type &= cpu_to_le16(~IEEE80211_TWT_REQTYPE_REQUEST); @@ -128,7 +128,7 @@ ieee80211_s1g_rx_twt_teardown(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data; drv_twt_teardown_request(sdata->local, sdata, &sta->sta, - mgmt->u.action.u.s1g.variable[0]); + mgmt->u.action.s1g.variable[0]); } static void @@ -136,7 +136,7 @@ ieee80211_s1g_tx_twt_setup_fail(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data; - struct ieee80211_twt_setup *twt = (void *)mgmt->u.action.u.s1g.variable; + struct ieee80211_twt_setup *twt = (void *)mgmt->u.action.s1g.variable; struct ieee80211_twt_params *twt_agrt = (void *)twt->params; u8 flowid = le16_get_bits(twt_agrt->req_type, IEEE80211_TWT_REQTYPE_FLOWID); @@ -160,7 +160,7 @@ void ieee80211_s1g_rx_twt_action(struct ieee80211_sub_if_data *sdata, if (!sta) return; - switch (mgmt->u.action.u.s1g.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_S1G_TWT_SETUP: ieee80211_s1g_rx_twt_setup(sdata, sta, skb); break; @@ -185,7 +185,7 @@ void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata, if (!sta) return; - switch (mgmt->u.action.u.s1g.action_code) { + switch (mgmt->u.action.action_code) { case WLAN_S1G_TWT_SETUP: /* process failed twt setup frames */ ieee80211_s1g_tx_twt_setup_fail(sdata, sta, skb); diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index 7422888d3640..e2eaf8d8d7ff 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu * Copyright 2007-2008, Intel Corporation * Copyright 2008, Johannes Berg - * Copyright (C) 2018, 2020, 2022-2024 Intel Corporation + * Copyright (C) 2018, 2020, 2022-2024, 2026 Intel Corporation */ #include @@ -409,35 +409,30 @@ static void ieee80211_send_refuse_measurement_request(struct ieee80211_sub_if_da struct sk_buff *skb; struct ieee80211_mgmt *msr_report; - skb = dev_alloc_skb(sizeof(*msr_report) + local->hw.extra_tx_headroom + - sizeof(struct ieee80211_msrment_ie)); + skb = dev_alloc_skb(IEEE80211_MIN_ACTION_SIZE(measurement) + + local->hw.extra_tx_headroom); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); - msr_report = skb_put_zero(skb, 24); + msr_report = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE(measurement)); memcpy(msr_report->da, da, ETH_ALEN); memcpy(msr_report->sa, sdata->vif.addr, ETH_ALEN); memcpy(msr_report->bssid, bssid, ETH_ALEN); msr_report->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); - skb_put(skb, 1 + sizeof(msr_report->u.action.u.measurement)); msr_report->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT; - msr_report->u.action.u.measurement.action_code = - WLAN_ACTION_SPCT_MSR_RPRT; - msr_report->u.action.u.measurement.dialog_token = dialog_token; + msr_report->u.action.action_code = WLAN_ACTION_SPCT_MSR_RPRT; - msr_report->u.action.u.measurement.element_id = WLAN_EID_MEASURE_REPORT; - msr_report->u.action.u.measurement.length = + msr_report->u.action.measurement.dialog_token = dialog_token; + msr_report->u.action.measurement.element_id = WLAN_EID_MEASURE_REPORT; + msr_report->u.action.measurement.length = sizeof(struct ieee80211_msrment_ie); - - memset(&msr_report->u.action.u.measurement.msr_elem, 0, - sizeof(struct ieee80211_msrment_ie)); - msr_report->u.action.u.measurement.msr_elem.token = request_ie->token; - msr_report->u.action.u.measurement.msr_elem.mode |= + msr_report->u.action.measurement.msr_elem.token = request_ie->token; + msr_report->u.action.measurement.msr_elem.mode |= IEEE80211_SPCT_MSR_RPRT_MODE_REFUSED; - msr_report->u.action.u.measurement.msr_elem.type = request_ie->type; + msr_report->u.action.measurement.msr_elem.type = request_ie->type; ieee80211_tx_skb(sdata, skb); } @@ -454,7 +449,7 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, * TODO: Answer basic measurement as unmeasured */ ieee80211_send_refuse_measurement_request(sdata, - &mgmt->u.action.u.measurement.msr_elem, + &mgmt->u.action.measurement.msr_elem, mgmt->sa, mgmt->bssid, - mgmt->u.action.u.measurement.dialog_token); + mgmt->u.action.measurement.dialog_token); } diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index dbbfe2d6842f..1f30a4eda374 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -6,7 +6,7 @@ * Copyright 2014, Intel Corporation * Copyright 2014 Intel Mobile Communications GmbH * Copyright 2015 - 2016 Intel Deutschland GmbH - * Copyright (C) 2019, 2021-2025 Intel Corporation + * Copyright (C) 2019, 2021-2026 Intel Corporation */ #include @@ -879,28 +879,23 @@ ieee80211_prep_tdls_direct(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_mgmt *mgmt; - mgmt = skb_put_zero(skb, 24); + if (action_code != WLAN_PUB_ACTION_TDLS_DISCOVER_RES) + return -EINVAL; + + mgmt = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE(tdls_discover_resp)); memcpy(mgmt->da, peer, ETH_ALEN); memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); memcpy(mgmt->bssid, link->u.mgd.bssid, ETH_ALEN); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); - switch (action_code) { - case WLAN_PUB_ACTION_TDLS_DISCOVER_RES: - skb_put(skb, 1 + sizeof(mgmt->u.action.u.tdls_discover_resp)); - mgmt->u.action.category = WLAN_CATEGORY_PUBLIC; - mgmt->u.action.u.tdls_discover_resp.action_code = - WLAN_PUB_ACTION_TDLS_DISCOVER_RES; - mgmt->u.action.u.tdls_discover_resp.dialog_token = - dialog_token; - mgmt->u.action.u.tdls_discover_resp.capability = - cpu_to_le16(ieee80211_get_tdls_sta_capab(link, - status_code)); - break; - default: - return -EINVAL; - } + mgmt->u.action.category = WLAN_CATEGORY_PUBLIC; + mgmt->u.action.action_code = WLAN_PUB_ACTION_TDLS_DISCOVER_RES; + + mgmt->u.action.tdls_discover_resp.dialog_token = dialog_token; + mgmt->u.action.tdls_discover_resp.capability = + cpu_to_le16(ieee80211_get_tdls_sta_capab(link, + status_code)); return 0; } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index b2e6c8b98381..55054de62508 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3766,12 +3766,11 @@ again: int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings) { + int hdr_len = IEEE80211_MIN_ACTION_SIZE(chan_switch); struct sk_buff *skb; struct ieee80211_mgmt *mgmt; struct ieee80211_local *local = sdata->local; int freq; - int hdr_len = offsetofend(struct ieee80211_mgmt, - u.action.u.chan_switch); u8 *pos; if (sdata->vif.type != NL80211_IFTYPE_ADHOC && @@ -3800,7 +3799,7 @@ int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata, memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN); } mgmt->u.action.category = WLAN_CATEGORY_SPECTRUM_MGMT; - mgmt->u.action.u.chan_switch.action_code = WLAN_ACTION_SPCT_CHL_SWITCH; + mgmt->u.action.action_code = WLAN_ACTION_SPCT_CHL_SWITCH; pos = skb_put(skb, 5); *pos++ = WLAN_EID_CHANNEL_SWITCH; /* EID */ *pos++ = 3; /* IE length */ diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index b099d79e8fbb..80120f9f17b6 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -4,7 +4,7 @@ * * Portions of this file * Copyright(c) 2015 - 2016 Intel Deutschland GmbH - * Copyright (C) 2018 - 2024 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include @@ -723,17 +723,17 @@ void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata, if (!link_conf->mu_mimo_owner) return; - if (!memcmp(mgmt->u.action.u.vht_group_notif.position, + if (!memcmp(mgmt->u.action.vht_group_notif.position, link_conf->mu_group.position, WLAN_USER_POSITION_LEN) && - !memcmp(mgmt->u.action.u.vht_group_notif.membership, + !memcmp(mgmt->u.action.vht_group_notif.membership, link_conf->mu_group.membership, WLAN_MEMBERSHIP_LEN)) return; memcpy(link_conf->mu_group.membership, - mgmt->u.action.u.vht_group_notif.membership, + mgmt->u.action.vht_group_notif.membership, WLAN_MEMBERSHIP_LEN); memcpy(link_conf->mu_group.position, - mgmt->u.action.u.vht_group_notif.position, + mgmt->u.action.vht_group_notif.position, WLAN_USER_POSITION_LEN); ieee80211_link_info_change_notify(sdata, link, -- cgit v1.2.3 From 9aa84d5c6c99480c523aeb7a6ce93b6635f3e290 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 4 Mar 2026 14:41:48 +0100 Subject: wifi: ieee80211: fix UHR operation DBE vs. P-EDCA order Draft P802.11bn_D1.3 switched the order here to align with the order of the fields. Adjust the code accordingly. Link: https://patch.msgid.link/20260304144148.ce45942294e1.I22ab3f16e6376a19c3953cf81dd67105ea8e529d@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-uhr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211-uhr.h b/include/linux/ieee80211-uhr.h index 9729d23e4766..d199f3ebdba0 100644 --- a/include/linux/ieee80211-uhr.h +++ b/include/linux/ieee80211-uhr.h @@ -12,8 +12,8 @@ #define IEEE80211_UHR_OPER_PARAMS_DPS_ENA 0x0001 #define IEEE80211_UHR_OPER_PARAMS_NPCA_ENA 0x0002 -#define IEEE80211_UHR_OPER_PARAMS_DBE_ENA 0x0004 -#define IEEE80211_UHR_OPER_PARAMS_PEDCA_ENA 0x0008 +#define IEEE80211_UHR_OPER_PARAMS_PEDCA_ENA 0x0004 +#define IEEE80211_UHR_OPER_PARAMS_DBE_ENA 0x0008 struct ieee80211_uhr_operation { __le16 params; -- cgit v1.2.3 From 98acd4c1d9f7dc9c426e840c16e81b57315ff84b Mon Sep 17 00:00:00 2001 From: Ria Thomas Date: Thu, 5 Mar 2026 14:43:04 +0530 Subject: wifi: mac80211: add support for NDP ADDBA/DELBA for S1G S1G defines use of NDP Block Ack (BA) for aggregation, requiring negotiation of NDP ADDBA/DELBA action frames. If the S1G recipient supports HT-immediate block ack, the sender must send an NDP ADDBA Request indicating it expects only NDP BlockAck frames for the agreement. Introduce support for NDP ADDBA and DELBA exchange in mac80211. The implementation negotiates the BA mechanism during setup based on station capabilities and driver support (IEEE80211_HW_SUPPORTS_NDP_BLOCKACK). If negotiation fails due to mismatched expectations, a rejection with status code WLAN_STATUS_REJECTED_NDP_BLOCK_ACK_SUGGESTED is returned as per IEEE 802.11-2024. Trace sample: IEEE 802.11 Wireless Management Fixed parameters Category code: Block Ack (3) Action code: NDP ADDBA Request (0x80) Dialog token: 0x01 Block Ack Parameters: 0x1003, A-MSDUs, Block Ack Policy .... .... .... ...1 = A-MSDUs: Permitted in QoS Data MPDUs .... .... .... ..1. = Block Ack Policy: Immediate Block Ack .... .... ..00 00.. = Traffic Identifier: 0x0 0001 0000 00.. .... = Number of Buffers (1 Buffer = 2304 Bytes): 64 Block Ack Timeout: 0x0000 Block Ack Starting Sequence Control (SSC): 0x0010 .... .... .... 0000 = Fragment: 0 0000 0000 0001 .... = Starting Sequence Number: 1 IEEE 802.11 Wireless Management Fixed parameters Category code: Block Ack (3) Action code: NDP ADDBA Response (0x81) Dialog token: 0x02 Status code: BlockAck negotiation refused because, due to buffer constraints and other unspecified reasons, the recipient prefers to generate only NDP BlockAck frames (0x006d) Block Ack Parameters: 0x1002, Block Ack Policy .... .... .... ...0 = A-MSDUs: Not Permitted .... .... .... ..1. = Block Ack Policy: Immediate Block Ack .... .... ..00 00.. = Traffic Identifier: 0x0 0001 0000 00.. .... = Number of Buffers (1 Buffer = 2304 Bytes): 64 Block Ack Timeout: 0x0000 Signed-off-by: Ria Thomas Link: https://patch.msgid.link/20260305091304.310990-1-ria.thomas@morsemicro.com Signed-off-by: Johannes Berg --- include/linux/ieee80211-ht.h | 3 +++ include/linux/ieee80211.h | 2 ++ include/net/mac80211.h | 4 ++++ net/mac80211/agg-rx.c | 24 +++++++++++++++++++++--- net/mac80211/agg-tx.c | 13 +++++++++---- net/mac80211/debugfs.c | 1 + net/mac80211/ht.c | 8 +++++--- net/mac80211/ieee80211_i.h | 6 +++++- net/mac80211/iface.c | 3 +++ net/mac80211/rx.c | 11 +++++++++-- net/mac80211/s1g.c | 8 ++++++++ net/mac80211/sta_info.h | 2 ++ 12 files changed, 72 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211-ht.h b/include/linux/ieee80211-ht.h index 21bbf470540f..7612b72f9c7c 100644 --- a/include/linux/ieee80211-ht.h +++ b/include/linux/ieee80211-ht.h @@ -281,6 +281,9 @@ enum ieee80211_back_actioncode { WLAN_ACTION_ADDBA_REQ = 0, WLAN_ACTION_ADDBA_RESP = 1, WLAN_ACTION_DELBA = 2, + WLAN_ACTION_NDP_ADDBA_REQ = 128, + WLAN_ACTION_NDP_ADDBA_RESP = 129, + WLAN_ACTION_NDP_DELBA = 130, }; /* BACK (block-ack) parties */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index aea360e90cb1..52db36120314 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1482,6 +1482,8 @@ enum ieee80211_statuscode { WLAN_STATUS_REJECT_DSE_BAND = 96, WLAN_STATUS_DENIED_WITH_SUGGESTED_BAND_AND_CHANNEL = 99, WLAN_STATUS_DENIED_DUE_TO_SPECTRUM_MANAGEMENT = 103, + /* 802.11ah */ + WLAN_STATUS_REJECTED_NDP_BLOCK_ACK_SUGGESTED = 109, /* 802.11ai */ WLAN_STATUS_FILS_AUTHENTICATION_FAILURE = 112, WLAN_STATUS_UNKNOWN_AUTHENTICATION_SERVER = 113, diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 9f8251fb9832..9cc482191ab9 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2913,6 +2913,9 @@ struct ieee80211_txq { * HW flag so drivers can opt in according to their own control, e.g. in * testing. * + * @IEEE80211_HW_SUPPORTS_NDP_BLOCKACK: HW can transmit/receive S1G NDP + * BlockAck frames. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2973,6 +2976,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_DISALLOW_PUNCTURING, IEEE80211_HW_HANDLES_QUIET_CSA, IEEE80211_HW_STRICT, + IEEE80211_HW_SUPPORTS_NDP_BLOCKACK, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 0a2be8cb600f..0140ac826b23 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -94,7 +94,8 @@ void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, /* check if this is a self generated aggregation halt */ if (initiator == WLAN_BACK_RECIPIENT && tx) ieee80211_send_delba(sta->sdata, sta->sta.addr, - tid, WLAN_BACK_RECIPIENT, reason); + tid, WLAN_BACK_RECIPIENT, reason, + ieee80211_s1g_use_ndp_ba(sta->sdata, sta)); /* * return here in case tid_rx is not assigned - which will happen if @@ -240,6 +241,7 @@ static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid, struct sk_buff *skb; struct ieee80211_mgmt *mgmt; bool amsdu = ieee80211_hw_check(&local->hw, SUPPORTS_AMSDU_IN_AMPDU); + bool use_ndp = ieee80211_s1g_use_ndp_ba(sdata, sta); u16 capab; skb = dev_alloc_skb(sizeof(*mgmt) + @@ -253,7 +255,8 @@ static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid, skb_put(skb, 2 + sizeof(mgmt->u.action.addba_resp)); mgmt->u.action.category = WLAN_CATEGORY_BACK; - mgmt->u.action.action_code = WLAN_ACTION_ADDBA_RESP; + mgmt->u.action.action_code = use_ndp ? + WLAN_ACTION_NDP_ADDBA_RESP : WLAN_ACTION_ADDBA_RESP; mgmt->u.action.addba_resp.dialog_token = dialog_token; @@ -276,6 +279,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, u16 buf_size, bool tx, bool auto_seq, + bool req_ndp, const u8 addba_ext_data) { struct ieee80211_local *local = sta->sdata->local; @@ -301,6 +305,18 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, goto end; } + if (tx && ieee80211_s1g_use_ndp_ba(sta->sdata, sta) && !req_ndp) { + /* + * According to IEEE 802.11-2024: Inform S1G originator + * ADDBA rejected as NDP BlockAck is preferred + */ + status = WLAN_STATUS_REJECTED_NDP_BLOCK_ACK_SUGGESTED; + ht_dbg(sta->sdata, + "Rejecting AddBA Req from %pM tid %u - require NDP BlockAck\n", + sta->sta.addr, tid); + goto end; + } + if (!sta->sta.valid_links && !sta->sta.deflink.ht_cap.ht_supported && !sta->sta.deflink.he_cap.has_he && @@ -474,6 +490,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, struct ieee80211_mgmt *mgmt, size_t len) { + bool req_ndp = mgmt->u.action.action_code == WLAN_ACTION_NDP_ADDBA_REQ; u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num; u8 dialog_token, addba_ext_data; @@ -498,7 +515,8 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, __ieee80211_start_rx_ba_session(sta, dialog_token, timeout, start_seq_num, ba_policy, tid, - buf_size, true, false, addba_ext_data); + buf_size, true, false, + req_ndp, addba_ext_data); } void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index d5a62b8d5a80..01d927b88264 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -60,7 +60,7 @@ static void ieee80211_send_addba_request(struct sta_info *sta, u16 tid, u8 dialog_token, u16 start_seq_num, - u16 agg_size, u16 timeout) + u16 agg_size, u16 timeout, bool ndp) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; @@ -80,7 +80,8 @@ static void ieee80211_send_addba_request(struct sta_info *sta, u16 tid, skb_put(skb, 2 + sizeof(mgmt->u.action.addba_req)); mgmt->u.action.category = WLAN_CATEGORY_BACK; - mgmt->u.action.action_code = WLAN_ACTION_ADDBA_REQ; + mgmt->u.action.action_code = ndp ? + WLAN_ACTION_NDP_ADDBA_REQ : WLAN_ACTION_ADDBA_REQ; mgmt->u.action.addba_req.dialog_token = dialog_token; capab = IEEE80211_ADDBA_PARAM_AMSDU_MASK; @@ -484,7 +485,8 @@ static void ieee80211_send_addba_with_timeout(struct sta_info *sta, /* send AddBA request */ ieee80211_send_addba_request(sta, tid, tid_tx->dialog_token, - tid_tx->ssn, buf_size, tid_tx->timeout); + tid_tx->ssn, buf_size, tid_tx->timeout, + tid_tx->ndp); WARN_ON(test_and_set_bit(HT_AGG_STATE_SENT_ADDBA, &tid_tx->state)); } @@ -521,6 +523,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) */ synchronize_net(); + tid_tx->ndp = ieee80211_s1g_use_ndp_ba(sdata, sta); params.ssn = sta->tid_seq[tid] >> 4; ret = drv_ampdu_action(local, sdata, ¶ms); tid_tx->ssn = params.ssn; @@ -940,7 +943,9 @@ void ieee80211_stop_tx_ba_cb(struct sta_info *sta, int tid, if (send_delba) ieee80211_send_delba(sdata, sta->sta.addr, tid, - WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE); + WLAN_BACK_INITIATOR, + WLAN_REASON_QSTA_NOT_USE, + tid_tx->ndp); } void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index d02f07368c51..e8d0a8b71d59 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -490,6 +490,7 @@ static const char *hw_flag_names[] = { FLAG(DISALLOW_PUNCTURING), FLAG(HANDLES_QUIET_CSA), FLAG(STRICT), + FLAG(SUPPORTS_NDP_BLOCKACK), #undef FLAG }; diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 9e2469a8ce64..33f1e1b235e9 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -379,7 +379,7 @@ void ieee80211_ba_session_work(struct wiphy *wiphy, struct wiphy_work *work) sta->ampdu_mlme.tid_rx_manage_offl)) __ieee80211_start_rx_ba_session(sta, 0, 0, 0, 1, tid, IEEE80211_MAX_AMPDU_BUF_HT, - false, true, 0); + false, true, false, 0); if (test_and_clear_bit(tid + IEEE80211_NUM_TIDS, sta->ampdu_mlme.tid_rx_manage_offl)) @@ -455,7 +455,8 @@ void ieee80211_ba_session_work(struct wiphy *wiphy, struct wiphy_work *work) void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid, - u16 initiator, u16 reason_code) + u16 initiator, u16 reason_code, + bool use_ndp) { struct ieee80211_local *local = sdata->local; struct sk_buff *skb; @@ -473,7 +474,8 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, skb_put(skb, 2 + sizeof(mgmt->u.action.delba)); mgmt->u.action.category = WLAN_CATEGORY_BACK; - mgmt->u.action.action_code = WLAN_ACTION_DELBA; + mgmt->u.action.action_code = use_ndp ? + WLAN_ACTION_NDP_DELBA : WLAN_ACTION_DELBA; params = (u16)(initiator << 11); /* bit 11 initiator */ params |= (u16)(tid << 12); /* bit 15:12 TID number */ diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index a4babf7624e5..d71e0c6d2165 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2190,7 +2190,8 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata, struct link_sta_info *link_sta); void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, const u8 *da, u16 tid, - u16 initiator, u16 reason_code); + u16 initiator, u16 reason_code, + bool use_ndp); int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps, const u8 *da, const u8 *bssid, int link_id); @@ -2206,6 +2207,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, u8 dialog_token, u16 timeout, u16 start_seq_num, u16 ba_policy, u16 tid, u16 buf_size, bool tx, bool auto_seq, + bool req_ndp, const u8 addba_ext_data); void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, enum ieee80211_agg_stop_reason reason); @@ -2331,6 +2333,8 @@ void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata, void ieee80211_s1g_cap_to_sta_s1g_cap(struct ieee80211_sub_if_data *sdata, const struct ieee80211_s1g_cap *s1g_cap_ie, struct link_sta_info *link_sta); +bool ieee80211_s1g_use_ndp_ba(const struct ieee80211_sub_if_data *sdata, + const struct sta_info *sta); /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 2e391cec73a0..40ce0bb72726 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1581,14 +1581,17 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, if (sta) { switch (mgmt->u.action.action_code) { case WLAN_ACTION_ADDBA_REQ: + case WLAN_ACTION_NDP_ADDBA_REQ: ieee80211_process_addba_request(local, sta, mgmt, len); break; case WLAN_ACTION_ADDBA_RESP: + case WLAN_ACTION_NDP_ADDBA_RESP: ieee80211_process_addba_resp(local, sta, mgmt, len); break; case WLAN_ACTION_DELBA: + case WLAN_ACTION_NDP_DELBA: ieee80211_process_delba(sdata, sta, mgmt, len); break; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3bd046bebf9e..19c33f7a8193 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1475,7 +1475,9 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx, !test_and_set_bit(tid, rx->sta->ampdu_mlme.unexpected_agg)) ieee80211_send_delba(rx->sdata, rx->sta->sta.addr, tid, WLAN_BACK_RECIPIENT, - WLAN_REASON_QSTA_REQUIRE_SETUP); + WLAN_REASON_QSTA_REQUIRE_SETUP, + ieee80211_s1g_use_ndp_ba(rx->sdata, + rx->sta)); goto dont_reorder; } @@ -3372,7 +3374,9 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) !test_and_set_bit(tid, rx->sta->ampdu_mlme.unexpected_agg)) ieee80211_send_delba(rx->sdata, rx->sta->sta.addr, tid, WLAN_BACK_RECIPIENT, - WLAN_REASON_QSTA_REQUIRE_SETUP); + WLAN_REASON_QSTA_REQUIRE_SETUP, + ieee80211_s1g_use_ndp_ba(rx->sdata, + rx->sta)); tid_agg_rx = rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]); if (!tid_agg_rx) @@ -3753,14 +3757,17 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) switch (mgmt->u.action.action_code) { case WLAN_ACTION_ADDBA_REQ: + case WLAN_ACTION_NDP_ADDBA_REQ: if (len < IEEE80211_MIN_ACTION_SIZE(addba_req)) goto invalid; break; case WLAN_ACTION_ADDBA_RESP: + case WLAN_ACTION_NDP_ADDBA_RESP: if (len < IEEE80211_MIN_ACTION_SIZE(addba_resp)) goto invalid; break; case WLAN_ACTION_DELBA: + case WLAN_ACTION_NDP_DELBA: if (len < IEEE80211_MIN_ACTION_SIZE(delba)) goto invalid; break; diff --git a/net/mac80211/s1g.c b/net/mac80211/s1g.c index 297abaa6fecf..5af4a0c6c642 100644 --- a/net/mac80211/s1g.c +++ b/net/mac80211/s1g.c @@ -220,3 +220,11 @@ void ieee80211_s1g_cap_to_sta_s1g_cap(struct ieee80211_sub_if_data *sdata, ieee80211_sta_recalc_aggregates(&link_sta->sta->sta); } + +bool ieee80211_s1g_use_ndp_ba(const struct ieee80211_sub_if_data *sdata, + const struct sta_info *sta) +{ + return sdata->vif.cfg.s1g && + ieee80211_hw_check(&sdata->local->hw, SUPPORTS_NDP_BLOCKACK) && + (sta && sta->sta.deflink.s1g_cap.s1g); +} diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index f1b1bbf2a2d4..58ccbea7f6f6 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -171,6 +171,7 @@ struct sta_info; * @bar_pending: BAR needs to be re-sent * @amsdu: support A-MSDU within A-MDPU * @ssn: starting sequence number of the session + * @ndp: this session is using NDP Block ACKs * * This structure's lifetime is managed by RCU, assignments to * the array holding it must hold the aggregation mutex. @@ -199,6 +200,7 @@ struct tid_ampdu_tx { u16 failed_bar_ssn; bool bar_pending; bool amsdu; + bool ndp; u8 tid; }; -- cgit v1.2.3 From 96fefcabf340fcf8b3208dcd8685961955a66040 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Mar 2026 10:32:31 -0500 Subject: vfs: widen inode hash/lookup functions to u64 Change the inode hash/lookup VFS API functions to accept u64 parameters instead of unsigned long for inode numbers and hash values. This is preparation for widening i_ino itself to u64, which will allow filesystems to store full 64-bit inode numbers on 32-bit architectures. Since unsigned long implicitly widens to u64 on all architectures, this change is backward-compatible with all existing callers. Reviewed-by: Jan Kara Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260304-iino-u64-v3-1-2257ad83d372@kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/f2fs/node.c | 2 +- fs/inode.c | 36 ++++++++++++++++++------------------ include/linux/fs.h | 46 ++++++++++++++++++++++------------------------ 3 files changed, 41 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 74992fd9c9b6..5ca6f518cfa1 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1997,7 +1997,7 @@ out: return ret; } -static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data) +static int f2fs_match_ino(struct inode *inode, u64 ino, void *data) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); bool clean; diff --git a/fs/inode.c b/fs/inode.c index cc12b68e021b..62df5dda0589 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -672,7 +672,7 @@ static inline void inode_sb_list_del(struct inode *inode) } } -static unsigned long hash(struct super_block *sb, unsigned long hashval) +static unsigned long hash(struct super_block *sb, u64 hashval) { unsigned long tmp; @@ -685,12 +685,12 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) /** * __insert_inode_hash - hash an inode * @inode: unhashed inode - * @hashval: unsigned long value used to locate this object in the + * @hashval: u64 value used to locate this object in the * inode_hashtable. * * Add an inode to the inode hash for this superblock. */ -void __insert_inode_hash(struct inode *inode, unsigned long hashval) +void __insert_inode_hash(struct inode *inode, u64 hashval) { struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); @@ -1087,7 +1087,7 @@ repeat: * iget_locked for details. */ static struct inode *find_inode_fast(struct super_block *sb, - struct hlist_head *head, unsigned long ino, + struct hlist_head *head, u64 ino, bool hash_locked, bool *isnew) { struct inode *inode = NULL; @@ -1301,7 +1301,7 @@ EXPORT_SYMBOL(unlock_two_nondirectories); * Note that both @test and @set are called with the inode_hash_lock held, so * they can't sleep. */ -struct inode *inode_insert5(struct inode *inode, unsigned long hashval, +struct inode *inode_insert5(struct inode *inode, u64 hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { @@ -1378,7 +1378,7 @@ EXPORT_SYMBOL(inode_insert5); * Note that both @test and @set are called with the inode_hash_lock held, so * they can't sleep. */ -struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, +struct inode *iget5_locked(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { @@ -1408,7 +1408,7 @@ EXPORT_SYMBOL(iget5_locked); * This is equivalent to iget5_locked, except the @test callback must * tolerate the inode not being stable, including being mid-teardown. */ -struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval, +struct inode *iget5_locked_rcu(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { @@ -1455,7 +1455,7 @@ EXPORT_SYMBOL_GPL(iget5_locked_rcu); * hashed, and with the I_NEW flag set. The file system gets to fill it in * before unlocking it via unlock_new_inode(). */ -struct inode *iget_locked(struct super_block *sb, unsigned long ino) +struct inode *iget_locked(struct super_block *sb, u64 ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; @@ -1527,7 +1527,7 @@ EXPORT_SYMBOL(iget_locked); * * Returns 1 if the inode number is unique, 0 if it is not. */ -static int test_inode_iunique(struct super_block *sb, unsigned long ino) +static int test_inode_iunique(struct super_block *sb, u64 ino) { struct hlist_head *b = inode_hashtable + hash(sb, ino); struct inode *inode; @@ -1616,7 +1616,7 @@ EXPORT_SYMBOL(igrab); * * Note2: @test is called with the inode_hash_lock held, so can't sleep. */ -struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, +struct inode *ilookup5_nowait(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), void *data, bool *isnew) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); @@ -1647,7 +1647,7 @@ EXPORT_SYMBOL(ilookup5_nowait); * * Note: @test is called with the inode_hash_lock held, so can't sleep. */ -struct inode *ilookup5(struct super_block *sb, unsigned long hashval, +struct inode *ilookup5(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), void *data) { struct inode *inode; @@ -1677,7 +1677,7 @@ EXPORT_SYMBOL(ilookup5); * Search for the inode @ino in the inode cache, and if the inode is in the * cache, the inode is returned with an incremented reference count. */ -struct inode *ilookup(struct super_block *sb, unsigned long ino) +struct inode *ilookup(struct super_block *sb, u64 ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; @@ -1726,8 +1726,8 @@ EXPORT_SYMBOL(ilookup); * very carefully implemented. */ struct inode *find_inode_nowait(struct super_block *sb, - unsigned long hashval, - int (*match)(struct inode *, unsigned long, + u64 hashval, + int (*match)(struct inode *, u64, void *), void *data) { @@ -1773,7 +1773,7 @@ EXPORT_SYMBOL(find_inode_nowait); * * The caller must hold the RCU read lock. */ -struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, +struct inode *find_inode_rcu(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), void *data) { struct hlist_head *head = inode_hashtable + hash(sb, hashval); @@ -1812,7 +1812,7 @@ EXPORT_SYMBOL(find_inode_rcu); * The caller must hold the RCU read lock. */ struct inode *find_inode_by_ino_rcu(struct super_block *sb, - unsigned long ino) + u64 ino) { struct hlist_head *head = inode_hashtable + hash(sb, ino); struct inode *inode; @@ -1833,7 +1833,7 @@ EXPORT_SYMBOL(find_inode_by_ino_rcu); int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; - ino_t ino = inode->i_ino; + u64 ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); bool isnew; @@ -1884,7 +1884,7 @@ repeat: } EXPORT_SYMBOL(insert_inode_locked); -int insert_inode_locked4(struct inode *inode, unsigned long hashval, +int insert_inode_locked4(struct inode *inode, u64 hashval, int (*test)(struct inode *, void *), void *data) { struct inode *old; diff --git a/include/linux/fs.h b/include/linux/fs.h index 8b3dd145b25e..b4f5e5fdbe4b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2934,33 +2934,31 @@ static inline int inode_generic_drop(struct inode *inode) } extern void d_mark_dontcache(struct inode *inode); -extern struct inode *ilookup5_nowait(struct super_block *sb, - unsigned long hashval, int (*test)(struct inode *, void *), - void *data, bool *isnew); -extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, - int (*test)(struct inode *, void *), void *data); -extern struct inode *ilookup(struct super_block *sb, unsigned long ino); - -extern struct inode *inode_insert5(struct inode *inode, unsigned long hashval, - int (*test)(struct inode *, void *), - int (*set)(struct inode *, void *), - void *data); -struct inode *iget5_locked(struct super_block *, unsigned long, +struct inode *ilookup5_nowait(struct super_block *sb, u64 hashval, + int (*test)(struct inode *, void *), void *data, + bool *isnew); +struct inode *ilookup5(struct super_block *sb, u64 hashval, + int (*test)(struct inode *, void *), void *data); +struct inode *ilookup(struct super_block *sb, u64 ino); + +struct inode *inode_insert5(struct inode *inode, u64 hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data); +struct inode *iget5_locked(struct super_block *, u64, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); -struct inode *iget5_locked_rcu(struct super_block *, unsigned long, +struct inode *iget5_locked_rcu(struct super_block *, u64, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); -extern struct inode * iget_locked(struct super_block *, unsigned long); -extern struct inode *find_inode_nowait(struct super_block *, - unsigned long, - int (*match)(struct inode *, - unsigned long, void *), - void *data); -extern struct inode *find_inode_rcu(struct super_block *, unsigned long, - int (*)(struct inode *, void *), void *); -extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long); -extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); +struct inode *iget_locked(struct super_block *, u64); +struct inode *find_inode_nowait(struct super_block *, u64, + int (*match)(struct inode *, u64, void *), + void *data); +struct inode *find_inode_rcu(struct super_block *, u64, + int (*)(struct inode *, void *), void *); +struct inode *find_inode_by_ino_rcu(struct super_block *, u64); +int insert_inode_locked4(struct inode *, u64, + int (*test)(struct inode *, void *), void *); extern int insert_inode_locked(struct inode *); #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void lockdep_annotate_inode_mutex_key(struct inode *inode); @@ -3015,7 +3013,7 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap, */ #define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp) -extern void __insert_inode_hash(struct inode *, unsigned long hashval); +void __insert_inode_hash(struct inode *, u64 hashval); static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); -- cgit v1.2.3 From 125dfa218134df7cc112667e92984de9d8cd0bf6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Mar 2026 10:32:32 -0500 Subject: audit: widen ino fields to u64 inode->i_ino is being widened from unsigned long to u64. The audit subsystem uses unsigned long ino in struct fields, function parameters, and local variables that store inode numbers from arbitrary filesystems. On 32-bit platforms this truncates inode numbers that exceed 32 bits, which will cause incorrect audit log entries and broken watch/mark comparisons. Widen all audit ino fields, parameters, and locals to u64, and update the inode format string from %lu to %llu to match. Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260304-iino-u64-v3-2-2257ad83d372@kernel.org Acked-by: Paul Moore Signed-off-by: Christian Brauner --- include/linux/audit.h | 2 +- kernel/audit.h | 13 ++++++------- kernel/audit_fsnotify.c | 4 ++-- kernel/audit_watch.c | 12 ++++++------ kernel/auditsc.c | 4 ++-- 5 files changed, 17 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index b642b5faca65..b915aaa7ed73 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -15,7 +15,7 @@ #include #include -#define AUDIT_INO_UNSET ((unsigned long)-1) +#define AUDIT_INO_UNSET ((u64)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) struct audit_sig_info { diff --git a/kernel/audit.h b/kernel/audit.h index 7c401729e21b..ac81fa02bcd7 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -76,7 +76,7 @@ struct audit_names { int name_len; /* number of chars to log */ bool hidden; /* don't log this record */ - unsigned long ino; + u64 ino; dev_t dev; umode_t mode; kuid_t uid; @@ -225,9 +225,9 @@ extern int auditd_test_task(struct task_struct *task); #define AUDIT_INODE_BUCKETS 32 extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; -static inline int audit_hash_ino(u32 ino) +static inline int audit_hash_ino(u64 ino) { - return (ino & (AUDIT_INODE_BUCKETS-1)); + return ((u32)ino & (AUDIT_INODE_BUCKETS-1)); } /* Indicates that audit should log the full pathname. */ @@ -277,16 +277,15 @@ extern int audit_to_watch(struct audit_krule *krule, char *path, int len, extern int audit_add_watch(struct audit_krule *krule, struct list_head **list); extern void audit_remove_watch_rule(struct audit_krule *krule); extern char *audit_watch_path(struct audit_watch *watch); -extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, - dev_t dev); +extern int audit_watch_compare(struct audit_watch *watch, u64 ino, dev_t dev); extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len); extern char *audit_mark_path(struct audit_fsnotify_mark *mark); extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark); extern void audit_remove_mark_rule(struct audit_krule *krule); -extern int audit_mark_compare(struct audit_fsnotify_mark *mark, - unsigned long ino, dev_t dev); +extern int audit_mark_compare(struct audit_fsnotify_mark *mark, u64 ino, + dev_t dev); extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old); extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index a4401f651060..711454f9f724 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -25,7 +25,7 @@ */ struct audit_fsnotify_mark { dev_t dev; /* associated superblock device */ - unsigned long ino; /* associated inode number */ + u64 ino; /* associated inode number */ char *path; /* insertion path */ struct fsnotify_mark mark; /* fsnotify mark on the inode */ struct audit_krule *rule; @@ -57,7 +57,7 @@ char *audit_mark_path(struct audit_fsnotify_mark *mark) return mark->path; } -int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev) +int audit_mark_compare(struct audit_fsnotify_mark *mark, u64 ino, dev_t dev) { if (mark->ino == AUDIT_INO_UNSET) return 0; diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 096faac2435c..33577f0f54ef 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -37,7 +37,7 @@ struct audit_watch { refcount_t count; /* reference count */ dev_t dev; /* associated superblock device */ char *path; /* insertion path */ - unsigned long ino; /* associated inode number */ + u64 ino; /* associated inode number */ struct audit_parent *parent; /* associated parent */ struct list_head wlist; /* entry in parent->watches list */ struct list_head rules; /* anchor for krule->rlist */ @@ -125,7 +125,7 @@ char *audit_watch_path(struct audit_watch *watch) return watch->path; } -int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) +int audit_watch_compare(struct audit_watch *watch, u64 ino, dev_t dev) { return (watch->ino != AUDIT_INO_UNSET) && (watch->ino == ino) && @@ -244,7 +244,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc /* Update inode info in audit rules based on filesystem event. */ static void audit_update_watch(struct audit_parent *parent, const struct qstr *dname, dev_t dev, - unsigned long ino, unsigned invalidating) + u64 ino, unsigned invalidating) { struct audit_watch *owatch, *nwatch, *nextw; struct audit_krule *r, *nextr; @@ -285,7 +285,7 @@ static void audit_update_watch(struct audit_parent *parent, list_del(&oentry->rule.list); audit_panic("error updating watch, removing"); } else { - int h = audit_hash_ino((u32)ino); + int h = audit_hash_ino(ino); /* * nentry->rule.watch == oentry->rule.watch so @@ -439,7 +439,7 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) audit_add_to_parent(krule, parent); - h = audit_hash_ino((u32)watch->ino); + h = audit_hash_ino(watch->ino); *list = &audit_inode_hash[h]; error: path_put(&parent_path); @@ -527,7 +527,7 @@ int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old) int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) { struct file *exe_file; - unsigned long ino; + u64 ino; dev_t dev; /* only do exe filtering if we are recording @current events/records */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f6af6a8f68c4..ab54fccba215 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -886,7 +886,7 @@ static int audit_filter_inode_name(struct task_struct *tsk, struct audit_names *n, struct audit_context *ctx) { - int h = audit_hash_ino((u32)n->ino); + int h = audit_hash_ino(n->ino); struct list_head *list = &audit_inode_hash[h]; return __audit_filter_op(tsk, ctx, list, n, ctx->major); @@ -1534,7 +1534,7 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, audit_log_format(ab, " name=(null)"); if (n->ino != AUDIT_INO_UNSET) - audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x", + audit_log_format(ab, " inode=%llu dev=%02x:%02x mode=%#ho ouid=%u ogid=%u rdev=%02x:%02x", n->ino, MAJOR(n->dev), MINOR(n->dev), -- cgit v1.2.3 From 0b2600f81cefcdfcda58d50df7be8fd48ada8ce2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 4 Mar 2026 10:32:42 -0500 Subject: treewide: change inode->i_ino from unsigned long to u64 On 32-bit architectures, unsigned long is only 32 bits wide, which causes 64-bit inode numbers to be silently truncated. Several filesystems (NFS, XFS, BTRFS, etc.) can generate inode numbers that exceed 32 bits, and this truncation can lead to inode number collisions and other subtle bugs on 32-bit systems. Change the type of inode->i_ino from unsigned long to u64 to ensure that inode numbers are always represented as 64-bit values regardless of architecture. Update all format specifiers treewide from %lu/%lx to %llu/%llx to match the new type, along with corresponding local variable types. This is the bulk treewide conversion. Earlier patches in this series handled trace events separately to allow trace field reordering for better struct packing on 32-bit. Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260304-iino-u64-v3-12-2257ad83d372@kernel.org Acked-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Chuck Lever Signed-off-by: Christian Brauner --- drivers/dma-buf/dma-buf.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 4 +-- fs/9p/vfs_addr.c | 4 +-- fs/9p/vfs_inode.c | 6 ++-- fs/9p/vfs_inode_dotl.c | 6 ++-- fs/affs/amigaffs.c | 10 +++---- fs/affs/bitmap.c | 2 +- fs/affs/dir.c | 2 +- fs/affs/file.c | 20 ++++++------- fs/affs/inode.c | 12 ++++---- fs/affs/namei.c | 14 ++++----- fs/affs/symlink.c | 2 +- fs/afs/dir.c | 10 +++---- fs/afs/dir_search.c | 2 +- fs/afs/dynroot.c | 2 +- fs/afs/inode.c | 2 +- fs/autofs/inode.c | 2 +- fs/befs/linuxvfs.c | 28 ++++++++--------- fs/bfs/dir.c | 4 +-- fs/cachefiles/io.c | 6 ++-- fs/cachefiles/namei.c | 12 ++++---- fs/cachefiles/xattr.c | 2 +- fs/ceph/crypto.c | 4 +-- fs/coda/dir.c | 2 +- fs/coda/inode.c | 2 +- fs/cramfs/inode.c | 2 +- fs/crypto/crypto.c | 2 +- fs/crypto/hooks.c | 2 +- fs/crypto/keyring.c | 4 +-- fs/crypto/keysetup.c | 2 +- fs/dcache.c | 4 +-- fs/ecryptfs/crypto.c | 6 ++-- fs/ecryptfs/file.c | 2 +- fs/efs/inode.c | 6 ++-- fs/eventpoll.c | 2 +- fs/exportfs/expfs.c | 4 +-- fs/ext2/dir.c | 10 +++---- fs/ext2/ialloc.c | 9 +++--- fs/ext2/inode.c | 2 +- fs/ext2/xattr.c | 14 ++++----- fs/ext4/dir.c | 2 +- fs/ext4/ext4.h | 4 +-- fs/ext4/extents.c | 8 ++--- fs/ext4/extents_status.c | 28 ++++++++--------- fs/ext4/fast_commit.c | 8 ++--- fs/ext4/ialloc.c | 10 +++---- fs/ext4/indirect.c | 2 +- fs/ext4/inline.c | 14 ++++----- fs/ext4/inode.c | 22 +++++++------- fs/ext4/ioctl.c | 4 +-- fs/ext4/mballoc.c | 6 ++-- fs/ext4/migrate.c | 2 +- fs/ext4/move_extent.c | 20 ++++++------- fs/ext4/namei.c | 10 +++---- fs/ext4/orphan.c | 16 +++++----- fs/ext4/page-io.c | 10 +++---- fs/ext4/super.c | 22 +++++++------- fs/ext4/xattr.c | 10 +++---- fs/f2fs/compress.c | 4 +-- fs/f2fs/dir.c | 2 +- fs/f2fs/extent_cache.c | 8 ++--- fs/f2fs/f2fs.h | 6 ++-- fs/f2fs/file.c | 12 ++++---- fs/f2fs/gc.c | 2 +- fs/f2fs/inline.c | 4 +-- fs/f2fs/inode.c | 48 +++++++++++++++--------------- fs/f2fs/namei.c | 8 ++--- fs/f2fs/node.c | 10 +++---- fs/f2fs/recovery.c | 10 +++---- fs/f2fs/xattr.c | 10 +++---- fs/freevxfs/vxfs_bmap.c | 4 +-- fs/fserror.c | 2 +- fs/hfs/catalog.c | 2 +- fs/hfs/extent.c | 4 +-- fs/hfs/inode.c | 4 +-- fs/hfsplus/attributes.c | 10 +++---- fs/hfsplus/catalog.c | 2 +- fs/hfsplus/dir.c | 6 ++-- fs/hfsplus/extents.c | 6 ++-- fs/hfsplus/inode.c | 8 ++--- fs/hfsplus/super.c | 6 ++-- fs/hfsplus/xattr.c | 10 +++---- fs/hpfs/dir.c | 4 +-- fs/hpfs/dnode.c | 4 +-- fs/hpfs/ea.c | 4 +-- fs/hpfs/inode.c | 4 +-- fs/inode.c | 13 ++++---- fs/iomap/ioend.c | 2 +- fs/isofs/compress.c | 2 +- fs/isofs/dir.c | 2 +- fs/isofs/inode.c | 6 ++-- fs/isofs/namei.c | 2 +- fs/jbd2/journal.c | 4 +-- fs/jbd2/transaction.c | 2 +- fs/jffs2/dir.c | 4 +-- fs/jffs2/file.c | 4 +-- fs/jffs2/fs.c | 18 +++++------ fs/jfs/inode.c | 2 +- fs/jfs/jfs_imap.c | 2 +- fs/jfs/jfs_metapage.c | 2 +- fs/lockd/svclock.c | 8 ++--- fs/lockd/svcsubs.c | 2 +- fs/locks.c | 6 ++-- fs/minix/inode.c | 10 +++---- fs/nfs/dir.c | 20 ++++++------- fs/nfs/file.c | 8 ++--- fs/nfs/filelayout/filelayout.c | 8 ++--- fs/nfs/flexfilelayout/flexfilelayout.c | 8 ++--- fs/nfs/inode.c | 6 ++-- fs/nfs/nfs4proc.c | 4 +-- fs/nfs/pnfs.c | 12 ++++---- fs/nfsd/export.c | 2 +- fs/nfsd/nfs4state.c | 4 +-- fs/nfsd/nfsfh.c | 4 +-- fs/nfsd/vfs.c | 2 +- fs/nilfs2/alloc.c | 10 +++---- fs/nilfs2/bmap.c | 2 +- fs/nilfs2/btnode.c | 2 +- fs/nilfs2/btree.c | 12 ++++---- fs/nilfs2/dir.c | 12 ++++---- fs/nilfs2/direct.c | 4 +-- fs/nilfs2/gcinode.c | 2 +- fs/nilfs2/inode.c | 8 ++--- fs/nilfs2/mdt.c | 2 +- fs/nilfs2/namei.c | 2 +- fs/nilfs2/segment.c | 2 +- fs/notify/fdinfo.c | 4 +-- fs/nsfs.c | 4 +-- fs/ntfs3/super.c | 2 +- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/aops.c | 4 +-- fs/ocfs2/dir.c | 8 ++--- fs/ocfs2/dlmfs/dlmfs.c | 10 +++---- fs/ocfs2/extent_map.c | 12 ++++---- fs/ocfs2/inode.c | 2 +- fs/ocfs2/quota_local.c | 2 +- fs/ocfs2/refcounttree.c | 10 +++---- fs/ocfs2/xattr.c | 4 +-- fs/orangefs/inode.c | 2 +- fs/overlayfs/export.c | 2 +- fs/overlayfs/namei.c | 4 +-- fs/overlayfs/util.c | 2 +- fs/pipe.c | 2 +- fs/proc/fd.c | 2 +- fs/proc/task_mmu.c | 4 +-- fs/qnx4/inode.c | 4 +-- fs/qnx6/inode.c | 2 +- fs/ubifs/debug.c | 8 ++--- fs/ubifs/dir.c | 28 ++++++++--------- fs/ubifs/file.c | 28 ++++++++--------- fs/ubifs/journal.c | 6 ++-- fs/ubifs/super.c | 16 +++++----- fs/ubifs/tnc.c | 4 +-- fs/ubifs/xattr.c | 14 ++++----- fs/udf/directory.c | 18 +++++------ fs/udf/file.c | 2 +- fs/udf/inode.c | 12 ++++---- fs/udf/namei.c | 8 ++--- fs/udf/super.c | 2 +- fs/ufs/balloc.c | 6 ++-- fs/ufs/dir.c | 10 +++---- fs/ufs/ialloc.c | 6 ++-- fs/ufs/inode.c | 18 +++++------ fs/ufs/ufs_fs.h | 6 ++-- fs/ufs/util.c | 2 +- fs/verity/init.c | 2 +- fs/zonefs/super.c | 8 ++--- include/linux/fs.h | 2 +- kernel/events/uprobes.c | 4 +-- net/netrom/af_netrom.c | 4 +-- net/rose/af_rose.c | 4 +-- net/socket.c | 2 +- net/x25/x25_proc.c | 4 +-- security/apparmor/apparmorfs.c | 4 +-- security/integrity/integrity_audit.c | 2 +- security/ipe/audit.c | 2 +- security/lsm_audit.c | 10 +++---- security/selinux/hooks.c | 10 +++---- security/smack/smack_lsm.c | 12 ++++---- 179 files changed, 607 insertions(+), 607 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 11711874a325..8c16c8c425cc 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1708,7 +1708,7 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused) spin_lock(&buf_obj->name_lock); - seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\t%s\n", + seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08llu\t%s\n", buf_obj->size, buf_obj->file->f_flags, buf_obj->file->f_mode, file_count(buf_obj->file), diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 1fb956400696..aaa8cdc122c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -1676,9 +1676,9 @@ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m) attachment = READ_ONCE(bo->tbo.base.import_attach); if (attachment) - seq_printf(m, " imported from ino:%lu", file_inode(dma_buf->file)->i_ino); + seq_printf(m, " imported from ino:%llu", file_inode(dma_buf->file)->i_ino); else if (dma_buf) - seq_printf(m, " exported as ino:%lu", file_inode(dma_buf->file)->i_ino); + seq_printf(m, " exported as ino:%llu", file_inode(dma_buf->file)->i_ino); amdgpu_bo_print_flag(m, bo, CPU_ACCESS_REQUIRED); amdgpu_bo_print_flag(m, bo, NO_CPU_ACCESS); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 862164181bac..c21d33830f5f 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -36,7 +36,7 @@ static void v9fs_begin_writeback(struct netfs_io_request *wreq) fid = v9fs_fid_find_inode(wreq->inode, true, INVALID_UID, true); if (!fid) { - WARN_ONCE(1, "folio expected an open fid inode->i_ino=%lx\n", + WARN_ONCE(1, "folio expected an open fid inode->i_ino=%llx\n", wreq->inode->i_ino); return; } @@ -133,7 +133,7 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) return 0; no_fid: - WARN_ONCE(1, "folio expected an open fid inode->i_ino=%lx\n", + WARN_ONCE(1, "folio expected an open fid inode->i_ino=%llx\n", rreq->inode->i_ino); return -EINVAL; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 97abe65bf7c1..d1508b1fe109 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1245,7 +1245,7 @@ static int v9fs_vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { - p9_debug(P9_DEBUG_VFS, " %lu,%pd,%s\n", + p9_debug(P9_DEBUG_VFS, " %llu,%pd,%s\n", dir->i_ino, dentry, symname); return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname); @@ -1269,7 +1269,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, char name[1 + U32_MAX_DIGITS + 2]; /* sign + number + \n + \0 */ struct p9_fid *oldfid; - p9_debug(P9_DEBUG_VFS, " %lu,%pd,%pd\n", + p9_debug(P9_DEBUG_VFS, " %llu,%pd,%pd\n", dir->i_ino, dentry, old_dentry); oldfid = v9fs_fid_clone(old_dentry); @@ -1305,7 +1305,7 @@ v9fs_vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, char name[2 + U32_MAX_DIGITS + 1 + U32_MAX_DIGITS + 1]; u32 perm; - p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %x MAJOR: %u MINOR: %u\n", + p9_debug(P9_DEBUG_VFS, " %llu,%pd mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, dentry, mode, MAJOR(rdev), MINOR(rdev)); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 643e759eacb2..71796a89bcf4 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -691,7 +691,7 @@ v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir, struct p9_fid *fid = NULL; name = dentry->d_name.name; - p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname); + p9_debug(P9_DEBUG_VFS, "%llu,%s,%s\n", dir->i_ino, name, symname); dfid = v9fs_parent_fid(dentry); if (IS_ERR(dfid)) { @@ -734,7 +734,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, struct p9_fid *dfid, *oldfid; struct v9fs_session_info *v9ses; - p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %pd, new_name: %pd\n", + p9_debug(P9_DEBUG_VFS, "dir ino: %llu, old_name: %pd, new_name: %pd\n", dir->i_ino, old_dentry, dentry); v9ses = v9fs_inode2v9ses(dir); @@ -798,7 +798,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir, struct p9_qid qid; struct posix_acl *dacl = NULL, *pacl = NULL; - p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %x MAJOR: %u MINOR: %u\n", + p9_debug(P9_DEBUG_VFS, " %llu,%pd mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, dentry, omode, MAJOR(rdev), MINOR(rdev)); diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index fd669daa4e7b..d8a96d8cc826 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -33,7 +33,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh) ino = bh->b_blocknr; offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]); - pr_debug("%s(dir=%lu, ino=%d)\n", __func__, dir->i_ino, ino); + pr_debug("%s(dir=%llu, ino=%d)\n", __func__, dir->i_ino, ino); dir_bh = affs_bread(sb, dir->i_ino); if (!dir_bh) @@ -83,7 +83,7 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh) sb = dir->i_sb; rem_ino = rem_bh->b_blocknr; offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]); - pr_debug("%s(dir=%lu, ino=%d, hashval=%d)\n", __func__, dir->i_ino, + pr_debug("%s(dir=%llu, ino=%d, hashval=%d)\n", __func__, dir->i_ino, rem_ino, offset); bh = affs_bread(sb, dir->i_ino); @@ -128,7 +128,7 @@ affs_fix_dcache(struct inode *inode, u32 entry_ino) spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { if (entry_ino == (u32)(long)dentry->d_fsdata) { - dentry->d_fsdata = (void *)inode->i_ino; + dentry->d_fsdata = (void *)(unsigned long)inode->i_ino; break; } } @@ -147,7 +147,7 @@ affs_remove_link(struct dentry *dentry) u32 link_ino, ino; int retval; - pr_debug("%s(key=%ld)\n", __func__, inode->i_ino); + pr_debug("%s(key=%llu)\n", __func__, inode->i_ino); retval = -EIO; bh = affs_bread(sb, inode->i_ino); if (!bh) @@ -279,7 +279,7 @@ affs_remove_header(struct dentry *dentry) if (!inode) goto done; - pr_debug("%s(key=%ld)\n", __func__, inode->i_ino); + pr_debug("%s(key=%llu)\n", __func__, inode->i_ino); retval = -EIO; bh = affs_bread(sb, (u32)(long)dentry->d_fsdata); if (!bh) diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index 5ba9ef2742f6..40bc4ce6af4a 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -125,7 +125,7 @@ affs_alloc_block(struct inode *inode, u32 goal) sb = inode->i_sb; sbi = AFFS_SB(sb); - pr_debug("balloc(inode=%lu,goal=%u): ", inode->i_ino, goal); + pr_debug("balloc(inode=%llu,goal=%u): ", inode->i_ino, goal); if (AFFS_I(inode)->i_pa_cnt) { pr_debug("%d\n", AFFS_I(inode)->i_lastalloc+1); diff --git a/fs/affs/dir.c b/fs/affs/dir.c index 5c8d83387a39..11e2bac2e391 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -90,7 +90,7 @@ affs_readdir(struct file *file, struct dir_context *ctx) u32 ino; int error = 0; - pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos); + pr_debug("%s(ino=%llu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos); if (ctx->pos < 2) { data->ino = 0; diff --git a/fs/affs/file.c b/fs/affs/file.c index 6c9258359ddb..a51dee9d7d7e 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -24,7 +24,7 @@ static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); static int affs_file_open(struct inode *inode, struct file *filp) { - pr_debug("open(%lu,%d)\n", + pr_debug("open(%llu,%d)\n", inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); atomic_inc(&AFFS_I(inode)->i_opencnt); return 0; @@ -33,7 +33,7 @@ affs_file_open(struct inode *inode, struct file *filp) static int affs_file_release(struct inode *inode, struct file *filp) { - pr_debug("release(%lu, %d)\n", + pr_debug("release(%llu, %d)\n", inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) { @@ -301,7 +301,7 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul struct buffer_head *ext_bh; u32 ext; - pr_debug("%s(%lu, %llu)\n", __func__, inode->i_ino, + pr_debug("%s(%llu, %llu)\n", __func__, inode->i_ino, (unsigned long long)block); BUG_ON(block > (sector_t)0x7fffffffUL); @@ -534,7 +534,7 @@ static int affs_do_read_folio_ofs(struct folio *folio, size_t to, int create) size_t bidx, boff, bsize; u32 tmp; - pr_debug("%s(%lu, %ld, 0, %zu)\n", __func__, inode->i_ino, + pr_debug("%s(%llu, %ld, 0, %zu)\n", __func__, inode->i_ino, folio->index, to); BUG_ON(to > folio_size(folio)); bsize = AFFS_SB(sb)->s_data_blksize; @@ -566,7 +566,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize) u32 size, bsize; u32 tmp; - pr_debug("%s(%lu, %d)\n", __func__, inode->i_ino, newsize); + pr_debug("%s(%llu, %d)\n", __func__, inode->i_ino, newsize); bsize = AFFS_SB(sb)->s_data_blksize; bh = NULL; size = AFFS_I(inode)->mmu_private; @@ -634,7 +634,7 @@ static int affs_read_folio_ofs(struct file *file, struct folio *folio) size_t to; int err; - pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, folio->index); + pr_debug("%s(%llu, %ld)\n", __func__, inode->i_ino, folio->index); to = folio_size(folio); if (folio_pos(folio) + to > inode->i_size) { to = inode->i_size - folio_pos(folio); @@ -658,7 +658,7 @@ static int affs_write_begin_ofs(const struct kiocb *iocb, pgoff_t index; int err = 0; - pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos, + pr_debug("%s(%llu, %llu, %llu)\n", __func__, inode->i_ino, pos, pos + len); if (pos > AFFS_I(inode)->mmu_private) { /* XXX: this probably leaves a too-big i_size in case of @@ -710,7 +710,7 @@ static int affs_write_end_ofs(const struct kiocb *iocb, * due to write_begin. */ - pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos, + pr_debug("%s(%llu, %llu, %llu)\n", __func__, inode->i_ino, pos, pos + len); bsize = AFFS_SB(sb)->s_data_blksize; data = folio_address(folio); @@ -854,7 +854,7 @@ affs_free_prealloc(struct inode *inode) { struct super_block *sb = inode->i_sb; - pr_debug("free_prealloc(ino=%lu)\n", inode->i_ino); + pr_debug("free_prealloc(ino=%llu)\n", inode->i_ino); while (AFFS_I(inode)->i_pa_cnt) { AFFS_I(inode)->i_pa_cnt--; @@ -874,7 +874,7 @@ affs_truncate(struct inode *inode) struct buffer_head *ext_bh; int i; - pr_debug("truncate(inode=%lu, oldsize=%llu, newsize=%llu)\n", + pr_debug("truncate(inode=%llu, oldsize=%llu, newsize=%llu)\n", inode->i_ino, AFFS_I(inode)->mmu_private, inode->i_size); last_blk = 0; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 0bfc7d151dcd..561fc0185e89 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -32,7 +32,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) if (!(inode_state_read_once(inode) & I_NEW)) return inode; - pr_debug("affs_iget(%lu)\n", inode->i_ino); + pr_debug("affs_iget(%llu)\n", inode->i_ino); block = inode->i_ino; bh = affs_bread(sb, block); @@ -171,14 +171,14 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc) uid_t uid; gid_t gid; - pr_debug("write_inode(%lu)\n", inode->i_ino); + pr_debug("write_inode(%llu)\n", inode->i_ino); if (!inode->i_nlink) // possibly free block return 0; bh = affs_bread(sb, inode->i_ino); if (!bh) { - affs_error(sb,"write_inode","Cannot read block %lu",inode->i_ino); + affs_error(sb, "write_inode", "Cannot read block %llu", inode->i_ino); return -EIO; } tail = AFFS_TAIL(sb, bh); @@ -219,7 +219,7 @@ affs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode = d_inode(dentry); int error; - pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid); + pr_debug("notify_change(%llu,0x%x)\n", inode->i_ino, attr->ia_valid); error = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (error) @@ -260,7 +260,7 @@ void affs_evict_inode(struct inode *inode) { unsigned long cache_page; - pr_debug("evict_inode(ino=%lu, nlink=%u)\n", + pr_debug("evict_inode(ino=%llu, nlink=%u)\n", inode->i_ino, inode->i_nlink); truncate_inode_pages_final(&inode->i_data); @@ -353,7 +353,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 u32 block = 0; int retval; - pr_debug("%s(dir=%lu, inode=%lu, \"%pd\", type=%d)\n", __func__, + pr_debug("%s(dir=%llu, inode=%llu, \"%pd\", type=%d)\n", __func__, dir->i_ino, inode->i_ino, dentry, type); retval = -EIO; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index f883be50db12..870532192600 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -235,7 +235,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) int affs_unlink(struct inode *dir, struct dentry *dentry) { - pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino, + pr_debug("%s(dir=%llu, %llu \"%pd\")\n", __func__, dir->i_ino, d_inode(dentry)->i_ino, dentry); return affs_remove_header(dentry); @@ -249,7 +249,7 @@ affs_create(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode; int error; - pr_debug("%s(%lu,\"%pd\",0%ho)\n", + pr_debug("%s(%llu,\"%pd\",0%ho)\n", __func__, dir->i_ino, dentry, mode); inode = affs_new_inode(dir); @@ -280,7 +280,7 @@ affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode; int error; - pr_debug("%s(%lu,\"%pd\",0%ho)\n", + pr_debug("%s(%llu,\"%pd\",0%ho)\n", __func__, dir->i_ino, dentry, mode); inode = affs_new_inode(dir); @@ -306,7 +306,7 @@ affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, int affs_rmdir(struct inode *dir, struct dentry *dentry) { - pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino, + pr_debug("%s(dir=%llu, %llu \"%pd\")\n", __func__, dir->i_ino, d_inode(dentry)->i_ino, dentry); return affs_remove_header(dentry); @@ -323,7 +323,7 @@ affs_symlink(struct mnt_idmap *idmap, struct inode *dir, int i, maxlen, error; char c, lc; - pr_debug("%s(%lu,\"%pd\" -> \"%s\")\n", + pr_debug("%s(%llu,\"%pd\" -> \"%s\")\n", __func__, dir->i_ino, dentry, symname); maxlen = AFFS_SB(sb)->s_hashsize * sizeof(u32) - 1; @@ -395,7 +395,7 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); - pr_debug("%s(%lu, %lu, \"%pd\")\n", __func__, inode->i_ino, dir->i_ino, + pr_debug("%s(%llu, %llu, \"%pd\")\n", __func__, inode->i_ino, dir->i_ino, dentry); return affs_add_entry(dir, inode, dentry, ST_LINKFILE); @@ -511,7 +511,7 @@ int affs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; - pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__, + pr_debug("%s(old=%llu,\"%pd\" to new=%llu,\"%pd\")\n", __func__, old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry); if (flags & RENAME_EXCHANGE) diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index 094aec8d17b8..de31ed2e71df 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -21,7 +21,7 @@ static int affs_symlink_read_folio(struct file *file, struct folio *folio) char c; char lc; - pr_debug("get_link(ino=%lu)\n", inode->i_ino); + pr_debug("get_link(ino=%llu)\n", inode->i_ino); bh = affs_bread(inode->i_sb, inode->i_ino); if (!bh) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 78caef3f1338..aaaa55878ffd 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -148,7 +148,7 @@ static bool afs_dir_check_block(struct afs_vnode *dvnode, size_t progress, union afs_xdr_dir_block *block) { if (block->hdr.magic != AFS_DIR_MAGIC) { - pr_warn("%s(%lx): [%zx] bad magic %04x\n", + pr_warn("%s(%llx): [%zx] bad magic %04x\n", __func__, dvnode->netfs.inode.i_ino, progress, ntohs(block->hdr.magic)); trace_afs_dir_check_failed(dvnode, progress); @@ -214,7 +214,7 @@ static int afs_dir_check(struct afs_vnode *dvnode) */ static int afs_dir_open(struct inode *inode, struct file *file) { - _enter("{%lu}", inode->i_ino); + _enter("{%llu}", inode->i_ino); BUILD_BUG_ON(sizeof(union afs_xdr_dir_block) != 2048); BUILD_BUG_ON(sizeof(union afs_xdr_dirent) != 32); @@ -523,7 +523,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, int retry_limit = 100; int ret; - _enter("{%lu},%llx,,", dir->i_ino, ctx->pos); + _enter("{%llu},%llx,,", dir->i_ino, ctx->pos); do { if (--retry_limit < 0) { @@ -610,7 +610,7 @@ static int afs_do_lookup_one(struct inode *dir, const struct qstr *name, }; int ret; - _enter("{%lu},{%.*s},", dir->i_ino, name->len, name->name); + _enter("{%llu},{%.*s},", dir->i_ino, name->len, name->name); /* search the directory */ ret = afs_dir_iterate(dir, &cookie.ctx, NULL, _dir_version); @@ -783,7 +783,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry) long ret; int i; - _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry); + _enter("{%llu},%p{%pd},", dir->i_ino, dentry, dentry); cookie = kzalloc_obj(struct afs_lookup_cookie); if (!cookie) diff --git a/fs/afs/dir_search.c b/fs/afs/dir_search.c index d2516e55b5ed..104411c0692f 100644 --- a/fs/afs/dir_search.c +++ b/fs/afs/dir_search.c @@ -194,7 +194,7 @@ int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name, struct afs_dir_iter iter = { .dvnode = dvnode, }; int ret, retry_limit = 3; - _enter("{%lu},,,", dvnode->netfs.inode.i_ino); + _enter("{%llu},,,", dvnode->netfs.inode.i_ino); if (!afs_dir_init_iter(&iter, name)) return -ENOENT; diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index aa56e8951e03..1d5e33bc7502 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -59,7 +59,7 @@ static struct inode *afs_iget_pseudo_dir(struct super_block *sb, ino_t ino) return ERR_PTR(-ENOMEM); } - _debug("GOT INODE %p { ino=%lu, vl=%llx, vn=%llx, u=%x }", + _debug("GOT INODE %p { ino=%llu, vl=%llx, vn=%llx, u=%x }", inode, inode->i_ino, fid.vid, fid.vnode, fid.unique); vnode = AFS_FS_I(inode); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index dde1857fcabb..a5173434f786 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -683,7 +683,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path, struct key *key; int ret, seq; - _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); + _enter("{ ino=%llu v=%u }", inode->i_ino, inode->i_generation); if (vnode->volume && !(query_flags & AT_STATX_DONT_SYNC) && diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index c53dc551053b..c1e210cec436 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -92,7 +92,7 @@ static int autofs_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",ignore"); #ifdef CONFIG_CHECKPOINT_RESTORE if (sbi->pipe) - seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino); + seq_printf(m, ",pipe_ino=%llu", file_inode(sbi->pipe)->i_ino); else seq_puts(m, ",pipe_ino=-1"); #endif diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index cecbc92f959a..c12caae9a967 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -140,20 +140,20 @@ befs_get_block(struct inode *inode, sector_t block, int res; ulong disk_off; - befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld", - (unsigned long)inode->i_ino, (long)block); + befs_debug(sb, "---> befs_get_block() for inode %llu, block %ld", + inode->i_ino, (long)block); if (create) { befs_error(sb, "befs_get_block() was asked to write to " - "block %ld in inode %lu", (long)block, - (unsigned long)inode->i_ino); + "block %ld in inode %llu", (long)block, + inode->i_ino); return -EPERM; } res = befs_fblock2brun(sb, ds, block, &run); if (res != BEFS_OK) { befs_error(sb, - "<--- %s for inode %lu, block %ld ERROR", - __func__, (unsigned long)inode->i_ino, + "<--- %s for inode %llu, block %ld ERROR", + __func__, inode->i_ino, (long)block); return -EFBIG; } @@ -162,8 +162,8 @@ befs_get_block(struct inode *inode, sector_t block, map_bh(bh_result, inode->i_sb, disk_off); - befs_debug(sb, "<--- %s for inode %lu, block %ld, disk address %lu", - __func__, (unsigned long)inode->i_ino, (long)block, + befs_debug(sb, "<--- %s for inode %llu, block %ld, disk address %lu", + __func__, inode->i_ino, (long)block, (unsigned long)disk_off); return 0; @@ -181,7 +181,7 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) char *utfname; const char *name = dentry->d_name.name; - befs_debug(sb, "---> %s name %pd inode %ld", __func__, + befs_debug(sb, "---> %s name %pd inode %llu", __func__, dentry, dir->i_ino); /* Convert to UTF-8 */ @@ -224,7 +224,7 @@ befs_readdir(struct file *file, struct dir_context *ctx) size_t keysize; char keybuf[BEFS_NAME_LEN + 1]; - befs_debug(sb, "---> %s name %pD, inode %ld, ctx->pos %lld", + befs_debug(sb, "---> %s name %pD, inode %llu, ctx->pos %lld", __func__, file, inode->i_ino, ctx->pos); while (1) { @@ -233,7 +233,7 @@ befs_readdir(struct file *file, struct dir_context *ctx) if (result == BEFS_ERR) { befs_debug(sb, "<--- %s ERROR", __func__); - befs_error(sb, "IO error reading %pD (inode %lu)", + befs_error(sb, "IO error reading %pD (inode %llu)", file, inode->i_ino); return -EIO; @@ -324,7 +324,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) bh = sb_bread(sb, inode->i_ino); if (!bh) { befs_error(sb, "unable to read inode block - " - "inode = %lu", inode->i_ino); + "inode = %llu", inode->i_ino); goto unacquire_none; } @@ -333,7 +333,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) befs_dump_inode(sb, raw_inode); if (befs_check_inode(sb, raw_inode, inode->i_ino) != BEFS_OK) { - befs_error(sb, "Bad inode: %lu", inode->i_ino); + befs_error(sb, "Bad inode: %llu", inode->i_ino); goto unacquire_bh; } @@ -407,7 +407,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode->i_op = &simple_symlink_inode_operations; } } else { - befs_error(sb, "Inode %lu is not a regular file, " + befs_error(sb, "Inode %llu is not a regular file, " "directory or symlink. THAT IS WRONG! BeFS has no " "on disk special files", inode->i_ino); goto unacquire_bh; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index c375e22c4c0c..481514db4eae 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -35,7 +35,7 @@ static int bfs_readdir(struct file *f, struct dir_context *ctx) int block; if (ctx->pos & (BFS_DIRENT_SIZE - 1)) { - printf("Bad f_pos=%08lx for %s:%08lx\n", + printf("Bad f_pos=%08lx for %s:%08llx\n", (unsigned long)ctx->pos, dir->i_sb->s_id, dir->i_ino); return -EINVAL; @@ -180,7 +180,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) goto out_brelse; if (!inode->i_nlink) { - printf("unlinking non-existent file %s:%lu (nlink=%d)\n", + printf("unlinking non-existent file %s:%llu (nlink=%d)\n", inode->i_sb->s_id, inode->i_ino, inode->i_nlink); set_nlink(inode, 1); diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index eaf47851c65f..d879b80a0bed 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -93,7 +93,7 @@ static int cachefiles_read(struct netfs_cache_resources *cres, object = cachefiles_cres_object(cres); file = cachefiles_cres_file(cres); - _enter("%pD,%li,%llx,%zx/%llx", + _enter("%pD,%llu,%llx,%zx/%llx", file, file_inode(file)->i_ino, start_pos, len, i_size_read(file_inode(file))); @@ -214,7 +214,7 @@ static int cachefiles_query_occupancy(struct netfs_cache_resources *cres, file = cachefiles_cres_file(cres); granularity = max_t(size_t, object->volume->cache->bsize, granularity); - _enter("%pD,%li,%llx,%zx/%llx", + _enter("%pD,%llu,%llx,%zx/%llx", file, file_inode(file)->i_ino, start, len, i_size_read(file_inode(file))); @@ -294,7 +294,7 @@ int __cachefiles_write(struct cachefiles_object *object, fscache_count_write(); cache = object->volume->cache; - _enter("%pD,%li,%llx,%zx/%llx", + _enter("%pD,%llu,%llx,%zx/%llx", file, file_inode(file)->i_ino, start_pos, len, i_size_read(file_inode(file))); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index e5ec90dccc27..4fdf7687aacb 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -147,7 +147,7 @@ retry: } ASSERT(d_backing_inode(subdir)); - _debug("mkdir -> %pd{ino=%lu}", + _debug("mkdir -> %pd{ino=%llu}", subdir, d_backing_inode(subdir)->i_ino); if (_is_new) *_is_new = true; @@ -158,7 +158,7 @@ retry: end_creating_keep(subdir); if (!__cachefiles_mark_inode_in_use(NULL, d_inode(subdir))) { - pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", + pr_notice("cachefiles: Inode already in use: %pd (B=%llx)\n", subdir, d_inode(subdir)->i_ino); goto mark_error; } @@ -183,7 +183,7 @@ retry: !d_backing_inode(subdir)->i_op->unlink) goto check_error; - _leave(" = [%lu]", d_backing_inode(subdir)->i_ino); + _leave(" = [%llu]", d_backing_inode(subdir)->i_ino); return subdir; check_error: @@ -529,7 +529,7 @@ static bool cachefiles_create_file(struct cachefiles_object *object) set_bit(FSCACHE_COOKIE_NEEDS_UPDATE, &object->cookie->flags); set_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags); - _debug("create -> %pD{ino=%lu}", file, file_inode(file)->i_ino); + _debug("create -> %pD{ino=%llu}", file, file_inode(file)->i_ino); object->file = file; return true; } @@ -549,7 +549,7 @@ static bool cachefiles_open_file(struct cachefiles_object *object, _enter("%pd", dentry); if (!cachefiles_mark_inode_in_use(object, d_inode(dentry))) { - pr_notice("cachefiles: Inode already in use: %pd (B=%lx)\n", + pr_notice("cachefiles: Inode already in use: %pd (B=%llx)\n", dentry, d_inode(dentry)->i_ino); return false; } @@ -657,7 +657,7 @@ bool cachefiles_look_up_object(struct cachefiles_object *object) if (!ret) return false; - _leave(" = t [%lu]", file_inode(object->file)->i_ino); + _leave(" = t [%llu]", file_inode(object->file)->i_ino); return true; new_file: diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 52383b1d0ba6..f8ae78b3f7b6 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -179,7 +179,7 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, ret = 0; else if (ret != -ENOMEM) cachefiles_io_error(cache, - "Can't remove xattr from %lu" + "Can't remove xattr from %llu" " (error %d)", d_backing_inode(dentry)->i_ino, -ret); } diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index f3de43ccb470..64d240759277 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -272,7 +272,7 @@ int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int elen) /* To understand the 240 limit, see CEPH_NOHASH_NAME_MAX comments */ WARN_ON(elen > 240); if (dir != parent) // leading _ is already there; append _ - elen += 1 + sprintf(p + elen, "_%ld", dir->i_ino); + elen += 1 + sprintf(p + elen, "_%llu", dir->i_ino); out: kfree(cryptbuf); @@ -377,7 +377,7 @@ int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname, if (!ret && (dir != fname->dir)) { char tmp_buf[BASE64_CHARS(NAME_MAX)]; - name_len = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld", + name_len = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%llu", oname->len, oname->name, dir->i_ino); memcpy(oname->name, tmp_buf, name_len); oname->len = name_len; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index c64b8cd81568..d6b9fc3cc1ca 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -533,7 +533,7 @@ int coda_revalidate_inode(struct inode *inode) coda_vattr_to_iattr(inode, &attr); if ((old_mode & S_IFMT) != (inode->i_mode & S_IFMT)) { - pr_warn("inode %ld, fid %s changed type!\n", + pr_warn("inode %llu, fid %s changed type!\n", inode->i_ino, coda_f2s(&(cii->c_fid))); } diff --git a/fs/coda/inode.c b/fs/coda/inode.c index ad1654f3adf8..40b43866e6a5 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -257,7 +257,7 @@ static int coda_fill_super(struct super_block *sb, struct fs_context *fc) goto error; } - pr_info("%s: rootinode is %ld dev %s\n", + pr_info("%s: rootinode is %llu dev %s\n", __func__, root->i_ino, root->i_sb->s_id); sb->s_root = d_make_root(root); if (!sb->s_root) { diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index e0ba9cd640dc..4edbfccd0bbe 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -125,7 +125,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, old_decode_dev(cramfs_inode->size)); break; default: - printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n", + printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %llu.\n", inode->i_mode, inode->i_ino); iget_failed(inode); return ERR_PTR(-EIO); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 07f9cbfe3ea4..570a2231c945 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -365,7 +365,7 @@ void fscrypt_msg(const struct inode *inode, const char *level, vaf.fmt = fmt; vaf.va = &args; if (inode && inode->i_ino) - printk("%sfscrypt (%s, inode %lu): %pV\n", + printk("%sfscrypt (%s, inode %llu): %pV\n", level, inode->i_sb->s_id, inode->i_ino, &vaf); else if (inode) printk("%sfscrypt (%s): %pV\n", level, inode->i_sb->s_id, &vaf); diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index b97de0d1430f..a7a8a3f581a0 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -62,7 +62,7 @@ int fscrypt_file_open(struct inode *inode, struct file *filp) dentry_parent = dget_parent(dentry); if (!fscrypt_has_permitted_context(d_inode(dentry_parent), inode)) { fscrypt_warn(inode, - "Inconsistent encryption context (parent directory: %lu)", + "Inconsistent encryption context (parent directory: %llu)", d_inode(dentry_parent)->i_ino); err = -EPERM; } diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 9ec6e5ef0947..be8e6e8011f2 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -969,8 +969,8 @@ static int check_for_busy_inodes(struct super_block *sb, { struct list_head *pos; size_t busy_count = 0; - unsigned long ino; char ino_str[50] = ""; + u64 ino; spin_lock(&mk->mk_decrypted_inodes_lock); @@ -994,7 +994,7 @@ static int check_for_busy_inodes(struct super_block *sb, /* If the inode is currently being created, ino may still be 0. */ if (ino) - snprintf(ino_str, sizeof(ino_str), ", including ino %lu", ino); + snprintf(ino_str, sizeof(ino_str), ", including ino %llu", ino); fscrypt_warn(NULL, "%s: %zu inode(s) still busy after removing key with %s %*phN%s", diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 40fa05688d3a..df58ca4a5e3c 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -91,7 +91,7 @@ select_encryption_mode(const union fscrypt_policy *policy, if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) return &fscrypt_modes[fscrypt_policy_fnames_mode(policy)]; - WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %llu, which is not encryptable (file type %d)\n", inode->i_ino, (inode->i_mode & S_IFMT)); return ERR_PTR(-EINVAL); } diff --git a/fs/dcache.c b/fs/dcache.c index 7ba1801d8132..0a4ffda07360 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1637,11 +1637,11 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) if (dentry == _data && dentry->d_lockref.count == 1) return D_WALK_CONTINUE; - WARN(1, "BUG: Dentry %p{i=%lx,n=%pd} " + WARN(1, "BUG: Dentry %p{i=%llx,n=%pd} " " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? - dentry->d_inode->i_ino : 0UL, + dentry->d_inode->i_ino : (u64)0, dentry, dentry->d_lockref.count, dentry->d_sb->s_type->name, diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 3b59346d68c5..f25c9a49e251 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1313,7 +1313,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode); if (rc) { printk(KERN_DEBUG "Valid eCryptfs headers not found in " - "file header region or xattr region, inode %lu\n", + "file header region or xattr region, inode %llu\n", ecryptfs_inode->i_ino); rc = -EINVAL; goto out; @@ -1323,7 +1323,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) ECRYPTFS_DONT_VALIDATE_HEADER_SIZE); if (rc) { printk(KERN_DEBUG "Valid eCryptfs headers not found in " - "file xattr region either, inode %lu\n", + "file xattr region either, inode %llu\n", ecryptfs_inode->i_ino); rc = -EINVAL; } @@ -1335,7 +1335,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) "crypto metadata only in the extended attribute " "region, but eCryptfs was mounted without " "xattr support enabled. eCryptfs will not treat " - "this like an encrypted file, inode %lu\n", + "this like an encrypted file, inode %llu\n", ecryptfs_inode->i_ino); rc = -EINVAL; } diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 7929411837cf..49b0fbe0428a 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -253,7 +253,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file) if (rc) goto out_put; ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = " - "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino, + "[0x%.16llx] size: [0x%.16llx]\n", inode, inode->i_ino, (unsigned long long)i_size_read(inode)); goto out; out_put: diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 28407578f83a..4b132729e638 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -132,7 +132,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) for(i = 0; i < EFS_DIRECTEXTENTS; i++) { extent_copy(&(efs_inode->di_u.di_extents[i]), &(in->extents[i])); if (i < in->numextents && in->extents[i].cooked.ex_magic != 0) { - pr_warn("extent %d has bad magic number in inode %lu\n", + pr_warn("extent %d has bad magic number in inode %llu\n", i, inode->i_ino); brelse(bh); goto read_inode_error; @@ -140,7 +140,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) } brelse(bh); - pr_debug("efs_iget(): inode %lu, extents %d, mode %o\n", + pr_debug("efs_iget(): inode %llu, extents %d, mode %o\n", inode->i_ino, in->numextents, inode->i_mode); switch (inode->i_mode & S_IFMT) { case S_IFDIR: @@ -171,7 +171,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) return inode; read_inode_error: - pr_warn("failed to read inode %lu\n", inode->i_ino); + pr_warn("failed to read inode %llu\n", inode->i_ino); iget_failed(inode); return ERR_PTR(-EIO); } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index a8c278c50083..a1731dafb1e1 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1080,7 +1080,7 @@ static void ep_show_fdinfo(struct seq_file *m, struct file *f) struct inode *inode = file_inode(epi->ffd.file); seq_printf(m, "tfd: %8d events: %8x data: %16llx " - " pos:%lli ino:%lx sdev:%x\n", + " pos:%lli ino:%llx sdev:%x\n", epi->ffd.fd, epi->event.events, (long long)epi->event.data, (long long)epi->ffd.file->f_pos, diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 6c9be60a3e48..5c3183ce350e 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -130,12 +130,12 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, parent = mnt->mnt_sb->s_export_op->get_parent(dentry); if (IS_ERR(parent)) { - dprintk("get_parent of %lu failed, err %ld\n", + dprintk("get_parent of %llu failed, err %ld\n", dentry->d_inode->i_ino, PTR_ERR(parent)); return parent; } - dprintk("%s: find name of %lu in %lu\n", __func__, + dprintk("%s: find name of %llu in %llu\n", __func__, dentry->d_inode->i_ino, parent->d_inode->i_ino); err = exportfs_get_name(mnt, parent, nbuf, dentry); if (err == -ENOENT) diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 395fc36c089b..278d4be8ecbe 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -141,7 +141,7 @@ out: Ebadsize: if (!quiet) ext2_error(sb, __func__, - "size of directory #%lu is not a multiple " + "size of directory #%llu is not a multiple " "of chunk size", dir->i_ino); goto fail; Eshort: @@ -160,7 +160,7 @@ Einumber: error = "inode out of bounds"; bad_entry: if (!quiet) - ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - " + ext2_error(sb, __func__, "bad entry in directory #%llu: : %s - " "offset=%llu, inode=%lu, rec_len=%d, name_len=%d", dir->i_ino, error, folio_pos(folio) + offs, (unsigned long) le32_to_cpu(p->inode), @@ -170,7 +170,7 @@ Eend: if (!quiet) { p = (ext2_dirent *)(kaddr + offs); ext2_error(sb, "ext2_check_folio", - "entry in directory #%lu spans the page boundary" + "entry in directory #%llu spans the page boundary" "offset=%llu, inode=%lu", dir->i_ino, folio_pos(folio) + offs, (unsigned long) le32_to_cpu(p->inode)); @@ -281,7 +281,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx) if (IS_ERR(kaddr)) { ext2_error(sb, __func__, - "bad page in #%lu", + "bad page in #%llu", inode->i_ino); ctx->pos += PAGE_SIZE - offset; return PTR_ERR(kaddr); @@ -383,7 +383,7 @@ struct ext2_dir_entry_2 *ext2_find_entry (struct inode *dir, /* next folio is past the blocks we've got */ if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) { ext2_error(dir->i_sb, __func__, - "dir %lu size %lld exceeds block count %llu", + "dir %llu size %lld exceeds block count %llu", dir->i_ino, dir->i_size, (unsigned long long)dir->i_blocks); goto out; diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index fdf63e9c6e7c..bf21b57cf98c 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -169,9 +169,10 @@ static void ext2_preread_inode(struct inode *inode) unsigned long block_group; unsigned long offset; unsigned long block; + unsigned int ino = inode->i_ino; struct ext2_group_desc * gdp; - block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); + block_group = (ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); gdp = ext2_get_group_desc(inode->i_sb, block_group, NULL); if (gdp == NULL) return; @@ -179,7 +180,7 @@ static void ext2_preread_inode(struct inode *inode) /* * Figure out the offset within the block group inode table */ - offset = ((inode->i_ino - 1) % EXT2_INODES_PER_GROUP(inode->i_sb)) * + offset = ((ino - 1) % EXT2_INODES_PER_GROUP(inode->i_sb)) * EXT2_INODE_SIZE(inode->i_sb); block = le32_to_cpu(gdp->bg_inode_table) + (offset >> EXT2_BLOCK_SIZE_BITS(inode->i_sb)); @@ -381,7 +382,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent) * * So add our directory's i_ino into the starting point for the hash. */ - group = (group + parent->i_ino) % ngroups; + group = (group + (unsigned int)parent->i_ino) % ngroups; /* * Use a quadratic hash to find a group with a free inode and some @@ -589,7 +590,7 @@ got: goto fail_free_drop; mark_inode_dirty(inode); - ext2_debug("allocating inode %lu\n", inode->i_ino); + ext2_debug("allocating inode %llu\n", inode->i_ino); ext2_preread_inode(inode); return inode; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index dbfe9098a124..45286c0c3b6b 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1152,7 +1152,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de */ if (!bh) { ext2_error(inode->i_sb, "ext2_free_branches", - "Read failure, inode=%ld, block=%ld", + "Read failure, inode=%llu, block=%ld", inode->i_ino, nr); continue; } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index c885dcc3bd0d..14ada70db36a 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -227,7 +227,7 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, if (!ext2_xattr_header_valid(HDR(bh))) { bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", - "inode %ld: bad block %d", inode->i_ino, + "inode %llu: bad block %d", inode->i_ino, EXT2_I(inode)->i_file_acl); error = -EIO; goto cleanup; @@ -313,7 +313,7 @@ ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (!ext2_xattr_header_valid(HDR(bh))) { bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", - "inode %ld: bad block %d", inode->i_ino, + "inode %llu: bad block %d", inode->i_ino, EXT2_I(inode)->i_file_acl); error = -EIO; goto cleanup; @@ -454,7 +454,7 @@ ext2_xattr_set(struct inode *inode, int name_index, const char *name, if (!ext2_xattr_header_valid(header)) { bad_block: ext2_error(sb, "ext2_xattr_set", - "inode %ld: bad block %d", inode->i_ino, + "inode %llu: bad block %d", inode->i_ino, EXT2_I(inode)->i_file_acl); error = -EIO; goto cleanup; @@ -833,7 +833,7 @@ ext2_xattr_delete_inode(struct inode *inode) if (!ext2_data_block_valid(sbi, EXT2_I(inode)->i_file_acl, 1)) { ext2_error(inode->i_sb, "ext2_xattr_delete_inode", - "inode %ld: xattr block %d is out of data blocks range", + "inode %llu: xattr block %d is out of data blocks range", inode->i_ino, EXT2_I(inode)->i_file_acl); goto cleanup; } @@ -841,14 +841,14 @@ ext2_xattr_delete_inode(struct inode *inode) bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl); if (!bh) { ext2_error(inode->i_sb, "ext2_xattr_delete_inode", - "inode %ld: block %d read error", inode->i_ino, + "inode %llu: block %d read error", inode->i_ino, EXT2_I(inode)->i_file_acl); goto cleanup; } ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); if (!ext2_xattr_header_valid(HDR(bh))) { ext2_error(inode->i_sb, "ext2_xattr_delete_inode", - "inode %ld: bad block %d", inode->i_ino, + "inode %llu: bad block %d", inode->i_ino, EXT2_I(inode)->i_file_acl); goto cleanup; } @@ -952,7 +952,7 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) bh = sb_bread(inode->i_sb, ce->e_value); if (!bh) { ext2_error(inode->i_sb, "ext2_xattr_cache_find", - "inode %ld: block %ld read error", + "inode %llu: block %ld read error", inode->i_ino, (unsigned long) ce->e_value); } else { lock_buffer(bh); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 28b2a3deb954..17edd678fa87 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -535,7 +535,7 @@ static int call_filldir(struct file *file, struct dir_context *ctx, struct super_block *sb = inode->i_sb; if (!fname) { - ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: " + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%llu: comm %s: " "called with null fname?!?", __func__, __LINE__, inode->i_ino, current->comm); return 0; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 293f698b7042..85e6c2b543a8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -92,7 +92,7 @@ */ #ifdef CONFIG_EXT4_DEBUG #define ext_debug(ino, fmt, ...) \ - pr_debug("[%s/%d] EXT4-fs (%s): ino %lu: (%s, %d): %s:" fmt, \ + pr_debug("[%s/%d] EXT4-fs (%s): ino %llu: (%s, %d): %s:" fmt, \ current->comm, task_pid_nr(current), \ ino->i_sb->s_id, ino->i_ino, __FILE__, __LINE__, \ __func__, ##__VA_ARGS__) @@ -3229,7 +3229,7 @@ extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, extern __printf(7, 8) void __ext4_grp_locked_error(const char *, unsigned int, struct super_block *, ext4_group_t, - unsigned long, ext4_fsblk_t, + u64, ext4_fsblk_t, const char *, ...); #define EXT4_ERROR_INODE(inode, fmt, a...) \ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ae3804f36535..042e1555a674 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4603,7 +4603,7 @@ retry: } ret = ext4_map_blocks(handle, inode, &map, flags); if (ret <= 0) { - ext4_debug("inode #%lu: block %u: len %u: " + ext4_debug("inode #%llu: block %u: len %u: " "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); @@ -4955,7 +4955,7 @@ int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode, ret = ext4_map_blocks(handle, inode, &map, flags); if (ret != max_blocks) ext4_msg(inode->i_sb, KERN_INFO, - "inode #%lu: block %u: len %u: " + "inode #%llu: block %u: len %u: " "split block mapping found for atomic write, " "ret = %d", inode->i_ino, map.m_lblk, @@ -4974,7 +4974,7 @@ int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode, if (ret <= 0 || ret2) ext4_warning(inode->i_sb, - "inode #%lu: block %u: len %u: " + "inode #%llu: block %u: len %u: " "returned %d or %d", inode->i_ino, map.m_lblk, map.m_len, ret, ret2); @@ -5031,7 +5031,7 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, EXT4_EX_NOCACHE); if (ret <= 0) ext4_warning(inode->i_sb, - "inode #%lu: block %u: len %u: " + "inode #%llu: block %u: len %u: " "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index a1538bac51c6..6e4a191e8219 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -214,7 +214,7 @@ static void ext4_es_print_tree(struct inode *inode) struct ext4_es_tree *tree; struct rb_node *node; - printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino); + printk(KERN_DEBUG "status extents for inode %llu:", inode->i_ino); tree = &EXT4_I(inode)->i_es_tree; node = rb_first(&tree->root); while (node) { @@ -703,7 +703,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { if (in_range(es->es_lblk, ee_block, ee_len)) { pr_warn("ES insert assertion failed for " - "inode: %lu we can find an extent " + "inode: %llu we can find an extent " "at block [%d/%d/%llu/%c], but we " "want to add a delayed/hole extent " "[%d/%d/%llu/%x]\n", @@ -721,7 +721,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, */ if (es->es_lblk < ee_block || ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { - pr_warn("ES insert assertion failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %llu " "ex_status [%d/%d/%llu/%c] != " "es_status [%d/%d/%llu/%c]\n", inode->i_ino, ee_block, ee_len, ee_start, @@ -731,7 +731,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, } if (ee_status ^ es_status) { - pr_warn("ES insert assertion failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %llu " "ex_status [%d/%d/%llu/%c] != " "es_status [%d/%d/%llu/%c]\n", inode->i_ino, ee_block, ee_len, ee_start, @@ -744,7 +744,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, * that we don't want to add an written/unwritten extent. */ if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { - pr_warn("ES insert assertion failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %llu " "can't find an extent at block %d but we want " "to add a written/unwritten extent " "[%d/%d/%llu/%x]\n", inode->i_ino, @@ -779,7 +779,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, * We want to add a delayed/hole extent but this * block has been allocated. */ - pr_warn("ES insert assertion failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %llu " "We can find blocks but we want to add a " "delayed/hole extent [%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_len, @@ -788,13 +788,13 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, } else if (ext4_es_is_written(es)) { if (retval != es->es_len) { pr_warn("ES insert assertion failed for " - "inode: %lu retval %d != es_len %d\n", + "inode: %llu retval %d != es_len %d\n", inode->i_ino, retval, es->es_len); return; } if (map.m_pblk != ext4_es_pblock(es)) { pr_warn("ES insert assertion failed for " - "inode: %lu m_pblk %llu != " + "inode: %llu m_pblk %llu != " "es_pblk %llu\n", inode->i_ino, map.m_pblk, ext4_es_pblock(es)); @@ -809,7 +809,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, } } else if (retval == 0) { if (ext4_es_is_written(es)) { - pr_warn("ES insert assertion failed for inode: %lu " + pr_warn("ES insert assertion failed for inode: %llu " "We can't find the block but we want to add " "a written extent [%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_len, @@ -919,7 +919,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %lu\n", + es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %llu\n", lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino); if (!len) @@ -1631,7 +1631,7 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - es_debug("remove [%u/%u) from extent status tree of inode %lu\n", + es_debug("remove [%u/%u) from extent status tree of inode %llu\n", lblk, len, inode->i_ino); if (!len) @@ -1821,7 +1821,7 @@ int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v) seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk); if (inode_cnt) seq_printf(seq, - "maximum:\n %lu inode (%u objects, %u reclaimable)\n" + "maximum:\n %llu inode (%u objects, %u reclaimable)\n" " %llu us max scan time\n", max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr, div_u64(es_stats->es_stats_max_scan_time, 1000)); @@ -1998,7 +1998,7 @@ static void ext4_print_pending_tree(struct inode *inode) struct rb_node *node; struct pending_reservation *pr; - printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino); + printk(KERN_DEBUG "pending reservations for inode %llu:", inode->i_ino); tree = &EXT4_I(inode)->i_pending_tree; node = rb_first(&tree->root); while (node) { @@ -2214,7 +2214,7 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n", + es_debug("add [%u/%u) delayed to extent status tree of inode %llu\n", lblk, len, inode->i_ino); if (!len) return; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index f575751f1cae..379fb66dedbc 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -616,7 +616,7 @@ static int __track_range(handle_t *handle, struct inode *inode, void *arg, (struct __track_range_args *)arg; if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { - ext4_debug("Special inode %ld being modified\n", inode->i_ino); + ext4_debug("Special inode %llu being modified\n", inode->i_ino); return -ECANCELED; } @@ -914,7 +914,7 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) spin_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; - ext4_debug("will try writing %d to %d for inode %ld\n", + ext4_debug("will try writing %d to %d for inode %llu\n", cur_lblk_off, new_blk_size, inode->i_ino); while (cur_lblk_off <= new_blk_size) { @@ -1792,7 +1792,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, cur = start; remaining = len; - ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", + ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %llu\n", start, start_pblk, len, ext4_ext_is_unwritten(ex), inode->i_ino); @@ -1903,7 +1903,7 @@ ext4_fc_replay_del_range(struct super_block *sb, if (ret) goto out; - ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n", + ext4_debug("DEL_RANGE, inode %llu, lblk %d, len %d\n", inode->i_ino, le32_to_cpu(lrange.fc_lblk), le32_to_cpu(lrange.fc_len)); while (remaining > 0) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b20a1bf866ab..628a74b2bbe6 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -253,13 +253,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) return; } if (icount_read(inode) > 1) { - ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d", + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%llu: count=%d", __func__, __LINE__, inode->i_ino, icount_read(inode)); return; } if (inode->i_nlink) { - ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n", + ext4_msg(sb, KERN_ERR, "%s:%d: inode #%llu: nlink=%d\n", __func__, __LINE__, inode->i_ino, inode->i_nlink); return; } @@ -631,7 +631,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, * * So add our directory's i_ino into the starting point for the hash. */ - *group = (*group + parent->i_ino) % ngroups; + *group = (*group + (unsigned int)parent->i_ino) % ngroups; /* * Use a quadratic hash to find a group with a free inode and some free @@ -1275,7 +1275,7 @@ got: * twice. */ err = -EIO; - ext4_error(sb, "failed to insert inode %lu: doubly allocated?", + ext4_error(sb, "failed to insert inode %llu: doubly allocated?", inode->i_ino); ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_IBITMAP_CORRUPT); @@ -1344,7 +1344,7 @@ got: goto fail_free_drop; } - ext4_debug("allocating inode %lu\n", inode->i_ino); + ext4_debug("allocating inode %llu\n", inode->i_ino); trace_ext4_allocate_inode(inode, dir, mode); brelse(inode_bitmap_bh); return ret; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index da76353b3a57..5aec759eed70 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -102,7 +102,7 @@ static int ext4_block_to_path(struct inode *inode, offsets[n++] = i_block & (ptrs - 1); final = ptrs; } else { - ext4_warning(inode->i_sb, "block %lu > max in inode %lu", + ext4_warning(inode->i_sb, "block %lu > max in inode %llu", i_block + direct_blocks + indirect_blocks + double_blocks, inode->i_ino); } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 1f6bc05593df..f846fcb7db24 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -119,7 +119,7 @@ int ext4_get_max_inline_size(struct inode *inode) error = ext4_get_inode_loc(inode, &iloc); if (error) { ext4_error_inode_err(inode, __func__, __LINE__, 0, -error, - "can't get inode location %lu", + "can't get inode location %llu", inode->i_ino); return 0; } @@ -512,7 +512,7 @@ static int ext4_read_inline_folio(struct inode *inode, struct folio *folio) BUG_ON(folio->index); if (!EXT4_I(inode)->i_inline_off) { - ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", + ext4_warning(inode->i_sb, "inode %llu doesn't have inline data.", inode->i_ino); goto out; } @@ -934,7 +934,7 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, struct ext4_dir_entry_2 *de = inline_start; void *dlimit = inline_start + inline_size; - trace_printk("inode %lu\n", dir->i_ino); + trace_printk("inode %llu\n", dir->i_ino); offset = 0; while ((void *)de < dlimit) { de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); @@ -1071,7 +1071,7 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode, ret = ext4_create_inline_data(handle, inode, inline_size); if (ret) { ext4_msg(inode->i_sb, KERN_EMERG, - "error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)", + "error restoring inline_data for inode -- potential data loss! (inode %llu, error %d)", inode->i_ino, ret); return; } @@ -1740,7 +1740,7 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) err = ext4_get_inode_loc(dir, &iloc); if (err) { EXT4_ERROR_INODE_ERR(dir, -err, - "error %d getting inode %lu block", + "error %d getting inode %llu block", err, dir->i_ino); return false; } @@ -1755,7 +1755,7 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block; if (!le32_to_cpu(de->inode)) { ext4_warning(dir->i_sb, - "bad inline directory (dir #%lu) - no `..'", + "bad inline directory (dir #%llu) - no `..'", dir->i_ino); goto out; } @@ -1769,7 +1769,7 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data) iloc.bh, inline_pos, inline_size, offset)) { ext4_warning(dir->i_sb, - "bad inline directory (dir #%lu) - " + "bad inline directory (dir #%llu) - " "inode %u, rec_len %u, name_len %d" "inline size %d", dir->i_ino, le32_to_cpu(de->inode), diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 396dc3a5d16b..d50f31124a78 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -262,7 +262,7 @@ void ext4_evict_inode(struct inode *inode) err = ext4_truncate(inode); if (err) { ext4_error_err(inode->i_sb, -err, - "couldn't truncate inode %lu (err %d)", + "couldn't truncate inode %llu (err %d)", inode->i_ino, err); goto stop_handle; } @@ -342,7 +342,7 @@ void ext4_da_update_reserve_space(struct inode *inode, spin_lock(&ei->i_block_reservation_lock); trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { - ext4_warning(inode->i_sb, "%s: ino %lu, used %d " + ext4_warning(inode->i_sb, "%s: ino %llu, used %d " "with only %d reserved data blocks", __func__, inode->i_ino, used, ei->i_reserved_data_blocks); @@ -475,7 +475,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, if (es_map->m_lblk != map->m_lblk || es_map->m_flags != map->m_flags || es_map->m_pblk != map->m_pblk) { - printk("ES cache assertion failed for inode: %lu " + printk("ES cache assertion failed for inode: %llu " "es_cached ex [%d/%d/%llu/%x] != " "found ex [%d/%d/%llu/%x] retval %d flags %x\n", inode->i_ino, es_map->m_lblk, es_map->m_len, @@ -515,7 +515,7 @@ static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, if (unlikely(retval != map2.m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", + "%llu: retval %d != map->m_len %d", inode->i_ino, retval, map2.m_len); WARN_ON(1); } @@ -563,7 +563,7 @@ int ext4_map_query_blocks(handle_t *handle, struct inode *inode, if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " - "%lu: retval %d != map->m_len %d", + "%llu: retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } @@ -630,7 +630,7 @@ int ext4_map_create_blocks(handle_t *handle, struct inode *inode, if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, - "ES len assertion failed for inode %lu: " + "ES len assertion failed for inode %llu: " "retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); @@ -937,7 +937,7 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, { int ret = 0; - ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", + ext4_debug("ext4_get_block_unwritten: inode %llu, create flag %d\n", inode->i_ino, create); ret = _ext4_get_block(inode, iblock, bh_result, EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); @@ -1659,7 +1659,7 @@ void ext4_da_release_space(struct inode *inode, int to_free) * harmless to return without any action. */ ext4_warning(inode->i_sb, "ext4_da_release_space: " - "ino %lu, to_free %d with only %d reserved " + "ino %llu, to_free %d with only %d reserved " "data blocks", inode->i_ino, to_free, ei->i_reserved_data_blocks); WARN_ON(1); @@ -2491,7 +2491,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, } ext4_msg(sb, KERN_CRIT, "Delayed block allocation failed for " - "inode %lu at logical offset %llu with" + "inode %llu at logical offset %llu with" " max blocks %u with error %d", inode->i_ino, (unsigned long long)map->m_lblk, @@ -2535,7 +2535,7 @@ update_disksize: err2 = ext4_mark_inode_dirty(handle, inode); if (err2) { ext4_error_err(inode->i_sb, -err2, - "Failed to mark inode %lu dirty", + "Failed to mark inode %llu dirty", inode->i_ino); } if (!err) @@ -2909,7 +2909,7 @@ retry: if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " - "%ld pages, ino %lu; err %d", __func__, + "%ld pages, ino %llu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); /* Release allocated io_end */ ext4_put_io_end(mpd->io_submit.io_end); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 3ae9cb50a0c0..1d0c3d4bdf47 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -477,7 +477,7 @@ static long swap_inode_boot_loader(struct super_block *sb, if (err < 0) { /* No need to update quota information. */ ext4_warning(inode->i_sb, - "couldn't mark inode #%lu dirty (err %d)", + "couldn't mark inode #%llu dirty (err %d)", inode->i_ino, err); /* Revert all changes: */ swap_inode_data(inode, inode_bl); @@ -493,7 +493,7 @@ static long swap_inode_boot_loader(struct super_block *sb, if (err < 0) { /* No need to update quota information. */ ext4_warning(inode_bl->i_sb, - "couldn't mark inode #%lu dirty (err %d)", + "couldn't mark inode #%llu dirty (err %d)", inode_bl->i_ino, err); goto revert; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 20e9fdaf4301..9e8041ac5623 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2266,7 +2266,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, folio_get(ac->ac_buddy_folio); /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { - int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; + int hash = (unsigned int)ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group); } @@ -3032,7 +3032,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* if stream allocation is enabled, use global goal */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { - int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; + int hash = (unsigned int)ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]); ac->ac_g_ex.fe_start = -1; @@ -5628,7 +5628,7 @@ void ext4_discard_preallocations(struct inode *inode) if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return; - mb_debug(sb, "discard preallocation for inode %lu\n", + mb_debug(sb, "discard preallocation for inode %llu\n", inode->i_ino); trace_ext4_discard_preallocations(inode, atomic_read(&ei->i_prealloc_active)); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 96ab95167bd6..477d43d7e294 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -455,7 +455,7 @@ int ext4_ext_migrate(struct inode *inode) * log, so disable fast commits for this transaction. */ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_MIGRATE, handle); - goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * + goal = ((((u32)inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; owner[0] = i_uid_read(inode); owner[1] = i_gid_read(inode); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index ce1f738dff93..ab17c1d3a7b5 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -420,21 +420,21 @@ static int mext_check_validity(struct inode *orig_inode, /* origin and donor should be different inodes */ if (orig_inode == donor_inode) { - ext4_debug("ext4 move extent: The argument files should not be same inode [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: The argument files should not be same inode [ino:orig %llu, donor %llu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } /* origin and donor should belone to the same filesystem */ if (orig_inode->i_sb != donor_inode->i_sb) { - ext4_debug("ext4 move extent: The argument files should be in same FS [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: The argument files should be in same FS [ino:orig %llu, donor %llu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } /* Regular file check */ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { - ext4_debug("ext4 move extent: The argument files should be regular file [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: The argument files should be regular file [ino:orig %llu, donor %llu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } @@ -477,26 +477,26 @@ static int mext_check_validity(struct inode *orig_inode, } if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { - ext4_debug("ext4 move extent: suid or sgid is set to donor file [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: suid or sgid is set to donor file [ino:orig %llu, donor %llu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) { - ext4_debug("ext4 move extent: donor should not be immutable or append file [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: donor should not be immutable or append file [ino:orig %llu, donor %llu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EPERM; } /* Ext4 move extent does not support swap files */ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { - ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %llu, donor %llu]\n", orig_inode->i_ino, donor_inode->i_ino); return -ETXTBSY; } if (ext4_is_quota_file(orig_inode) || ext4_is_quota_file(donor_inode)) { - ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %llu, donor %llu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EOPNOTSUPP; } @@ -523,7 +523,7 @@ static int mext_check_adjust_range(struct inode *orig_inode, /* Start offset should be same */ if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) != (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) { - ext4_debug("ext4 move extent: orig and donor's start offsets are not aligned [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: orig and donor's start offsets are not aligned [ino:orig %llu, donor %llu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } @@ -533,7 +533,7 @@ static int mext_check_adjust_range(struct inode *orig_inode, (*len > EXT_MAX_BLOCKS) || (donor_start + *len >= EXT_MAX_BLOCKS) || (orig_start + *len >= EXT_MAX_BLOCKS)) { - ext4_debug("ext4 move extent: Can't handle over [%u] blocks [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: Can't handle over [%u] blocks [ino:orig %llu, donor %llu]\n", EXT_MAX_BLOCKS, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; @@ -550,7 +550,7 @@ static int mext_check_adjust_range(struct inode *orig_inode, else if (donor_eof < donor_start + *len - 1) *len = donor_eof - donor_start; if (!*len) { - ext4_debug("ext4 move extent: len should not be 0 [ino:orig %lu, donor %lu]\n", + ext4_debug("ext4 move extent: len should not be 0 [ino:orig %llu, donor %llu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index c4b5e252af0e..503dc9ffd614 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -144,7 +144,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, bh = ext4_bread(NULL, inode, block, 0); if (IS_ERR(bh)) { __ext4_warning(inode->i_sb, func, line, - "inode #%lu: lblock %lu: comm %s: " + "inode #%llu: lblock %lu: comm %s: " "error %ld reading directory block", inode->i_ino, (unsigned long)block, current->comm, PTR_ERR(bh)); @@ -841,7 +841,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, indirect = root->info.indirect_levels; if (indirect >= ext4_dir_htree_level(dir->i_sb)) { ext4_warning(dir->i_sb, - "Directory (ino: %lu) htree depth %#06x exceed" + "Directory (ino: %llu) htree depth %#06x exceed" "supported value", dir->i_ino, ext4_dir_htree_level(dir->i_sb)); if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) { @@ -1793,7 +1793,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { ext4_warning(inode->i_sb, - "Inconsistent encryption contexts: %lu/%lu", + "Inconsistent encryption contexts: %llu/%llu", dir->i_ino, inode->i_ino); iput(inode); return ERR_PTR(-EPERM); @@ -2227,7 +2227,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, csum_size = sizeof(struct ext4_dir_entry_tail); blocksize = dir->i_sb->s_blocksize; - dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); + dxtrace(printk(KERN_DEBUG "Creating index: inode %llu\n", dir->i_ino)); BUFFER_TRACE(bh, "get_write_access"); retval = ext4_journal_get_write_access(handle, dir->i_sb, bh, EXT4_JTR_NONE); @@ -2523,7 +2523,7 @@ again: restart = 1; } if (add_level && levels == ext4_dir_htree_level(sb)) { - ext4_warning(sb, "Directory (ino: %lu) index full, " + ext4_warning(sb, "Directory (ino: %llu) index full, " "reach max htree level :%d", dir->i_ino, levels); if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) { diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index c0022f0bff87..64ea47624233 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -179,8 +179,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) } else brelse(iloc.bh); - ext4_debug("superblock will point to %lu\n", inode->i_ino); - ext4_debug("orphan inode %lu will point to %d\n", + ext4_debug("superblock will point to %llu\n", inode->i_ino); + ext4_debug("orphan inode %llu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out: ext4_std_error(sb, err); @@ -249,7 +249,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) } mutex_lock(&sbi->s_orphan_lock); - ext4_debug("remove inode %lu from orphan list\n", inode->i_ino); + ext4_debug("remove inode %llu from orphan list\n", inode->i_ino); prev = ei->i_orphan.prev; list_del_init(&ei->i_orphan); @@ -284,7 +284,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) struct inode *i_prev = &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; - ext4_debug("orphan inode %lu will point to %u\n", + ext4_debug("orphan inode %llu will point to %u\n", i_prev->i_ino, ino_next); err = ext4_reserve_inode_write(handle, i_prev, &iloc2); if (err) { @@ -328,9 +328,9 @@ static void ext4_process_orphan(struct inode *inode, if (inode->i_nlink) { if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, - "%s: truncating inode %lu to %lld bytes", + "%s: truncating inode %llu to %lld bytes", __func__, inode->i_ino, inode->i_size); - ext4_debug("truncating inode %lu to %lld bytes\n", + ext4_debug("truncating inode %llu to %lld bytes\n", inode->i_ino, inode->i_size); inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); @@ -349,9 +349,9 @@ static void ext4_process_orphan(struct inode *inode, } else { if (test_opt(sb, DEBUG)) ext4_msg(sb, KERN_DEBUG, - "%s: deleting unreferenced inode %lu", + "%s: deleting unreferenced inode %llu", __func__, inode->i_ino); - ext4_debug("deleting unreferenced inode %lu\n", + ext4_debug("deleting unreferenced inode %llu\n", inode->i_ino); (*nr_orphans)++; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index a8c95eee91b7..86011275ad83 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -180,7 +180,7 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) struct super_block *sb = inode->i_sb; int ret = 0; - ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p," + ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %llu,list->next 0x%p," "list->prev 0x%p\n", io_end, inode->i_ino, io_end->list.next, io_end->list.prev); @@ -204,7 +204,7 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) ext4_msg(sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " - "(inode %lu, error %d)", inode->i_ino, ret); + "(inode %llu, error %d)", inode->i_ino, ret); } ext4_clear_io_unwritten_flag(io_end); @@ -221,7 +221,7 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) if (list_empty(head)) return; - ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); + ext4_debug("Dump inode %llu completed io list\n", inode->i_ino); list_for_each_entry(io_end, head, list) { cur = &io_end->list; before = cur->prev; @@ -229,7 +229,7 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) after = cur->next; io_end1 = container_of(after, ext4_io_end_t, list); - ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", + ext4_debug("io 0x%p from inode %llu,prev 0x%p,next 0x%p\n", io_end, inode->i_ino, io_end0, io_end1); } #endif @@ -366,7 +366,7 @@ static void ext4_end_bio(struct bio *bio) if (bio->bi_status) { struct inode *inode = io_end->inode; - ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " + ext4_warning(inode->i_sb, "I/O error %d writing to inode %llu " "starting block %llu)", bio->bi_status, inode->i_ino, (unsigned long long) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 43f680c750ae..781c083000c2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -848,12 +848,12 @@ void __ext4_error_inode(struct inode *inode, const char *function, vaf.va = &args; if (block) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " - "inode #%lu: block %llu: comm %s: %pV\n", + "inode #%llu: block %llu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, &vaf); else printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: " - "inode #%lu: comm %s: %pV\n", + "inode #%llu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); @@ -888,13 +888,13 @@ void __ext4_error_file(struct file *file, const char *function, vaf.va = &args; if (block) printk(KERN_CRIT - "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "EXT4-fs error (device %s): %s:%d: inode #%llu: " "block %llu: comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, block, current->comm, path, &vaf); else printk(KERN_CRIT - "EXT4-fs error (device %s): %s:%d: inode #%lu: " + "EXT4-fs error (device %s): %s:%d: inode #%llu: " "comm %s: path %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, path, &vaf); @@ -1035,14 +1035,14 @@ void __ext4_warning_inode(const struct inode *inode, const char *function, vaf.fmt = fmt; vaf.va = &args; printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: " - "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, + "inode #%llu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); } void __ext4_grp_locked_error(const char *function, unsigned int line, struct super_block *sb, ext4_group_t grp, - unsigned long ino, ext4_fsblk_t block, + u64 ino, ext4_fsblk_t block, const char *fmt, ...) __releases(bitlock) __acquires(bitlock) @@ -1061,7 +1061,7 @@ __acquires(bitlock) printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ", sb->s_id, function, line, grp); if (ino) - printk(KERN_CONT "inode %lu: ", ino); + printk(KERN_CONT "inode %llu: ", ino); if (block) printk(KERN_CONT "block %llu:", (unsigned long long) block); @@ -1170,7 +1170,7 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) list_for_each(l, &sbi->s_orphan) { struct inode *inode = orphan_list_entry(l); printk(KERN_ERR " " - "inode %s:%lu at %p: mode %o, nlink %d, next %d\n", + "inode %s:%llu at %p: mode %o, nlink %d, next %d\n", inode->i_sb->s_id, inode->i_ino, inode, inode->i_mode, inode->i_nlink, NEXT_ORPHAN(inode)); @@ -1446,7 +1446,7 @@ static void ext4_free_in_core_inode(struct inode *inode) { fscrypt_free_inode(inode); if (!list_empty(&(EXT4_I(inode)->i_fc_list))) { - pr_warn("%s: inode %ld still in fc list", + pr_warn("%s: inode %llu still in fc list", __func__, inode->i_ino); } kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); @@ -1456,7 +1456,7 @@ static void ext4_destroy_inode(struct inode *inode) { if (ext4_inode_orphan_tracked(inode)) { ext4_msg(inode->i_sb, KERN_ERR, - "Inode %lu (%p): inode tracked as orphan!", + "Inode %llu (%p): inode tracked as orphan!", inode->i_ino, EXT4_I(inode)); print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, EXT4_I(inode), sizeof(struct ext4_inode_info), @@ -1467,7 +1467,7 @@ static void ext4_destroy_inode(struct inode *inode) if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) && WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks)) ext4_msg(inode->i_sb, KERN_ERR, - "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!", + "Inode %llu (%p): i_reserved_data_blocks (%u) not cleared!", inode->i_ino, EXT4_I(inode), EXT4_I(inode)->i_reserved_data_blocks); } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 7bf9ba19a89d..60aec4712f7f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -64,7 +64,7 @@ #ifdef EXT4_XATTR_DEBUG # define ea_idebug(inode, fmt, ...) \ - printk(KERN_DEBUG "inode %s:%lu: " fmt "\n", \ + printk(KERN_DEBUG "inode %s:%llu: " fmt "\n", \ inode->i_sb->s_id, inode->i_ino, ##__VA_ARGS__) # define ea_bdebug(bh, fmt, ...) \ printk(KERN_DEBUG "block %pg:%lu: " fmt "\n", \ @@ -1035,7 +1035,7 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, ref_count = ext4_xattr_inode_get_ref(ea_inode); if ((ref_count == 0 && ref_change < 0) || (ref_count == U64_MAX && ref_change > 0)) { ext4_error_inode(ea_inode, __func__, __LINE__, 0, - "EA inode %lu ref wraparound: ref_count=%lld ref_change=%d", + "EA inode %llu ref wraparound: ref_count=%lld ref_change=%d", ea_inode->i_ino, ref_count, ref_change); brelse(iloc.bh); ret = -EFSCORRUPTED; @@ -1046,7 +1046,7 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, if (ref_change > 0) { if (ref_count == 1) { - WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u", + WARN_ONCE(ea_inode->i_nlink, "EA inode %llu i_nlink=%u", ea_inode->i_ino, ea_inode->i_nlink); set_nlink(ea_inode, 1); @@ -1055,7 +1055,7 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, } else { if (ref_count == 0) { WARN_ONCE(ea_inode->i_nlink != 1, - "EA inode %lu i_nlink=%u", + "EA inode %llu i_nlink=%u", ea_inode->i_ino, ea_inode->i_nlink); clear_nlink(ea_inode); @@ -2854,7 +2854,7 @@ shift: cleanup: if (error && (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count))) { - ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete some EAs or run e2fsck.", + ext4_warning(inode->i_sb, "Unable to expand inode %llu. Delete some EAs or run e2fsck.", inode->i_ino); mnt_count = le16_to_cpu(sbi->s_es->s_mnt_count); } diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 8c76400ba631..0b8be500db65 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -773,7 +773,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) { set_inode_flag(dic->inode, FI_COMPRESS_CORRUPT); f2fs_info_ratelimited(sbi, - "checksum invalid, nid = %lu, %x vs %x", + "checksum invalid, nid = %llu, %x vs %x", dic->inode->i_ino, provided, calculated); } @@ -932,7 +932,7 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) return false; out: - f2fs_warn(sbi, "access invalid cluster, ino:%lu, nid:%u, ofs_in_node:%u, reason:%s", + f2fs_warn(sbi, "access invalid cluster, ino:%llu, nid:%u, ofs_in_node:%u, reason:%s", dn->inode->i_ino, dn->nid, dn->ofs_in_node, reason); set_sbi_flag(sbi, SBI_NEED_FSCK); return true; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index f70092e231f0..38802ee2e40d 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -368,7 +368,7 @@ start_find_entry: max_depth = F2FS_I(dir)->i_current_depth; if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { - f2fs_warn(F2FS_I_SB(dir), "Corrupted max_depth of %lu: %u", + f2fs_warn(F2FS_I_SB(dir), "Corrupted max_depth of %llu: %u", dir->i_ino, max_depth); max_depth = MAX_DIR_HASH_DEPTH; f2fs_i_depth_write(dir, max_depth); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 0ed84cc065a7..d73aeef333a2 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -34,7 +34,7 @@ bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio) if (!f2fs_is_valid_blkaddr(sbi, ei.blk, DATA_GENERIC_ENHANCE) || !f2fs_is_valid_blkaddr(sbi, ei.blk + ei.len - 1, DATA_GENERIC_ENHANCE)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix", + f2fs_warn(sbi, "%s: inode (ino=%llx) extent info [%u, %u, %u] is incorrect, run fsck to fix", __func__, inode->i_ino, ei.blk, ei.fofs, ei.len); return false; @@ -50,14 +50,14 @@ bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio) if (devi == 0) { f2fs_warn(sbi, - "%s: inode (ino=%lx) is an alias of meta device", + "%s: inode (ino=%llx) is an alias of meta device", __func__, inode->i_ino); return false; } if (bdev_is_zoned(FDEV(devi).bdev)) { f2fs_warn(sbi, - "%s: device alias inode (ino=%lx)'s extent info " + "%s: device alias inode (ino=%llx)'s extent info " "[%u, %u, %u] maps to zoned block device", __func__, inode->i_ino, ei.blk, ei.fofs, ei.len); return false; @@ -65,7 +65,7 @@ bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio) return true; } - f2fs_warn(sbi, "%s: device alias inode (ino=%lx)'s extent info " + f2fs_warn(sbi, "%s: device alias inode (ino=%llx)'s extent info " "[%u, %u, %u] is inconsistent w/ any devices", __func__, inode->i_ino, ei.blk, ei.fofs, ei.len); return false; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index bb34e864d0ef..760e6d80bbdd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2706,7 +2706,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); if (unlikely(sbi->total_valid_block_count < count)) { - f2fs_warn(sbi, "Inconsistent total_valid_block_count:%u, ino:%lu, count:%u", + f2fs_warn(sbi, "Inconsistent total_valid_block_count:%u, ino:%llu, count:%u", sbi->total_valid_block_count, inode->i_ino, count); sbi->total_valid_block_count = 0; set_sbi_flag(sbi, SBI_NEED_FSCK); @@ -2719,7 +2719,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, sbi->current_reserved_blocks + count); spin_unlock(&sbi->stat_lock); if (unlikely(inode->i_blocks < sectors)) { - f2fs_warn(sbi, "Inconsistent i_blocks, ino:%lu, iblocks:%llu, sectors:%llu", + f2fs_warn(sbi, "Inconsistent i_blocks, ino:%llu, iblocks:%llu, sectors:%llu", inode->i_ino, (unsigned long long)inode->i_blocks, (unsigned long long)sectors); @@ -2993,7 +2993,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, dquot_free_inode(inode); } else { if (unlikely(inode->i_blocks == 0)) { - f2fs_warn(sbi, "dec_valid_node_count: inconsistent i_blocks, ino:%lu, iblocks:%llu", + f2fs_warn(sbi, "dec_valid_node_count: inconsistent i_blocks, ino:%llu, iblocks:%llu", inode->i_ino, (unsigned long long)inode->i_blocks); set_sbi_flag(sbi, SBI_NEED_FSCK); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c8a2f17a8f11..a7957e03ee03 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1917,7 +1917,7 @@ next_alloc: f2fs_up_write(&sbi->pin_sem); err = -ENOSPC; f2fs_warn_ratelimited(sbi, - "ino:%lu, start:%lu, end:%lu, need to trigger GC to " + "ino:%llu, start:%lu, end:%lu, need to trigger GC to " "reclaim enough free segment when checkpoint is enabled", inode->i_ino, pg_start, pg_end); goto out_err; @@ -2307,7 +2307,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate) * f2fs_is_atomic_file. */ if (get_dirty_pages(inode)) - f2fs_warn(sbi, "Unexpected flush for atomic writes: ino=%lu, npages=%u", + f2fs_warn(sbi, "Unexpected flush for atomic writes: ino=%llu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) @@ -3494,7 +3494,7 @@ int f2fs_pin_file_control(struct inode *inode, bool inc) return -EINVAL; if (fi->i_gc_failures >= sbi->gc_pin_file_threshold) { - f2fs_warn(sbi, "%s: Enable GC = ino %lx after %x GC trials", + f2fs_warn(sbi, "%s: Enable GC = ino %llx after %x GC trials", __func__, inode->i_ino, fi->i_gc_failures); clear_inode_flag(inode, FI_PIN_FILE); return -EAGAIN; @@ -3679,7 +3679,7 @@ static int f2fs_ioc_enable_verity(struct file *filp, unsigned long arg) if (!f2fs_sb_has_verity(F2FS_I_SB(inode))) { f2fs_warn(F2FS_I_SB(inode), - "Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem", + "Can't enable fs-verity on inode %llu: the verity feature is not enabled on this filesystem", inode->i_ino); return -EOPNOTSUPP; } @@ -3950,7 +3950,7 @@ out: } else if (released_blocks && atomic_read(&fi->i_compr_blocks)) { set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: partial blocks were released i_ino=%lx " + f2fs_warn(sbi, "%s: partial blocks were released i_ino=%llx " "iblocks=%llu, released=%u, compr_blocks=%u, " "run fsck to fix.", __func__, inode->i_ino, inode->i_blocks, @@ -4133,7 +4133,7 @@ unlock_inode: } else if (reserved_blocks && atomic_read(&fi->i_compr_blocks)) { set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_warn(sbi, "%s: partial blocks were reserved i_ino=%lx " + f2fs_warn(sbi, "%s: partial blocks were reserved i_ino=%llx " "iblocks=%llu, reserved=%u, compr_blocks=%u, " "run fsck to fix.", __func__, inode->i_ino, inode->i_blocks, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index f46b2673d31f..c0c8a1056d6b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1622,7 +1622,7 @@ next_step: iput(inode); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_err_ratelimited(sbi, - "inode %lx has both inline_data flag and " + "inode %llu has both inline_data flag and " "data block, nid=%u, ofs_in_node=%u", inode->i_ino, dni.nid, ofs_in_node); continue; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 0a1052d5ee62..2669439b9413 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -176,7 +176,7 @@ int f2fs_convert_inline_folio(struct dnode_of_data *dn, struct folio *folio) if (unlikely(dn->data_blkaddr != NEW_ADDR)) { f2fs_put_dnode(dn); set_sbi_flag(fio.sbi, SBI_NEED_FSCK); - f2fs_warn(fio.sbi, "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", + f2fs_warn(fio.sbi, "%s: corrupted inline inode ino=%llu, i_addr[0]:0x%x, run fsck to fix.", __func__, dn->inode->i_ino, dn->data_blkaddr); f2fs_handle_error(fio.sbi, ERROR_INVALID_BLKADDR); return -EFSCORRUPTED; @@ -431,7 +431,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct folio *ifolio, if (unlikely(dn.data_blkaddr != NEW_ADDR)) { f2fs_put_dnode(&dn); set_sbi_flag(F2FS_F_SB(folio), SBI_NEED_FSCK); - f2fs_warn(F2FS_F_SB(folio), "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", + f2fs_warn(F2FS_F_SB(folio), "%s: corrupted inline inode ino=%llu, i_addr[0]:0x%x, run fsck to fix.", __func__, dir->i_ino, dn.data_blkaddr); f2fs_handle_error(F2FS_F_SB(folio), ERROR_INVALID_BLKADDR); err = -EFSCORRUPTED; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index e0f850b3f0c3..f27198d6695b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -203,14 +203,14 @@ static bool sanity_check_compress_inode(struct inode *inode, if (ri->i_compress_algorithm >= COMPRESS_MAX) { f2fs_warn(sbi, - "%s: inode (ino=%lx) has unsupported compress algorithm: %u, run fsck to fix", + "%s: inode (ino=%llx) has unsupported compress algorithm: %u, run fsck to fix", __func__, inode->i_ino, ri->i_compress_algorithm); return false; } if (le64_to_cpu(ri->i_compr_blocks) > SECTOR_TO_BLOCK(inode->i_blocks)) { f2fs_warn(sbi, - "%s: inode (ino=%lx) has inconsistent i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix", + "%s: inode (ino=%llx) has inconsistent i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix", __func__, inode->i_ino, le64_to_cpu(ri->i_compr_blocks), SECTOR_TO_BLOCK(inode->i_blocks)); return false; @@ -218,7 +218,7 @@ static bool sanity_check_compress_inode(struct inode *inode, if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE || ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) { f2fs_warn(sbi, - "%s: inode (ino=%lx) has unsupported log cluster size: %u, run fsck to fix", + "%s: inode (ino=%llx) has unsupported log cluster size: %u, run fsck to fix", __func__, inode->i_ino, ri->i_log_cluster_size); return false; } @@ -262,7 +262,7 @@ static bool sanity_check_compress_inode(struct inode *inode, return true; err_level: - f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported compress level: %u, run fsck to fix", + f2fs_warn(sbi, "%s: inode (ino=%llx) has unsupported compress level: %u, run fsck to fix", __func__, inode->i_ino, clevel); return false; } @@ -276,40 +276,40 @@ static bool sanity_check_inode(struct inode *inode, struct folio *node_folio) iblocks = le64_to_cpu(F2FS_INODE(node_folio)->i_blocks); if (!iblocks) { - f2fs_warn(sbi, "%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, run fsck to fix.", + f2fs_warn(sbi, "%s: corrupted inode i_blocks i_ino=%llx iblocks=%llu, run fsck to fix.", __func__, inode->i_ino, iblocks); return false; } if (ino_of_node(node_folio) != nid_of_node(node_folio)) { - f2fs_warn(sbi, "%s: corrupted inode footer i_ino=%lx, ino,nid: [%u, %u] run fsck to fix.", + f2fs_warn(sbi, "%s: corrupted inode footer i_ino=%llx, ino,nid: [%u, %u] run fsck to fix.", __func__, inode->i_ino, ino_of_node(node_folio), nid_of_node(node_folio)); return false; } if (ino_of_node(node_folio) == fi->i_xattr_nid) { - f2fs_warn(sbi, "%s: corrupted inode i_ino=%lx, xnid=%x, run fsck to fix.", + f2fs_warn(sbi, "%s: corrupted inode i_ino=%llx, xnid=%x, run fsck to fix.", __func__, inode->i_ino, fi->i_xattr_nid); return false; } if (S_ISDIR(inode->i_mode) && unlikely(inode->i_nlink == 1)) { - f2fs_warn(sbi, "%s: directory inode (ino=%lx) has a single i_nlink", + f2fs_warn(sbi, "%s: directory inode (ino=%llx) has a single i_nlink", __func__, inode->i_ino); return false; } if (f2fs_has_extra_attr(inode)) { if (!f2fs_sb_has_extra_attr(sbi)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) is with extra_attr, but extra_attr feature is off", + f2fs_warn(sbi, "%s: inode (ino=%llx) is with extra_attr, but extra_attr feature is off", __func__, inode->i_ino); return false; } if (fi->i_extra_isize > F2FS_TOTAL_EXTRA_ATTR_SIZE || fi->i_extra_isize < F2FS_MIN_EXTRA_ATTR_SIZE || fi->i_extra_isize % sizeof(__le32)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_extra_isize: %d, max: %zu", + f2fs_warn(sbi, "%s: inode (ino=%llx) has corrupted i_extra_isize: %d, max: %zu", __func__, inode->i_ino, fi->i_extra_isize, F2FS_TOTAL_EXTRA_ATTR_SIZE); return false; @@ -327,7 +327,7 @@ static bool sanity_check_inode(struct inode *inode, struct folio *node_folio) f2fs_has_inline_xattr(inode) && (fi->i_inline_xattr_size < MIN_INLINE_XATTR_SIZE || fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, min: %zu, max: %lu", + f2fs_warn(sbi, "%s: inode (ino=%llx) has corrupted i_inline_xattr_size: %d, min: %zu, max: %lu", __func__, inode->i_ino, fi->i_inline_xattr_size, MIN_INLINE_XATTR_SIZE, MAX_INLINE_XATTR_SIZE); return false; @@ -335,64 +335,64 @@ static bool sanity_check_inode(struct inode *inode, struct folio *node_folio) if (!f2fs_sb_has_extra_attr(sbi)) { if (f2fs_sb_has_project_quota(sbi)) { - f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + f2fs_warn(sbi, "%s: corrupted inode ino=%llx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_PRJQUOTA); return false; } if (f2fs_sb_has_inode_chksum(sbi)) { - f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + f2fs_warn(sbi, "%s: corrupted inode ino=%llx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_INODE_CHKSUM); return false; } if (f2fs_sb_has_flexible_inline_xattr(sbi)) { - f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + f2fs_warn(sbi, "%s: corrupted inode ino=%llx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR); return false; } if (f2fs_sb_has_inode_crtime(sbi)) { - f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + f2fs_warn(sbi, "%s: corrupted inode ino=%llx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_INODE_CRTIME); return false; } if (f2fs_sb_has_compression(sbi)) { - f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.", + f2fs_warn(sbi, "%s: corrupted inode ino=%llx, wrong feature flag: %u, run fsck to fix.", __func__, inode->i_ino, F2FS_FEATURE_COMPRESSION); return false; } } if (f2fs_sanity_check_inline_data(inode, node_folio)) { - f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix", + f2fs_warn(sbi, "%s: inode (ino=%llx, mode=%u) should not have inline_data, run fsck to fix", __func__, inode->i_ino, inode->i_mode); return false; } if (f2fs_has_inline_dentry(inode) && !S_ISDIR(inode->i_mode)) { - f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_dentry, run fsck to fix", + f2fs_warn(sbi, "%s: inode (ino=%llx, mode=%u) should not have inline_dentry, run fsck to fix", __func__, inode->i_ino, inode->i_mode); return false; } if ((fi->i_flags & F2FS_CASEFOLD_FL) && !f2fs_sb_has_casefold(sbi)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) has casefold flag, but casefold feature is off", + f2fs_warn(sbi, "%s: inode (ino=%llx) has casefold flag, but casefold feature is off", __func__, inode->i_ino); return false; } if (fi->i_xattr_nid && f2fs_check_nid_range(sbi, fi->i_xattr_nid)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_xattr_nid: %u, run fsck to fix.", + f2fs_warn(sbi, "%s: inode (ino=%llx) has corrupted i_xattr_nid: %u, run fsck to fix.", __func__, inode->i_ino, fi->i_xattr_nid); return false; } if (IS_DEVICE_ALIASING(inode)) { if (!f2fs_sb_has_device_alias(sbi)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) has device alias flag, but the feature is off", + f2fs_warn(sbi, "%s: inode (ino=%llx) has device alias flag, but the feature is off", __func__, inode->i_ino); return false; } if (!f2fs_is_pinned_file(inode)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) has device alias flag, but is not pinned", + f2fs_warn(sbi, "%s: inode (ino=%llx) has device alias flag, but is not pinned", __func__, inode->i_ino); return false; } @@ -925,7 +925,7 @@ retry: */ if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { f2fs_warn(F2FS_I_SB(inode), - "f2fs_evict_inode: inconsistent node id, ino:%lu", + "f2fs_evict_inode: inconsistent node id, ino:%llu", inode->i_ino); f2fs_inode_synced(inode); set_sbi_flag(sbi, SBI_NEED_FSCK); @@ -954,7 +954,7 @@ retry: */ if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { f2fs_warn(sbi, - "f2fs_evict_inode: inode is dirty, ino:%lu", + "f2fs_evict_inode: inode is dirty, ino:%llu", inode->i_ino); f2fs_inode_synced(inode); set_sbi_flag(sbi, SBI_NEED_FSCK); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e360f08a9586..efbb0732d420 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -505,7 +505,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, } if (inode->i_nlink == 0) { - f2fs_warn(F2FS_I_SB(inode), "%s: inode (ino=%lx) has zero i_nlink", + f2fs_warn(F2FS_I_SB(inode), "%s: inode (ino=%llx) has zero i_nlink", __func__, inode->i_ino); err = -EFSCORRUPTED; set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); @@ -515,7 +515,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (IS_ENCRYPTED(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { - f2fs_warn(F2FS_I_SB(inode), "Inconsistent encryption contexts: %lu/%lu", + f2fs_warn(F2FS_I_SB(inode), "Inconsistent encryption contexts: %llu/%llu", dir->i_ino, inode->i_ino); err = -EPERM; goto out_iput; @@ -573,11 +573,11 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) } if (unlikely(inode->i_nlink == 0)) { - f2fs_warn(sbi, "%s: inode (ino=%lx) has zero i_nlink", + f2fs_warn(sbi, "%s: inode (ino=%llx) has zero i_nlink", __func__, inode->i_ino); goto corrupted; } else if (S_ISDIR(inode->i_mode) && unlikely(inode->i_nlink == 1)) { - f2fs_warn(sbi, "%s: directory inode (ino=%lx) has a single i_nlink", + f2fs_warn(sbi, "%s: directory inode (ino=%llx) has a single i_nlink", __func__, inode->i_ino); goto corrupted; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5ca6f518cfa1..5a7679c6317e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -847,7 +847,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) err = -EFSCORRUPTED; f2fs_err_ratelimited(sbi, "inode mapping table is corrupted, run fsck to fix it, " - "ino:%lu, nid:%u, level:%d, offset:%d", + "ino:%llu, nid:%u, level:%d, offset:%d", dn->inode->i_ino, nids[i], level, offset[level]); set_sbi_flag(sbi, SBI_NEED_FSCK); goto release_pages; @@ -1013,7 +1013,7 @@ static int truncate_dnode(struct dnode_of_data *dn) return PTR_ERR(folio); if (IS_INODE(folio) || ino_of_node(folio) != dn->inode->i_ino) { - f2fs_err(sbi, "incorrect node reference, ino: %lu, nid: %u, ino_of_node: %u", + f2fs_err(sbi, "incorrect node reference, ino: %llu, nid: %u, ino_of_node: %u", dn->inode->i_ino, dn->nid, ino_of_node(folio)); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INVALID_NODE_REFERENCE); @@ -1194,7 +1194,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) if (level <= 0) { if (!level) { level = -EFSCORRUPTED; - f2fs_err(sbi, "%s: inode ino=%lx has corrupted node block, from:%lu addrs:%u", + f2fs_err(sbi, "%s: inode ino=%llx has corrupted node block, from:%lu addrs:%u", __func__, inode->i_ino, from, ADDRS_PER_INODE(inode)); set_sbi_flag(sbi, SBI_NEED_FSCK); @@ -1265,7 +1265,7 @@ skip_partial: set_sbi_flag(F2FS_F_SB(folio), SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); f2fs_err_ratelimited(sbi, - "truncate node fail, ino:%lu, nid:%u, " + "truncate node fail, ino:%llu, nid:%u, " "offset[0]:%d, offset[1]:%d, nofs:%d", inode->i_ino, dn.nid, offset[0], offset[1], nofs); @@ -1351,7 +1351,7 @@ int f2fs_remove_inode_page(struct inode *inode) if (unlikely(inode->i_blocks != 0 && inode->i_blocks != 8)) { f2fs_warn(F2FS_I_SB(inode), - "f2fs_remove_inode_page: inconsistent i_blocks, ino:%lu, iblocks:%llu", + "f2fs_remove_inode_page: inconsistent i_blocks, ino:%llu, iblocks:%llu", inode->i_ino, (unsigned long long)inode->i_blocks); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index a26071f2b0bc..3d3dacec9482 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -232,7 +232,7 @@ out: name = ""; else name = raw_inode->i_name; - f2fs_notice(F2FS_I_SB(inode), "%s: ino = %x, name = %s, dir = %lx, err = %d", + f2fs_notice(F2FS_I_SB(inode), "%s: ino = %x, name = %s, dir = %llu, err = %d", __func__, ino_of_node(ifolio), name, IS_ERR(dir) ? 0 : dir->i_ino, err); return err; @@ -532,7 +532,7 @@ got_it: max_addrs = ADDRS_PER_PAGE(dn->node_folio, dn->inode); if (ofs_in_node >= max_addrs) { - f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u", + f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%llu, nid:%u, max:%u", ofs_in_node, dn->inode->i_ino, nid, max_addrs); f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUMMARY); return -EFSCORRUPTED; @@ -674,7 +674,7 @@ retry_dn: f2fs_bug_on(sbi, ni.ino != ino_of_node(folio)); if (ofs_of_node(dn.node_folio) != ofs_of_node(folio)) { - f2fs_warn(sbi, "Inconsistent ofs_of_node, ino:%lu, ofs:%u, %u", + f2fs_warn(sbi, "Inconsistent ofs_of_node, ino:%llu, ofs:%u, %u", inode->i_ino, ofs_of_node(dn.node_folio), ofs_of_node(folio)); err = -EFSCORRUPTED; @@ -748,7 +748,7 @@ retry_prev: if (f2fs_is_valid_blkaddr(sbi, dest, DATA_GENERIC_ENHANCE_UPDATE)) { - f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u", + f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%llu, ofs:%u", dest, inode->i_ino, dn.ofs_in_node); err = -EFSCORRUPTED; goto err; @@ -768,7 +768,7 @@ retry_prev: err: f2fs_put_dnode(&dn); out: - f2fs_notice(sbi, "recover_data: ino = %lx, nid = %x (i_size: %s), " + f2fs_notice(sbi, "recover_data: ino = %llx, nid = %x (i_size: %s), " "range (%u, %u), recovered = %d, err = %d", inode->i_ino, nid_of_node(folio), file_keep_isize(inode) ? "keep" : "recover", diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 941dc62a6d6f..610d5810074d 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -365,7 +365,7 @@ static int lookup_all_xattrs(struct inode *inode, struct folio *ifolio, *xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name); if (!*xe) { - f2fs_err(F2FS_I_SB(inode), "lookup inode (%lu) has corrupted xattr", + f2fs_err(F2FS_I_SB(inode), "lookup inode (%llu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); err = -ENODATA; @@ -585,7 +585,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if ((void *)(entry) + sizeof(__u32) > last_base_addr || (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { - f2fs_err(F2FS_I_SB(inode), "list inode (%lu) has corrupted xattr", + f2fs_err(F2FS_I_SB(inode), "list inode (%llu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); f2fs_handle_error(F2FS_I_SB(inode), @@ -664,14 +664,14 @@ retry: if (!F2FS_I(inode)->i_xattr_nid) { error = f2fs_recover_xattr_data(inode, NULL); f2fs_notice(F2FS_I_SB(inode), - "recover xattr in inode (%lu), error(%d)", + "recover xattr in inode (%llu), error(%d)", inode->i_ino, error); if (!error) { kfree(base_addr); goto retry; } } - f2fs_err(F2FS_I_SB(inode), "set inode (%lu) has corrupted xattr", + f2fs_err(F2FS_I_SB(inode), "set inode (%llu) has corrupted xattr", inode->i_ino); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; @@ -699,7 +699,7 @@ retry: while (!IS_XATTR_LAST_ENTRY(last)) { if ((void *)(last) + sizeof(__u32) > last_base_addr || (void *)XATTR_NEXT_ENTRY(last) > last_base_addr) { - f2fs_err(F2FS_I_SB(inode), "inode (%lu) has invalid last xattr entry, entry_size: %zu", + f2fs_err(F2FS_I_SB(inode), "inode (%llu) has invalid last xattr entry, entry_size: %zu", inode->i_ino, ENTRY_SIZE(last)); set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); error = -EFSCORRUPTED; diff --git a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c index 26d367e3668d..e85222892038 100644 --- a/fs/freevxfs/vxfs_bmap.c +++ b/fs/freevxfs/vxfs_bmap.c @@ -260,12 +260,12 @@ vxfs_bmap1(struct inode *ip, long iblock) if (VXFS_ISIMMED(vip)) goto unsupp; - printk(KERN_WARNING "vxfs: inode %ld has no valid orgtype (%x)\n", + printk(KERN_WARNING "vxfs: inode %llu has no valid orgtype (%x)\n", ip->i_ino, vip->vii_orgtype); BUG(); unsupp: - printk(KERN_WARNING "vxfs: inode %ld has an unsupported orgtype (%x)\n", + printk(KERN_WARNING "vxfs: inode %llu has an unsupported orgtype (%x)\n", ip->i_ino, vip->vii_orgtype); return 0; } diff --git a/fs/fserror.c b/fs/fserror.c index 06ca86adab9b..1e4d11fd9562 100644 --- a/fs/fserror.c +++ b/fs/fserror.c @@ -176,7 +176,7 @@ lost_event: lost: if (inode) pr_err_ratelimited( - "%s: lost file I/O error report for ino %lu type %u pos 0x%llx len 0x%llx error %d", + "%s: lost file I/O error report for ino %llu type %u pos 0x%llx len 0x%llx error %d", sb->s_id, inode->i_ino, type, pos, len, error); else pr_err_ratelimited( diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index b80ba40e3877..7f5339ee57c1 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -417,7 +417,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, int entry_size, type; int err; - hfs_dbg("cnid %u - (ino %lu, name %s) - (ino %lu, name %s)\n", + hfs_dbg("cnid %u - (ino %llu, name %s) - (ino %llu, name %s)\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); sb = src_dir->i_sb; diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index a097908b269d..f066a99a863b 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -411,7 +411,7 @@ int hfs_extend_file(struct inode *inode) goto out; } - hfs_dbg("ino %lu, start %u, len %u\n", inode->i_ino, start, len); + hfs_dbg("ino %llu, start %u, len %u\n", inode->i_ino, start, len); if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks) { if (!HFS_I(inode)->first_blocks) { hfs_dbg("first_extent: start %u, len %u\n", @@ -482,7 +482,7 @@ void hfs_file_truncate(struct inode *inode) u32 size; int res; - hfs_dbg("ino %lu, phys_size %llu -> i_size %llu\n", + hfs_dbg("ino %llu, phys_size %llu -> i_size %llu\n", inode->i_ino, (long long)HFS_I(inode)->phys_size, inode->i_size); if (inode->i_size > HFS_I(inode)->phys_size) { diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 878535db64d6..95f0333a608b 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -270,7 +270,7 @@ void hfs_delete_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; - hfs_dbg("ino %lu\n", inode->i_ino); + hfs_dbg("ino %llu\n", inode->i_ino); if (S_ISDIR(inode->i_mode)) { atomic64_dec(&HFS_SB(sb)->folder_count); if (HFS_I(inode)->cat_key.ParID == cpu_to_be32(HFS_ROOT_CNID)) @@ -455,7 +455,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) hfs_cat_rec rec; int res; - hfs_dbg("ino %lu\n", inode->i_ino); + hfs_dbg("ino %llu\n", inode->i_ino); res = hfs_ext_write_extent(inode); if (res) return res; diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c index 4b79cd606276..174cd13106ad 100644 --- a/fs/hfsplus/attributes.c +++ b/fs/hfsplus/attributes.c @@ -203,7 +203,7 @@ int hfsplus_create_attr_nolock(struct inode *inode, const char *name, int entry_size; int err; - hfs_dbg("name %s, ino %ld\n", + hfs_dbg("name %s, ino %llu\n", name ? name : NULL, inode->i_ino); if (name) { @@ -255,7 +255,7 @@ int hfsplus_create_attr(struct inode *inode, hfsplus_attr_entry *entry_ptr; int err; - hfs_dbg("name %s, ino %ld\n", + hfs_dbg("name %s, ino %llu\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { @@ -337,7 +337,7 @@ int hfsplus_delete_attr_nolock(struct inode *inode, const char *name, struct super_block *sb = inode->i_sb; int err; - hfs_dbg("name %s, ino %ld\n", + hfs_dbg("name %s, ino %llu\n", name ? name : NULL, inode->i_ino); if (name) { @@ -367,7 +367,7 @@ int hfsplus_delete_attr(struct inode *inode, const char *name) struct super_block *sb = inode->i_sb; struct hfs_find_data fd; - hfs_dbg("name %s, ino %ld\n", + hfs_dbg("name %s, ino %llu\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { @@ -436,7 +436,7 @@ int hfsplus_replace_attr(struct inode *inode, hfsplus_attr_entry *entry_ptr; int err = 0; - hfs_dbg("name %s, ino %ld\n", + hfs_dbg("name %s, ino %llu\n", name ? name : NULL, inode->i_ino); if (!HFSPLUS_SB(sb)->attr_tree) { diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 02c1eee4a4b8..0e961e99b985 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -441,7 +441,7 @@ int hfsplus_rename_cat(u32 cnid, int entry_size, type; int err; - hfs_dbg("cnid %u - ino %lu, name %s - ino %lu, name %s\n", + hfs_dbg("cnid %u - ino %llu, name %s - ino %llu, name %s\n", cnid, src_dir->i_ino, src_name->name, dst_dir->i_ino, dst_name->name); err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd); diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index d559bf8625f8..054f6da46033 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -313,7 +313,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, if (!S_ISREG(inode->i_mode)) return -EPERM; - hfs_dbg("src_dir->i_ino %lu, dst_dir->i_ino %lu, inode->i_ino %lu\n", + hfs_dbg("src_dir->i_ino %llu, dst_dir->i_ino %llu, inode->i_ino %llu\n", src_dir->i_ino, dst_dir->i_ino, inode->i_ino); mutex_lock(&sbi->vh_mutex); @@ -385,7 +385,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) if (HFSPLUS_IS_RSRC(inode)) return -EPERM; - hfs_dbg("dir->i_ino %lu, inode->i_ino %lu\n", + hfs_dbg("dir->i_ino %llu, inode->i_ino %llu\n", dir->i_ino, inode->i_ino); mutex_lock(&sbi->vh_mutex); @@ -393,7 +393,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) if (inode->i_ino == cnid && atomic_read(&HFSPLUS_I(inode)->opencnt)) { str.name = name; - str.len = sprintf(name, "temp%lu", inode->i_ino); + str.len = sprintf(name, "temp%llu", inode->i_ino); res = hfsplus_rename_cat(inode->i_ino, dir, &dentry->d_name, sbi->hidden_dir, &str); diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index 8e886514d27f..474fde1a1653 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -275,7 +275,7 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock, mutex_unlock(&hip->extents_lock); done: - hfs_dbg("ino %lu, iblock %llu - dblock %u\n", + hfs_dbg("ino %llu, iblock %llu - dblock %u\n", inode->i_ino, (long long)iblock, dblock); mask = (1 << sbi->fs_shift) - 1; @@ -476,7 +476,7 @@ int hfsplus_file_extend(struct inode *inode, bool zeroout) goto out; } - hfs_dbg("ino %lu, start %u, len %u\n", inode->i_ino, start, len); + hfs_dbg("ino %llu, start %u, len %u\n", inode->i_ino, start, len); if (hip->alloc_blocks <= hip->first_blocks) { if (!hip->first_blocks) { @@ -545,7 +545,7 @@ void hfsplus_file_truncate(struct inode *inode) u32 alloc_cnt, blk_cnt, start; int res; - hfs_dbg("ino %lu, phys_size %llu -> i_size %llu\n", + hfs_dbg("ino %llu, phys_size %llu -> i_size %llu\n", inode->i_ino, (long long)hip->phys_size, inode->i_size); if (inode->i_size > hip->phys_size) { diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 922ff41df042..02be32dc6833 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -230,7 +230,7 @@ static int hfsplus_get_perms(struct inode *inode, inode->i_flags &= ~S_APPEND; return 0; bad_type: - pr_err("invalid file type 0%04o for inode %lu\n", mode, inode->i_ino); + pr_err("invalid file type 0%04o for inode %llu\n", mode, inode->i_ino); return -EIO; } @@ -328,7 +328,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, struct hfsplus_vh *vhdr = sbi->s_vhdr; int error = 0, error2; - hfs_dbg("inode->i_ino %lu, start %llu, end %llu\n", + hfs_dbg("inode->i_ino %llu, start %llu, end %llu\n", inode->i_ino, start, end); error = file_write_and_wait_range(file, start, end); @@ -639,7 +639,7 @@ int hfsplus_cat_write_inode(struct inode *inode) hfsplus_cat_entry entry; int res = 0; - hfs_dbg("inode->i_ino %lu\n", inode->i_ino); + hfs_dbg("inode->i_ino %llu\n", inode->i_ino); if (HFSPLUS_IS_RSRC(inode)) main_inode = HFSPLUS_I(inode)->rsrc_inode; @@ -716,7 +716,7 @@ out: if (!res) { res = hfs_btree_write(tree); if (res) { - pr_err("b-tree write err: %d, ino %lu\n", + pr_err("b-tree write err: %d, ino %llu\n", res, inode->i_ino); } } diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 7229a8ae89f9..b3917249c206 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -156,7 +156,7 @@ static int hfsplus_system_write_inode(struct inode *inode) int err = hfs_btree_write(tree); if (err) { - pr_err("b-tree write err: %d, ino %lu\n", + pr_err("b-tree write err: %d, ino %llu\n", err, inode->i_ino); return err; } @@ -169,7 +169,7 @@ static int hfsplus_write_inode(struct inode *inode, { int err; - hfs_dbg("ino %lu\n", inode->i_ino); + hfs_dbg("ino %llu\n", inode->i_ino); err = hfsplus_ext_write_extent(inode); if (err) @@ -184,7 +184,7 @@ static int hfsplus_write_inode(struct inode *inode, static void hfsplus_evict_inode(struct inode *inode) { - hfs_dbg("ino %lu\n", inode->i_ino); + hfs_dbg("ino %llu\n", inode->i_ino); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (HFSPLUS_IS_RSRC(inode)) { diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 9904944cbd54..c70bb6f494b2 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -277,7 +277,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, u16 folder_finderinfo_len = sizeof(DInfo) + sizeof(DXInfo); u16 file_finderinfo_len = sizeof(FInfo) + sizeof(FXInfo); - hfs_dbg("ino %lu, name %s, value %p, size %zu\n", + hfs_dbg("ino %llu, name %s, value %p, size %zu\n", inode->i_ino, name ? name : NULL, value, size); @@ -447,7 +447,7 @@ int hfsplus_setxattr(struct inode *inode, const char *name, NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1; int res; - hfs_dbg("ino %lu, name %s, prefix %s, prefixlen %zu, " + hfs_dbg("ino %llu, name %s, prefix %s, prefixlen %zu, " "value %p, size %zu\n", inode->i_ino, name ? name : NULL, prefix ? prefix : NULL, prefixlen, @@ -607,7 +607,7 @@ ssize_t hfsplus_getxattr(struct inode *inode, const char *name, int res; char *xattr_name; - hfs_dbg("ino %lu, name %s, prefix %s\n", + hfs_dbg("ino %llu, name %s, prefix %s\n", inode->i_ino, name ? name : NULL, prefix ? prefix : NULL); @@ -717,7 +717,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) size_t strbuf_size; int xattr_name_len; - hfs_dbg("ino %lu\n", inode->i_ino); + hfs_dbg("ino %llu\n", inode->i_ino); if (!is_xattr_operation_supported(inode)) return -EOPNOTSUPP; @@ -819,7 +819,7 @@ static int hfsplus_removexattr(struct inode *inode, const char *name) int is_xattr_acl_deleted; int is_all_xattrs_deleted; - hfs_dbg("ino %lu, name %s\n", + hfs_dbg("ino %llu, name %s\n", inode->i_ino, name ? name : NULL); if (!HFSPLUS_SB(inode->i_sb)->attr_tree) diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index ceb50b2dc91a..3bf11202e2d3 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -96,8 +96,8 @@ static int hpfs_readdir(struct file *file, struct dir_context *ctx) } if (!fnode_is_dir(fno)) { e = 1; - hpfs_error(inode->i_sb, "not a directory, fnode %08lx", - (unsigned long)inode->i_ino); + hpfs_error(inode->i_sb, "not a directory, fnode %08llx", + inode->i_ino); } if (hpfs_inode->i_dno != le32_to_cpu(fno->u.external[0].disk_secno)) { e = 1; diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index dde764ebe246..8c6aa060fd87 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -550,9 +550,9 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) if (hpfs_sb(i->i_sb)->sb_chk) if (up != i->i_ino) { hpfs_error(i->i_sb, - "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx", + "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08llx", dno, up, - (unsigned long)i->i_ino); + i->i_ino); return; } if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) { diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c index 2149d3ca530b..4664f9ab06ee 100644 --- a/fs/hpfs/ea.c +++ b/fs/hpfs/ea.c @@ -245,8 +245,8 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key, fnode->ea_offs = cpu_to_le16(0xc4); } if (le16_to_cpu(fnode->ea_offs) < 0xc4 || le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200) { - hpfs_error(s, "fnode %08lx: ea_offs == %03x, ea_size_s == %03x", - (unsigned long)inode->i_ino, + hpfs_error(s, "fnode %08llx: ea_offs == %03x, ea_size_s == %03x", + inode->i_ino, le16_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s)); return; } diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 93d528f4f4f2..0e932cc8be1b 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -250,8 +250,8 @@ void hpfs_write_inode_nolock(struct inode *i) hpfs_brelse4(&qbh); } else hpfs_error(i->i_sb, - "directory %08lx doesn't have '.' entry", - (unsigned long)i->i_ino); + "directory %08llx doesn't have '.' entry", + i->i_ino); } mark_buffer_dirty(bh); brelse(bh); diff --git a/fs/inode.c b/fs/inode.c index 62df5dda0589..5ad169d51728 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -726,7 +726,7 @@ void dump_mapping(const struct address_space *mapping) struct dentry *dentry_ptr; struct dentry dentry; char fname[64] = {}; - unsigned long ino; + u64 ino; /* * If mapping is an invalid pointer, we don't want to crash @@ -750,14 +750,14 @@ void dump_mapping(const struct address_space *mapping) } if (!dentry_first) { - pr_warn("aops:%ps ino:%lx\n", a_ops, ino); + pr_warn("aops:%ps ino:%llx\n", a_ops, ino); return; } dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); if (get_kernel_nofault(dentry, dentry_ptr) || !dentry.d_parent || !dentry.d_name.name) { - pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", + pr_warn("aops:%ps ino:%llx invalid dentry:%px\n", a_ops, ino, dentry_ptr); return; } @@ -768,7 +768,7 @@ void dump_mapping(const struct address_space *mapping) * Even if strncpy_from_kernel_nofault() succeeded, * the fname could be unreliable */ - pr_warn("aops:%ps ino:%lx dentry name(?):\"%s\"\n", + pr_warn("aops:%ps ino:%llx dentry name(?):\"%s\"\n", a_ops, ino, fname); } @@ -2641,9 +2641,8 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) /* leave it no_open_fops */ break; default: - printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" - " inode %s:%lu\n", mode, inode->i_sb->s_id, - inode->i_ino); + pr_debug("init_special_inode: bogus i_mode (%o) for inode %s:%llu\n", + mode, inode->i_sb->s_id, inode->i_ino); break; } } diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index e4d57cb969f1..e655763a82ce 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -48,7 +48,7 @@ static u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend) mapping_set_error(inode->i_mapping, ioend->io_error); if (!bio_flagged(bio, BIO_QUIET)) { pr_err_ratelimited( -"%s: writeback error on inode %lu, offset %lld, sector %llu", +"%s: writeback error on inode %llu, offset %lld, sector %llu", inode->i_sb->s_id, inode->i_ino, ioend->io_offset, ioend->io_sector); } diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 50b4cb3aea87..397568b9c7e7 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -156,7 +156,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, else { printk(KERN_DEBUG "zisofs: zisofs_inflate returned" - " %d, inode = %lu," + " %d, inode = %llu," " page idx = %d, bh idx = %d," " avail_in = %ld," " avail_out = %ld\n", diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index 2ca16c3fe5ef..2fd9948d606e 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -152,7 +152,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *file, de_len < de->name_len[0] + sizeof(struct iso_directory_record)) { printk(KERN_NOTICE "iso9660: Corrupted directory entry" - " in block %lu of inode %lu\n", block, + " in block %lu of inode %llu\n", block, inode->i_ino); brelse(bh); return -EIO; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 5c01536c5e8f..3593e02e75fe 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1261,7 +1261,7 @@ out_noread: out_toomany: printk(KERN_INFO "%s: More than 100 file sections ?!?, aborting...\n" - "isofs_read_level3_size: inode=%lu\n", + "isofs_read_level3_size: inode=%llu\n", __func__, inode->i_ino); goto out; } @@ -1380,7 +1380,7 @@ static int isofs_read_inode(struct inode *inode, int relocated) /* I have no idea what file_unit_size is used for, so we will flag it for now */ if (de->file_unit_size[0] != 0) { - printk(KERN_DEBUG "ISOFS: File unit size != 0 for ISO file (%ld).\n", + printk(KERN_DEBUG "ISOFS: File unit size != 0 for ISO file (%llu).\n", inode->i_ino); } @@ -1450,7 +1450,7 @@ static int isofs_read_inode(struct inode *inode, int relocated) /* XXX - parse_rock_ridge_inode() had already set i_rdev. */ init_special_inode(inode, inode->i_mode, inode->i_rdev); } else { - printk(KERN_DEBUG "ISOFS: Invalid file type 0%04o for inode %lu.\n", + printk(KERN_DEBUG "ISOFS: Invalid file type 0%04o for inode %llu.\n", inode->i_mode, inode->i_ino); ret = -EIO; goto fail; diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index 58f80e1b3ac0..8dd3911717e0 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -100,7 +100,7 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry, /* Basic sanity check, whether name doesn't exceed dir entry */ if (de_len < dlen + sizeof(struct iso_directory_record)) { printk(KERN_NOTICE "iso9660: Corrupted directory entry" - " in block %lu of inode %lu\n", block, + " in block %lu of inode %llu\n", block, dir->i_ino); brelse(bh); return 0; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index cb2c529a8f1b..b60918ed8a99 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1677,7 +1677,7 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) return err ? ERR_PTR(err) : ERR_PTR(-EINVAL); } - jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n", + jbd2_debug(1, "JBD2: inode %s/%llu, size %lld, bits %d, blksize %ld\n", inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size, inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); @@ -1689,7 +1689,7 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) journal->j_inode = inode; snprintf(journal->j_devname, sizeof(journal->j_devname), - "%pg-%lu", journal->j_dev, journal->j_inode->i_ino); + "%pg-%llu", journal->j_dev, journal->j_inode->i_ino); strreplace(journal->j_devname, '/', '!'); jbd2_stats_proc_init(journal); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index dca4b5d8aaaa..a90f9092706c 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2651,7 +2651,7 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, return -EROFS; journal = transaction->t_journal; - jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, + jbd2_debug(4, "Adding inode %llu, tid:%d\n", jinode->i_vfs_inode->i_ino, transaction->t_tid); spin_lock(&journal->j_list_lock); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 2b38ce1fd8e8..c4088c3b4ac0 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -129,7 +129,7 @@ static int jffs2_readdir(struct file *file, struct dir_context *ctx) struct jffs2_full_dirent *fd; unsigned long curofs = 1; - jffs2_dbg(1, "jffs2_readdir() for dir_i #%lu\n", inode->i_ino); + jffs2_dbg(1, "jffs2_readdir() for dir_i #%llu\n", inode->i_ino); if (!dir_emit_dots(file, ctx)) return 0; @@ -211,7 +211,7 @@ static int jffs2_create(struct mnt_idmap *idmap, struct inode *dir_i, jffs2_free_raw_inode(ri); - jffs2_dbg(1, "%s(): Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n", + jffs2_dbg(1, "%s(): Created ino #%llu with mode %o, nlink %d(%d). nrpages %ld\n", __func__, inode->i_ino, inode->i_mode, inode->i_nlink, f->inocache->pino_nlink, inode->i_mapping->nrpages); diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 5e1ef4bc009b..1e18d3a79840 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -88,7 +88,7 @@ static int jffs2_do_readpage_nolock(struct inode *inode, struct folio *folio) unsigned char *kaddr; int ret; - jffs2_dbg(2, "%s(): ino #%lu, page at offset 0x%lx\n", + jffs2_dbg(2, "%s(): ino #%llu, page at offset 0x%lx\n", __func__, inode->i_ino, folio->index << PAGE_SHIFT); BUG_ON(!folio_test_locked(folio)); @@ -259,7 +259,7 @@ static int jffs2_write_end(const struct kiocb *iocb, uint32_t writtenlen = 0; void *buf; - jffs2_dbg(1, "%s(): ino #%lu, page at 0x%llx, range %d-%d, flags %lx\n", + jffs2_dbg(1, "%s(): ino #%llu, page at 0x%llx, range %d-%d, flags %lx\n", __func__, inode->i_ino, folio_pos(folio), start, end, folio->flags.f); diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index c3ce2c868f7a..6ada8369a762 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -43,7 +43,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) int ret; int alloc_type = ALLOC_NORMAL; - jffs2_dbg(1, "%s(): ino #%lu\n", __func__, inode->i_ino); + jffs2_dbg(1, "%s(): ino #%llu\n", __func__, inode->i_ino); /* Special cases - we don't want more than one data node for these types on the medium at any time. So setattr @@ -243,7 +243,7 @@ void jffs2_evict_inode (struct inode *inode) struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); - jffs2_dbg(1, "%s(): ino #%lu mode %o\n", + jffs2_dbg(1, "%s(): ino #%llu mode %o\n", __func__, inode->i_ino, inode->i_mode); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); @@ -334,8 +334,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) ret = jffs2_read_dnode(c, f, f->metadata, (char *)&jdev, 0, f->metadata->size); if (ret < 0) { /* Eep */ - pr_notice("Read device numbers for inode %lu failed\n", - (unsigned long)inode->i_ino); + pr_notice("Read device numbers for inode %llu failed\n", + inode->i_ino); goto error; } if (f->metadata->size == sizeof(jdev.old_id)) @@ -351,8 +351,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) break; default: - pr_warn("%s(): Bogus i_mode %o for ino %lu\n", - __func__, inode->i_mode, (unsigned long)inode->i_ino); + pr_warn("%s(): Bogus i_mode %o for ino %llu\n", + __func__, inode->i_mode, inode->i_ino); } mutex_unlock(&f->sem); @@ -374,12 +374,12 @@ void jffs2_dirty_inode(struct inode *inode, int flags) struct iattr iattr; if (!(inode_state_read_once(inode) & I_DIRTY_DATASYNC)) { - jffs2_dbg(2, "%s(): not calling setattr() for ino #%lu\n", + jffs2_dbg(2, "%s(): not calling setattr() for ino #%llu\n", __func__, inode->i_ino); return; } - jffs2_dbg(1, "%s(): calling setattr() for ino #%lu\n", + jffs2_dbg(1, "%s(): calling setattr() for ino #%llu\n", __func__, inode->i_ino); iattr.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_MTIME|ATTR_CTIME; @@ -428,7 +428,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r struct jffs2_inode_info *f; int ret; - jffs2_dbg(1, "%s(): dir_i %ld, mode 0x%x\n", + jffs2_dbg(1, "%s(): dir_i %llu, mode 0x%x\n", __func__, dir_i->i_ino, mode); c = JFFS2_SB_INFO(sb); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 4709762713ef..c7914dbc91ed 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -64,7 +64,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino) inode->i_op = &jfs_file_inode_operations; init_special_inode(inode, inode->i_mode, inode->i_rdev); } else { - printk(KERN_DEBUG "JFS: Invalid file type 0%04o for inode %lu.\n", + printk(KERN_DEBUG "JFS: Invalid file type 0%04o for inode %llu.\n", inode->i_mode, inode->i_ino); iget_failed(inode); return ERR_PTR(-EIO); diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 294a67327c73..13ab21e66f51 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -302,7 +302,7 @@ int diRead(struct inode *ip) unsigned long pageno; int rel_inode; - jfs_info("diRead: ino = %ld", ip->i_ino); + jfs_info("diRead: ino = %llu", ip->i_ino); ipimap = sbi->ipimap; JFS_IP(ip)->ipimap = ipimap; diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 64c6eaa7f3f2..c95804f6dc19 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -692,7 +692,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, unsigned long page_index; unsigned long page_offset; - jfs_info("__get_metapage: ino = %ld, lblock = 0x%lx, abs=%d", + jfs_info("__get_metapage: ino = %llu, lblock = 0x%lx, abs=%d", inode->i_ino, lblock, absolute); l2bsize = inode->i_blkbits; diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 255a847ca0b6..0b6be8b8aeb1 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -487,7 +487,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, int async_block = 0; __be32 ret; - dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", + dprintk("lockd: nlmsvc_lock(%s/%llu, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", inode->i_sb->s_id, inode->i_ino, lock->fl.c.flc_type, lock->fl.c.flc_pid, @@ -617,7 +617,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, int mode; __be32 ret; - dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n", + dprintk("lockd: nlmsvc_testlock(%s/%llu, ty=%d, %Ld-%Ld)\n", nlmsvc_file_inode(file)->i_sb->s_id, nlmsvc_file_inode(file)->i_ino, lock->fl.c.flc_type, @@ -676,7 +676,7 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) { int error = 0; - dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n", + dprintk("lockd: nlmsvc_unlock(%s/%llu, pi=%d, %Ld-%Ld)\n", nlmsvc_file_inode(file)->i_sb->s_id, nlmsvc_file_inode(file)->i_ino, lock->fl.c.flc_pid, @@ -716,7 +716,7 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l int status = 0; int mode; - dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", + dprintk("lockd: nlmsvc_cancel(%s/%llu, pi=%d, %Ld-%Ld)\n", nlmsvc_file_inode(file)->i_sb->s_id, nlmsvc_file_inode(file)->i_ino, lock->fl.c.flc_pid, diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index dd0214dcb695..79f3dd2fd366 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -47,7 +47,7 @@ static inline void nlm_debug_print_file(char *msg, struct nlm_file *file) { struct inode *inode = nlmsvc_file_inode(file); - dprintk("lockd: %s %s/%ld\n", + dprintk("lockd: %s %s/%llu\n", msg, inode->i_sb->s_id, inode->i_ino); } #else diff --git a/fs/locks.c b/fs/locks.c index d13ec930b7bb..d8b066fb4210 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -234,7 +234,7 @@ locks_check_ctx_lists(struct inode *inode) if (unlikely(!list_empty(&ctx->flc_flock) || !list_empty(&ctx->flc_posix) || !list_empty(&ctx->flc_lease))) { - pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%lx:\n", + pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%llx:\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); locks_dump_ctx_list(&ctx->flc_flock, "FLOCK"); @@ -251,7 +251,7 @@ locks_check_ctx_file_list(struct file *filp, struct list_head *list, char *list_ list_for_each_entry(flc, list, flc_list) if (flc->flc_file == filp) - pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx " + pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%llx " " fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino, @@ -2896,7 +2896,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock_core *flc, (type == F_RDLCK) ? "READ" : "UNLCK"); if (inode) { /* userspace relies on this representation of dev_t */ - seq_printf(f, "%d %02x:%02x:%lu ", pid, + seq_printf(f, "%d %02x:%02x:%llu ", pid, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); } else { diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 99541c6a5bbf..838b072b6cf0 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -36,7 +36,7 @@ void __minix_error_inode(struct inode *inode, const char *function, vaf.fmt = fmt; vaf.va = &args; printk(KERN_CRIT "minix-fs error (device %s): %s:%d: " - "inode #%lu: comm %s: %pV\n", + "inode #%llu: comm %s: %pV\n", inode->i_sb->s_id, function, line, inode->i_ino, current->comm, &vaf); va_end(args); @@ -520,7 +520,7 @@ void minix_set_inode(struct inode *inode, dev_t rdev) S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { init_special_inode(inode, inode->i_mode, rdev); } else { - printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %lu.\n", + printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %llu.\n", inode->i_mode, inode->i_ino); make_bad_inode(inode); } @@ -542,7 +542,7 @@ static struct inode *V1_minix_iget(struct inode *inode) return ERR_PTR(-EIO); } if (raw_inode->i_nlinks == 0) { - printk("MINIX-fs: deleted inode referenced: %lu\n", + printk("MINIX-fs: deleted inode referenced: %llu\n", inode->i_ino); brelse(bh); iget_failed(inode); @@ -580,7 +580,7 @@ static struct inode *V2_minix_iget(struct inode *inode) return ERR_PTR(-EIO); } if (raw_inode->i_nlinks == 0) { - printk("MINIX-fs: deleted inode referenced: %lu\n", + printk("MINIX-fs: deleted inode referenced: %llu\n", inode->i_ino); brelse(bh); iget_failed(inode); @@ -692,7 +692,7 @@ static int minix_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { - printk("IO error syncing minix inode [%s:%08lx]\n", + printk("IO error syncing minix inode [%s:%08llx]\n", inode->i_sb->s_id, inode->i_ino); err = -EIO; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2402f57c8e7d..ddc3789363a5 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1906,7 +1906,7 @@ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) } error = nfs_lookup_verify_inode(inode, flags); - dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n", + dfprintk(LOOKUPCACHE, "NFS: %s: inode %llu is %s\n", __func__, inode->i_ino, error ? "invalid" : "valid"); return !error; } @@ -2121,7 +2121,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, /* Expect a negative dentry */ BUG_ON(d_inode(dentry)); - dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n", + dfprintk(VFS, "NFS: atomic_open(%s/%llu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); err = nfs_check_flags(open_flags); @@ -2404,7 +2404,7 @@ static int nfs_do_create(struct inode *dir, struct dentry *dentry, open_flags |= O_CREAT; - dfprintk(VFS, "NFS: create(%s/%lu), %pd\n", + dfprintk(VFS, "NFS: create(%s/%llu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_mode = mode; @@ -2442,7 +2442,7 @@ nfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct iattr attr; int status; - dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n", + dfprintk(VFS, "NFS: mknod(%s/%llu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_mode = mode; @@ -2469,7 +2469,7 @@ struct dentry *nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct iattr attr; struct dentry *ret; - dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n", + dfprintk(VFS, "NFS: mkdir(%s/%llu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_valid = ATTR_MODE; @@ -2507,7 +2507,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry) { int error; - dfprintk(VFS, "NFS: rmdir(%s/%lu), %pd\n", + dfprintk(VFS, "NFS: rmdir(%s/%llu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_rmdir_enter(dir, dentry); @@ -2578,7 +2578,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) { int error; - dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id, + dfprintk(VFS, "NFS: unlink(%s/%llu, %pd)\n", dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_unlink_enter(dir, dentry); @@ -2638,7 +2638,7 @@ int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir, unsigned int pathlen = strlen(symname); int error; - dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s)\n", dir->i_sb->s_id, + dfprintk(VFS, "NFS: symlink(%s/%llu, %pd, %s)\n", dir->i_sb->s_id, dir->i_ino, dentry, symname); if (pathlen > PAGE_SIZE) @@ -2660,7 +2660,7 @@ int nfs_symlink(struct mnt_idmap *idmap, struct inode *dir, error = NFS_PROTO(dir)->symlink(dir, dentry, folio, pathlen, &attr); trace_nfs_symlink_exit(dir, dentry, error); if (error != 0) { - dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n", + dfprintk(VFS, "NFS: symlink(%s/%llu, %pd, %s) error %d\n", dir->i_sb->s_id, dir->i_ino, dentry, symname, error); d_drop(dentry); @@ -3414,7 +3414,7 @@ out: if (!res && (mask & MAY_EXEC)) res = nfs_execute_ok(inode, mask); - dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n", + dfprintk(VFS, "NFS: permission(%s/%llu), mask=0x%x, res=%d\n", inode->i_sb->s_id, inode->i_ino, mask, res); return res; out_notsup: diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 5d08b6409c28..25048a3c2364 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -391,7 +391,7 @@ static int nfs_write_begin(const struct kiocb *iocb, trace_nfs_write_begin(file_inode(file), pos, len); - dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", + dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%llu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos); @@ -432,7 +432,7 @@ static int nfs_write_end(const struct kiocb *iocb, int status; trace_nfs_write_end(file_inode(file), pos, len); - dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n", + dfprintk(PAGECACHE, "NFS: write_end(%pD2(%llu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); /* @@ -557,7 +557,7 @@ static int nfs_launder_folio(struct folio *folio) struct inode *inode = folio->mapping->host; int ret; - dfprintk(PAGECACHE, "NFS: launder_folio(%ld, %llu)\n", + dfprintk(PAGECACHE, "NFS: launder_folio(%llu, %llu)\n", inode->i_ino, folio_pos(folio)); folio_wait_private_2(folio); /* [DEPRECATED] */ @@ -647,7 +647,7 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf) struct address_space *mapping; struct folio *folio = page_folio(vmf->page); - dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n", + dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%llu), offset %lld)\n", filp, filp->f_mapping->host->i_ino, (long long)folio_pos(folio)); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 90a11afa5d05..e85380e3b11d 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -241,7 +241,7 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) /* Note: if the write is unstable, don't set end_offs until commit */ pnfs_set_layoutcommit(hdr->inode, hdr->lseg, end_offs); - dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, + dprintk("%s inode %llu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -456,7 +456,7 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr) u32 j, idx; struct nfs_fh *fh; - dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n", + dprintk("--> %s ino %llu pgbase %u req %zu@%llu\n", __func__, hdr->inode->i_ino, hdr->args.pgbase, (size_t)hdr->args.count, offset); @@ -514,7 +514,7 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync) if (IS_ERR(ds_clnt)) return PNFS_NOT_ATTEMPTED; - dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d\n", + dprintk("%s ino %llu sync %d req %zu@%llu DS: %s cl_count %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count)); @@ -1001,7 +1001,7 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how) if (IS_ERR(ds_clnt)) goto out_err; - dprintk("%s ino %lu, how %d cl_count %d\n", __func__, + dprintk("%s ino %llu, how %d cl_count %d\n", __func__, data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count)); data->commit_done_cb = filelayout_commit_done_cb; refcount_inc(&ds->ds_clp->cl_count); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index f67773d52830..8b1559171fe3 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1631,7 +1631,7 @@ ff_layout_set_layoutcommit(struct inode *inode, return; pnfs_set_layoutcommit(inode, lseg, end_offset); - dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino, + dprintk("%s inode %llu pls_end_pos %llu\n", __func__, inode->i_ino, (unsigned long long) NFS_I(inode)->layout->plh_lwb); } @@ -2136,7 +2136,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) u32 dss_id; bool ds_fatal_error = false; - dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n", + dprintk("--> %s ino %llu pgbase %u req %zu@%llu\n", __func__, hdr->inode->i_ino, hdr->args.pgbase, (size_t)hdr->args.count, offset); @@ -2245,7 +2245,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) vers = nfs4_ff_layout_ds_version(mirror, dss_id); - dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n", + dprintk("%s ino %llu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); @@ -2336,7 +2336,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) vers = nfs4_ff_layout_ds_version(mirror, dss_id); - dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, + dprintk("%s ino %llu, how %d cl_count %d vers %d\n", __func__, data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count), vers); data->commit_done_cb = ff_layout_commit_done_cb; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 4786343eeee0..98a8f0de1199 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2258,7 +2258,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) bool attr_changed = false; bool have_delegation; - dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%llx)\n", + dfprintk(VFS, "NFS: %s(%s/%llu fh_crc=0x%08x ct=%d info=0x%llx)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), icount_read(inode), fattr->valid); @@ -2288,7 +2288,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* * Big trouble! The inode has become a different object. */ - printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n", + printk(KERN_DEBUG "NFS: %s: inode %llu mode changed, %07o to %07o\n", __func__, inode->i_ino, inode->i_mode, fattr->mode); goto out_err; } @@ -2358,7 +2358,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); attr_changed = true; - dprintk("NFS: change_attr change on server for file %s/%ld\n", + dprintk("NFS: change_attr change on server for file %s/%llu\n", inode->i_sb->s_id, inode->i_ino); } else if (!have_delegation) { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 91bcf67bd743..d839a97df822 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4714,7 +4714,7 @@ static int _nfs4_proc_lookupp(struct inode *inode, nfs_fattr_init(fattr); nfs4_init_sequence(server->nfs_client, &args.seq_args, &res.seq_res, 0, 0); - dprintk("NFS call lookupp ino=0x%lx\n", inode->i_ino); + dprintk("NFS call lookupp ino=0x%llx\n", inode->i_ino); status = nfs4_do_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, task_flags); dprintk("NFS reply lookupp: %d\n", status); @@ -10019,7 +10019,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) int status = 0; dprintk("NFS: initiating layoutcommit call. sync %d " - "lbw: %llu inode %lu\n", sync, + "lbw: %llu inode %llu\n", sync, data->args.lastbytewritten, data->args.inode->i_ino); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index bc13d1e69449..e79deb9bf664 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -891,7 +891,7 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, while (!list_empty(layout_list)) { lo = list_entry(layout_list->next, struct pnfs_layout_hdr, plh_bulk_destroy); - dprintk("%s freeing layout for inode %lu\n", __func__, + dprintk("%s freeing layout for inode %llu\n", __func__, lo->plh_inode->i_ino); inode = lo->plh_inode; @@ -1440,7 +1440,7 @@ _pnfs_return_layout(struct inode *ino) int status = 0; bool send, valid_layout; - dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); + dprintk("NFS: %s for inode %llu\n", __func__, ino->i_ino); spin_lock(&ino->i_lock); lo = nfsi->layout; @@ -3055,7 +3055,7 @@ pnfs_try_to_write_data(struct nfs_pgio_header *hdr, hdr->mds_ops = call_ops; - dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, + dprintk("%s: Writing ino:%llu %u@%llu (how %d)\n", __func__, inode->i_ino, hdr->args.count, hdr->args.offset, how); trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how); if (trypnfs != PNFS_NOT_ATTEMPTED) @@ -3181,7 +3181,7 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr, hdr->mds_ops = call_ops; - dprintk("%s: Reading ino:%lu %u@%llu\n", + dprintk("%s: Reading ino:%llu %u@%llu\n", __func__, inode->i_ino, hdr->args.count, hdr->args.offset); trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr); @@ -3314,7 +3314,7 @@ pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg, if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { nfsi->layout->plh_lwb = end_pos; mark_as_dirty = true; - dprintk("%s: Set layoutcommit for inode %lu ", + dprintk("%s: Set layoutcommit for inode %llu ", __func__, inode->i_ino); } else if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; @@ -3363,7 +3363,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) if (!pnfs_layoutcommit_outstanding(inode)) return 0; - dprintk("--> %s inode %lu\n", __func__, inode->i_ino); + dprintk("--> %s inode %llu\n", __func__, inode->i_ino); status = -EAGAIN; if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 8fdbba7cad96..d2259d948cc3 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1039,7 +1039,7 @@ exp_rootfh(struct net *net, struct auth_domain *clp, char *name, } inode = d_inode(path.dentry); - dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", + dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%llu)\n", name, path.dentry, clp->name, inode->i_sb->s_id, inode->i_ino); exp = exp_parent(cd, clp, &path); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6b9c399b89df..a569d89ac912 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1253,7 +1253,7 @@ static void nfsd4_finalize_deleg_timestamps(struct nfs4_delegation *dp, struct f if (ret) { struct inode *inode = file_inode(f); - pr_notice_ratelimited("nfsd: Unable to update timestamps on inode %02x:%02x:%lu: %d\n", + pr_notice_ratelimited("nfsd: Unable to update timestamps on inode %02x:%02x:%llu: %d\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino, ret); @@ -2888,7 +2888,7 @@ static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f) { struct inode *inode = file_inode(f->nf_file); - seq_printf(s, "superblock: \"%02x:%02x:%ld\"", + seq_printf(s, "superblock: \"%02x:%02x:%llu\"", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index ed85dd43da18..ee72c9565e4f 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -601,9 +601,9 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, struct inode * inode = d_inode(dentry); dev_t ex_dev = exp_sb(exp)->s_dev; - dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n", + dprintk("nfsd: fh_compose(exp %02x:%02x/%llu %pd2, ino=%llu)\n", MAJOR(ex_dev), MINOR(ex_dev), - (long) d_inode(exp->ex_path.dentry)->i_ino, + d_inode(exp->ex_path.dentry)->i_ino, dentry, (inode ? inode->i_ino : 0)); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c884c3f34afb..eafdf7b7890f 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1163,7 +1163,7 @@ nfsd_direct_read(struct svc_rqst *rqstp, struct svc_fh *fhp, } else if (unlikely(host_err == -EINVAL)) { struct inode *inode = d_inode(fhp->fh_dentry); - pr_info_ratelimited("nfsd: Direct I/O alignment failure on %s/%ld\n", + pr_info_ratelimited("nfsd: Direct I/O alignment failure on %s/%llu\n", inode->i_sb->s_id, inode->i_ino); host_err = -ESERVERFAULT; } diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index e7eebb04f9a4..7b1cd2baefcf 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -707,7 +707,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) nilfs_warn(inode->i_sb, - "%s (ino=%lu): entry number %llu already freed", + "%s (ino=%llu): entry number %llu already freed", __func__, inode->i_ino, (unsigned long long)req->pr_entry_nr); else @@ -748,7 +748,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) nilfs_warn(inode->i_sb, - "%s (ino=%lu): entry number %llu already freed", + "%s (ino=%llu): entry number %llu already freed", __func__, inode->i_ino, (unsigned long long)req->pr_entry_nr); else @@ -861,7 +861,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) { nilfs_warn(inode->i_sb, - "%s (ino=%lu): entry number %llu already freed", + "%s (ino=%llu): entry number %llu already freed", __func__, inode->i_ino, (unsigned long long)entry_nrs[j]); } else { @@ -906,7 +906,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) last_nrs[k]); if (ret && ret != -ENOENT) nilfs_warn(inode->i_sb, - "error %d deleting block that object (entry=%llu, ino=%lu) belongs to", + "error %d deleting block that object (entry=%llu, ino=%llu) belongs to", ret, (unsigned long long)last_nrs[k], inode->i_ino); } @@ -923,7 +923,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) ret = nilfs_palloc_delete_bitmap_block(inode, group); if (ret && ret != -ENOENT) nilfs_warn(inode->i_sb, - "error %d deleting bitmap block of group=%lu, ino=%lu", + "error %d deleting bitmap block of group=%lu, ino=%llu", ret, group, inode->i_ino); } } diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index ccc1a7aa52d2..824f2bd91c16 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -33,7 +33,7 @@ static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap, if (err == -EINVAL) { __nilfs_error(inode->i_sb, fname, - "broken bmap (inode number=%lu)", inode->i_ino); + "broken bmap (inode number=%llu)", inode->i_ino); err = -EIO; } return err; diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 568367129092..2e553d698d0f 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -64,7 +64,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) * clearing of an abandoned b-tree node is missing somewhere). */ nilfs_error(inode->i_sb, - "state inconsistency probably due to duplicate use of b-tree node block address %llu (ino=%lu)", + "state inconsistency probably due to duplicate use of b-tree node block address %llu (ino=%llu)", (unsigned long long)blocknr, inode->i_ino); goto failed; } diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index dd0c8e560ef6..3c03f5a741d1 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -353,7 +353,7 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, nchildren <= 0 || nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) { nilfs_crit(inode->i_sb, - "bad btree node (ino=%lu, blocknr=%llu): level = %d, flags = 0x%x, nchildren = %d", + "bad btree node (ino=%llu, blocknr=%llu): level = %d, flags = 0x%x, nchildren = %d", inode->i_ino, (unsigned long long)blocknr, level, flags, nchildren); ret = 1; @@ -384,7 +384,7 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX || (nchildren == 0 && level > NILFS_BTREE_LEVEL_NODE_MIN))) { nilfs_crit(inode->i_sb, - "bad btree root (ino=%lu): level = %d, flags = 0x%x, nchildren = %d", + "bad btree root (ino=%llu): level = %d, flags = 0x%x, nchildren = %d", inode->i_ino, level, flags, nchildren); ret = 1; } @@ -453,7 +453,7 @@ static int nilfs_btree_bad_node(const struct nilfs_bmap *btree, if (unlikely(nilfs_btree_node_get_level(node) != level)) { dump_stack(); nilfs_crit(btree->b_inode->i_sb, - "btree level mismatch (ino=%lu): %d != %d", + "btree level mismatch (ino=%llu): %d != %d", btree->b_inode->i_ino, nilfs_btree_node_get_level(node), level); return 1; @@ -521,7 +521,7 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, out_no_wait: if (!buffer_uptodate(bh)) { nilfs_err(btree->b_inode->i_sb, - "I/O error reading b-tree node block (ino=%lu, blocknr=%llu)", + "I/O error reading b-tree node block (ino=%llu, blocknr=%llu)", btree->b_inode->i_ino, (unsigned long long)ptr); brelse(bh); return -EIO; @@ -2104,7 +2104,7 @@ static int nilfs_btree_propagate(struct nilfs_bmap *btree, if (ret < 0) { if (unlikely(ret == -ENOENT)) { nilfs_crit(btree->b_inode->i_sb, - "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d", + "writing node/leaf block does not appear in b-tree (ino=%llu) at key=%llu, level=%d", btree->b_inode->i_ino, (unsigned long long)key, level); ret = -EINVAL; @@ -2146,7 +2146,7 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree, level >= NILFS_BTREE_LEVEL_MAX) { dump_stack(); nilfs_warn(btree->b_inode->i_sb, - "invalid btree level: %d (key=%llu, ino=%lu, blocknr=%llu)", + "invalid btree level: %d (key=%llu, ino=%llu, blocknr=%llu)", level, (unsigned long long)key, btree->b_inode->i_ino, (unsigned long long)bh->b_blocknr); diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index b243199036df..3653db5cdb65 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -150,7 +150,7 @@ out: Ebadsize: nilfs_error(sb, - "size of directory #%lu is not a multiple of chunk size", + "size of directory #%llu is not a multiple of chunk size", dir->i_ino); goto fail; Eshort: @@ -169,7 +169,7 @@ Einumber: error = "disallowed inode number"; bad_entry: nilfs_error(sb, - "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%zd, name_len=%d", + "bad entry in directory #%llu: %s - offset=%lu, inode=%lu, rec_len=%zd, name_len=%d", dir->i_ino, error, (folio->index << PAGE_SHIFT) + offs, (unsigned long)le64_to_cpu(p->inode), rec_len, p->name_len); @@ -177,7 +177,7 @@ bad_entry: Eend: p = (struct nilfs_dir_entry *)(kaddr + offs); nilfs_error(sb, - "entry in directory #%lu spans the page boundary offset=%lu, inode=%lu", + "entry in directory #%llu spans the page boundary offset=%lu, inode=%lu", dir->i_ino, (folio->index << PAGE_SHIFT) + offs, (unsigned long)le64_to_cpu(p->inode)); fail: @@ -251,7 +251,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx) kaddr = nilfs_get_folio(inode, n, &folio); if (IS_ERR(kaddr)) { - nilfs_error(sb, "bad page in #%lu", inode->i_ino); + nilfs_error(sb, "bad page in #%llu", inode->i_ino); ctx->pos += PAGE_SIZE - offset; return -EIO; } @@ -336,7 +336,7 @@ struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir, /* next folio is past the blocks we've got */ if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) { nilfs_error(dir->i_sb, - "dir %lu size %lld exceeds block count %llu", + "dir %llu size %lld exceeds block count %llu", dir->i_ino, dir->i_size, (unsigned long long)dir->i_blocks); goto out; @@ -382,7 +382,7 @@ struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct folio **foliop) return next_de; fail: - nilfs_error(dir->i_sb, "directory #%lu %s", dir->i_ino, msg); + nilfs_error(dir->i_sb, "directory #%llu %s", dir->i_ino, msg); folio_release_kmap(folio, de); return NULL; } diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 2d8dc6b35b54..8bd0b1374e25 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -338,7 +338,7 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap, key = nilfs_bmap_data_get_key(bmap, *bh); if (unlikely(key > NILFS_DIRECT_KEY_MAX)) { nilfs_crit(bmap->b_inode->i_sb, - "%s (ino=%lu): invalid key: %llu", + "%s (ino=%llu): invalid key: %llu", __func__, bmap->b_inode->i_ino, (unsigned long long)key); return -EINVAL; @@ -346,7 +346,7 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap, ptr = nilfs_direct_get_ptr(bmap, key); if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) { nilfs_crit(bmap->b_inode->i_sb, - "%s (ino=%lu): invalid pointer: %llu", + "%s (ino=%llu): invalid pointer: %llu", __func__, bmap->b_inode->i_ino, (unsigned long long)ptr); return -EINVAL; diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 561c220799c7..62d4c1b787e9 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -137,7 +137,7 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) struct inode *inode = bh->b_folio->mapping->host; nilfs_err(inode->i_sb, - "I/O error reading %s block for GC (ino=%lu, vblocknr=%llu)", + "I/O error reading %s block for GC (ino=%llu, vblocknr=%llu)", buffer_nilfs_node(bh) ? "node" : "data", inode->i_ino, (unsigned long long)bh->b_blocknr); return -EIO; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 51bde45d5865..51f7e125a311 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -108,7 +108,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, * be locked in this case. */ nilfs_warn(inode->i_sb, - "%s (ino=%lu): a race condition while inserting a data block at offset=%llu", + "%s (ino=%llu): a race condition while inserting a data block at offset=%llu", __func__, inode->i_ino, (unsigned long long)blkoff); err = -EAGAIN; @@ -789,7 +789,7 @@ repeat: goto repeat; failed: - nilfs_warn(ii->vfs_inode.i_sb, "error %d truncating bmap (ino=%lu)", + nilfs_warn(ii->vfs_inode.i_sb, "error %d truncating bmap (ino=%llu)", ret, ii->vfs_inode.i_ino); } @@ -1026,7 +1026,7 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty) * this inode. */ nilfs_warn(inode->i_sb, - "cannot set file dirty (ino=%lu): the file is being freed", + "cannot set file dirty (ino=%llu): the file is being freed", inode->i_ino); spin_unlock(&nilfs->ns_inode_lock); return -EINVAL; /* @@ -1057,7 +1057,7 @@ int __nilfs_mark_inode_dirty(struct inode *inode, int flags) err = nilfs_load_inode_block(inode, &ibh); if (unlikely(err)) { nilfs_warn(inode->i_sb, - "cannot mark inode dirty (ino=%lu): error %d loading inode block", + "cannot mark inode dirty (ino=%llu): error %d loading inode block", inode->i_ino, err); return err; } diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 946b0d3534a5..09adb40c65e5 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -203,7 +203,7 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block, err = -EIO; if (!buffer_uptodate(first_bh)) { nilfs_err(inode->i_sb, - "I/O error reading meta-data file (ino=%lu, block-offset=%lu)", + "I/O error reading meta-data file (ino=%llu, block-offset=%lu)", inode->i_ino, block); goto failed_bh; } diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 40f4b1a28705..40ac679ec56e 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -292,7 +292,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) if (!inode->i_nlink) { nilfs_warn(inode->i_sb, - "deleting nonexistent file (ino=%lu), %d", + "deleting nonexistent file (ino=%llu), %d", inode->i_ino, inode->i_nlink); set_nlink(inode, 1); } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 098a3bd103e0..4b1bf559f352 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2024,7 +2024,7 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, ifile, ii->vfs_inode.i_ino, &ibh); if (unlikely(err)) { nilfs_warn(sci->sc_super, - "log writer: error %d getting inode block (ino=%lu)", + "log writer: error %d getting inode block (ino=%llu)", err, ii->vfs_inode.i_ino); return err; } diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 9cc7eb863643..0f731eddeb8b 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -84,7 +84,7 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); inode = igrab(fsnotify_conn_inode(mark->connector)); if (inode) { - seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:0 ", + seq_printf(m, "inotify wd:%x ino:%llx sdev:%x mask:%x ignored_mask:0 ", inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, inotify_mark_user_mask(mark)); show_mark_fhandle(m, inode); @@ -111,7 +111,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) inode = igrab(fsnotify_conn_inode(mark->connector)); if (!inode) return; - seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", + seq_printf(m, "fanotify ino:%llx sdev:%x mflags:%x mask:%x ignored_mask:%x ", inode->i_ino, inode->i_sb->s_dev, mflags, mark->mask, mark->ignore_mask); show_mark_fhandle(m, inode); diff --git a/fs/nsfs.c b/fs/nsfs.c index db91de208645..eac326b85314 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -46,7 +46,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) struct ns_common *ns = inode->i_private; const struct proc_ns_operations *ns_ops = ns->ops; - return dynamic_dname(buffer, buflen, "%s:[%lu]", + return dynamic_dname(buffer, buflen, "%s:[%llu]", ns_ops->name, inode->i_ino); } @@ -394,7 +394,7 @@ static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry) const struct ns_common *ns = inode->i_private; const struct proc_ns_operations *ns_ops = ns->ops; - seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino); + seq_printf(seq, "%s:[%llu]", ns_ops->name, inode->i_ino); return 0; } diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 174a7cb202a0..51aa008e126a 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -153,7 +153,7 @@ void ntfs_inode_printk(struct inode *inode, const char *fmt, ...) vaf.fmt = printk_skip_level(fmt); vaf.va = &args; - printk("%c%cntfs3(%s): ino=%lx,%s %pV\n", KERN_SOH_ASCII, level, + printk("%c%cntfs3(%s): ino=%llx,%s %pV\n", KERN_SOH_ASCII, level, sb->s_id, inode->i_ino, name ? name : "", &vaf); va_end(args); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 344fd4d95fbc..d40f5d205bce 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7318,7 +7318,7 @@ start: * to check it up here before changing the tree. */ if (root_el->l_tree_depth && rec->e_int_clusters == 0) { - mlog(ML_ERROR, "Inode %lu has an empty " + mlog(ML_ERROR, "Inode %llu has an empty " "extent record, depth %u\n", inode->i_ino, le16_to_cpu(root_el->l_tree_depth)); status = ocfs2_remove_rightmost_empty_extent(osb, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 17ba79f443ee..c7ad912ec7a0 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -137,7 +137,7 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock, (unsigned long long)iblock, bh_result, create); if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) - mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", + mlog(ML_NOTICE, "get_block on system inode 0x%p (%llu)\n", inode, inode->i_ino); if (S_ISLNK(inode->i_mode)) { @@ -2146,7 +2146,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, ((iblock + ((len - 1) >> i_blkbits)) > endblk)) len = (endblk - iblock + 1) << i_blkbits; - mlog(0, "get block of %lu at %llu:%u req %u\n", + mlog(0, "get block of %llu at %llu:%u req %u\n", inode->i_ino, pos, len, total_len); /* diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 1c8abf2c592c..b82fe4431eb1 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -794,7 +794,7 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, if (le16_to_cpu(el->l_count) != ocfs2_extent_recs_per_dx_root(inode->i_sb)) { ret = ocfs2_error(inode->i_sb, - "Inode %lu has invalid extent list length %u\n", + "Inode %llu has invalid extent list length %u\n", inode->i_ino, le16_to_cpu(el->l_count)); goto out; } @@ -812,7 +812,7 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, if (el->l_tree_depth) { ret = ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in btree tree block %llu\n", + "Inode %llu has non zero tree depth in btree tree block %llu\n", inode->i_ino, (unsigned long long)eb_bh->b_blocknr); goto out; @@ -821,7 +821,7 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, if (le16_to_cpu(el->l_next_free_rec) == 0) { ret = ocfs2_error(inode->i_sb, - "Inode %lu has empty extent list at depth %u\n", + "Inode %llu has empty extent list at depth %u\n", inode->i_ino, le16_to_cpu(el->l_tree_depth)); goto out; @@ -839,7 +839,7 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, if (!found) { ret = ocfs2_error(inode->i_sb, - "Inode %lu has bad extent record (%u, %u, 0) in btree\n", + "Inode %llu has bad extent record (%u, %u, 0) in btree\n", inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 45cce261da65..5821e33df78f 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -123,7 +123,7 @@ static int dlmfs_file_open(struct inode *inode, if (S_ISDIR(inode->i_mode)) BUG(); - mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino, + mlog(0, "open called on inode %llu, flags 0x%x\n", inode->i_ino, file->f_flags); status = dlmfs_decode_open_flags(file->f_flags, &level, &flags); @@ -170,7 +170,7 @@ static int dlmfs_file_release(struct inode *inode, if (S_ISDIR(inode->i_mode)) BUG(); - mlog(0, "close called on inode %lu\n", inode->i_ino); + mlog(0, "close called on inode %llu\n", inode->i_ino); if (fp) { level = fp->fp_lock_level; @@ -242,7 +242,7 @@ static ssize_t dlmfs_file_write(struct file *filp, int bytes_left; struct inode *inode = file_inode(filp); - mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", + mlog(0, "inode %llu, count = %zu, *ppos = %llu\n", inode->i_ino, count, *ppos); if (*ppos >= DLM_LVB_LEN) @@ -301,7 +301,7 @@ static void dlmfs_evict_inode(struct inode *inode) clear_inode(inode); - mlog(0, "inode %lu\n", inode->i_ino); + mlog(0, "inode %llu\n", inode->i_ino); ip = DLMFS_I(inode); lockres = &ip->ip_lockres; @@ -490,7 +490,7 @@ static int dlmfs_unlink(struct inode *dir, int status; struct inode *inode = d_inode(dentry); - mlog(0, "unlink inode %lu\n", inode->i_ino); + mlog(0, "unlink inode %llu\n", inode->i_ino); /* if there are no current holders, or none that are waiting * to acquire a lock, this basically destroys our lockres. */ diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index d68229422dda..eb5dcd17d437 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -291,7 +291,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in leaf block %llu\n", + "Inode %llu has non zero tree depth in leaf block %llu\n", inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; @@ -427,7 +427,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in leaf block %llu\n", + "Inode %llu has non zero tree depth in leaf block %llu\n", inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; @@ -437,7 +437,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) { ocfs2_error(inode->i_sb, - "Inode %lu has an invalid extent (next_free_rec %u, count %u)\n", + "Inode %llu has an invalid extent (next_free_rec %u, count %u)\n", inode->i_ino, le16_to_cpu(el->l_next_free_rec), le16_to_cpu(el->l_count)); @@ -472,7 +472,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, if (!rec->e_blkno) { ocfs2_error(inode->i_sb, - "Inode %lu has bad extent record (%u, %u, 0)\n", + "Inode %llu has bad extent record (%u, %u, 0)\n", inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); @@ -561,7 +561,7 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in xattr leaf block %llu\n", + "Inode %llu has non zero tree depth in xattr leaf block %llu\n", inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; @@ -580,7 +580,7 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, if (!rec->e_blkno) { ocfs2_error(inode->i_sb, - "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", + "Inode %llu has bad extent record (%u, %u, 0) in xattr\n", inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 03a51662ea8e..26025ba2656c 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1196,7 +1196,7 @@ static void ocfs2_clear_inode(struct inode *inode) inode->i_nlink); mlog_bug_on_msg(osb == NULL, - "Inode=%lu\n", inode->i_ino); + "Inode=%llu\n", inode->i_ino); dquot_drop(inode); diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index c4e0117d8977..269b0f27d567 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -471,7 +471,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, qsize_t spacechange, inodechange; unsigned int memalloc; - trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type); + trace_ocfs2_recover_local_quota_file(lqinode->i_ino, type); list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) { chunk = rchunk->rc_chunk; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index c1cdececdfa4..6d7f44d3e929 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2341,7 +2341,7 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode, cpos, len, phys); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + ret = ocfs2_error(inode->i_sb, "Inode %llu want to use refcount tree, but the feature bit is not set in the super block\n", inode->i_ino); goto out; } @@ -2524,7 +2524,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + ret = ocfs2_error(inode->i_sb, "Inode %llu want to use refcount tree, but the feature bit is not set in the super block\n", inode->i_ino); goto out; } @@ -2650,7 +2650,7 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, if (el->l_tree_depth) { ret = ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in leaf block %llu\n", + "Inode %llu has non zero tree depth in leaf block %llu\n", inode->i_ino, (unsigned long long)eb_bh->b_blocknr); goto out; @@ -2662,7 +2662,7 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, rec = &el->l_recs[i]; if (ocfs2_is_empty_extent(rec)) { - mlog_bug_on_msg(i != 0, "Inode %lu has empty record in " + mlog_bug_on_msg(i != 0, "Inode %llu has empty record in " "index %d\n", inode->i_ino, i); continue; } @@ -3325,7 +3325,7 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (!ocfs2_refcount_tree(osb)) { - return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + return ocfs2_error(inode->i_sb, "Inode %llu want to use refcount tree, but the feature bit is not set in the super block\n", inode->i_ino); } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 42ee5db362d3..4d55ad963ac5 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -3741,7 +3741,7 @@ static int ocfs2_xattr_get_rec(struct inode *inode, if (el->l_tree_depth) { ret = ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in xattr tree block %llu\n", + "Inode %llu has non zero tree depth in xattr tree block %llu\n", inode->i_ino, (unsigned long long)eb_bh->b_blocknr); goto out; @@ -3758,7 +3758,7 @@ static int ocfs2_xattr_get_rec(struct inode *inode, } if (!e_blkno) { - ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", + ret = ocfs2_error(inode->i_sb, "Inode %llu has bad extent record (%u, %u, 0) in xattr\n", inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 2d4710d0e05e..9e8a2a9e5229 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -1062,7 +1062,7 @@ struct inode *orangefs_iget(struct super_block *sb, unlock_new_inode(inode); gossip_debug(GOSSIP_INODE_DEBUG, - "iget handle %pU, fsid %d hash %ld i_ino %lu\n", + "iget handle %pU, fsid %d hash %ld i_ino %llu\n", &ref->khandle, ref->fs_id, hash, diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 83f80fdb1567..0a35d1a20f13 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -262,7 +262,7 @@ out: return err; fail: - pr_warn_ratelimited("failed to encode file handle (ino=%lu, err=%i)\n", + pr_warn_ratelimited("failed to encode file handle (ino=%llu, err=%i)\n", inode->i_ino, err); goto out; } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index d8dd4b052984..ca899fdfaafd 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -591,7 +591,7 @@ out: fail: inode = d_inode(real); - pr_warn_ratelimited("failed to verify %s (%pd2, ino=%lu, err=%i)\n", + pr_warn_ratelimited("failed to verify %s (%pd2, ino=%llu, err=%i)\n", is_upper ? "upper" : "origin", real, inode ? inode->i_ino : 0, err); goto out; @@ -831,7 +831,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, index = NULL; goto out; } - pr_warn_ratelimited("failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n" + pr_warn_ratelimited("failed inode index lookup (ino=%llu, key=%.*s, err=%i);\n" "overlayfs: mount with '-o index=off' to disable inodes index.\n", d_inode(origin)->i_ino, name.len, name.name, err); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 3f1b763a8bb4..2edad9a14648 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1092,7 +1092,7 @@ static void ovl_cleanup_index(struct dentry *dentry) got_write = true; inode = d_inode(upperdentry); if (!S_ISDIR(inode->i_mode) && inode->i_nlink != 1) { - pr_warn_ratelimited("cleanup linked index (%pd2, ino=%lu, nlink=%u)\n", + pr_warn_ratelimited("cleanup linked index (%pd2, ino=%llu, nlink=%u)\n", upperdentry, inode->i_ino, inode->i_nlink); /* * We either have a bug with persistent union nlink or a lower diff --git a/fs/pipe.c b/fs/pipe.c index b44a756c0b41..9841648c9cf3 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -873,7 +873,7 @@ static struct vfsmount *pipe_mnt __ro_after_init; */ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) { - return dynamic_dname(buffer, buflen, "pipe:[%lu]", + return dynamic_dname(buffer, buflen, "pipe:[%llu]", d_inode(dentry)->i_ino); } diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 9eeccff49b2a..aae1a83e8846 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -54,7 +54,7 @@ static int seq_show(struct seq_file *m, void *v) if (ret) return ret; - seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%lu\n", + seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%llu\n", (long long)file->f_pos, f_flags, real_mount(file->f_path.mnt)->mnt_id, file_inode(file)->i_ino); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e091931d7ca1..751b9ba160fb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -442,7 +442,7 @@ static void get_vma_name(struct vm_area_struct *vma, static void show_vma_header_prefix(struct seq_file *m, unsigned long start, unsigned long end, vm_flags_t flags, unsigned long long pgoff, - dev_t dev, unsigned long ino) + dev_t dev, u64 ino) { seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); seq_put_hex_ll(m, NULL, start, 8); @@ -465,7 +465,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) const struct path *path; const char *name_fmt, *name; vm_flags_t flags = vma->vm_flags; - unsigned long ino = 0; + u64 ino = 0; unsigned long long pgoff = 0; unsigned long start, end; dev_t dev = 0; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 8aeb63d397cf..4deb0eeadbde 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -62,7 +62,7 @@ static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_h { unsigned long phys; - QNX4DEBUG((KERN_INFO "qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock)); + QNX4DEBUG((KERN_INFO "qnx4: qnx4_get_block inode=[%llu] iblock=[%ld]\n", inode->i_ino, iblock)); phys = qnx4_block_map( inode, iblock ); if ( phys ) { @@ -128,7 +128,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock ) brelse( bh ); } - QNX4DEBUG((KERN_INFO "qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block)); + QNX4DEBUG((KERN_INFO "qnx4: mapping block %ld of inode %llu = %ld\n", iblock, inode->i_ino, block)); return block; } diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index c4049bb8bd60..6de49333acad 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -75,7 +75,7 @@ static int qnx6_get_block(struct inode *inode, sector_t iblock, { unsigned phys; - pr_debug("qnx6_get_block inode=[%ld] iblock=[%ld]\n", + pr_debug("qnx6_get_block inode=[%llu] iblock=[%ld]\n", inode->i_ino, (unsigned long)iblock); phys = qnx6_block_map(inode, iblock); diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 160c16aa7b6e..5794de5a9069 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -230,7 +230,7 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) int count = 2; pr_err("Dump in-memory inode:"); - pr_err("\tinode %lu\n", inode->i_ino); + pr_err("\tinode %llu\n", inode->i_ino); pr_err("\tsize %llu\n", (unsigned long long)i_size_read(inode)); pr_err("\tnlink %u\n", inode->i_nlink); @@ -1101,7 +1101,7 @@ int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode) if (ui->ui_size != ui->synced_i_size && !ui->dirty) { ubifs_err(c, "ui_size is %lld, synced_i_size is %lld, but inode is clean", ui->ui_size, ui->synced_i_size); - ubifs_err(c, "i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino, + ubifs_err(c, "i_ino %llu, i_mode %#x, i_size %lld", inode->i_ino, inode->i_mode, i_size_read(inode)); dump_stack(); err = -EINVAL; @@ -1163,7 +1163,7 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) kfree(pdent); if (i_size_read(dir) != size) { - ubifs_err(c, "directory inode %lu has size %llu, but calculated size is %llu", + ubifs_err(c, "directory inode %llu has size %llu, but calculated size is %llu", dir->i_ino, (unsigned long long)i_size_read(dir), (unsigned long long)size); ubifs_dump_inode(c, dir); @@ -1171,7 +1171,7 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) return -EINVAL; } if (dir->i_nlink != nlink) { - ubifs_err(c, "directory inode %lu has nlink %u, but calculated nlink is %u", + ubifs_err(c, "directory inode %llu has nlink %u, but calculated nlink is %u", dir->i_ino, dir->i_nlink, nlink); ubifs_dump_inode(c, dir); dump_stack(); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 4c9f57f3b2ad..86d41e077e4d 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -223,7 +223,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, struct ubifs_info *c = dir->i_sb->s_fs_info; struct fscrypt_name nm; - dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino); + dbg_gen("'%pd' in dir ino %llu", dentry, dir->i_ino); err = fscrypt_prepare_lookup(dir, dentry, &nm); if (err == -ENOENT) @@ -281,7 +281,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, if (IS_ENCRYPTED(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { - ubifs_warn(c, "Inconsistent encryption contexts: %lu/%lu", + ubifs_warn(c, "Inconsistent encryption contexts: %llu/%llu", dir->i_ino, inode->i_ino); iput(inode); inode = ERR_PTR(-EPERM); @@ -318,7 +318,7 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir, * parent directory inode. */ - dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dbg_gen("dent '%pd', mode %#hx in dir ino %llu", dentry, mode, dir->i_ino); err = ubifs_budget_space(c, &req); @@ -386,7 +386,7 @@ static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry) * atomically. */ - dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dbg_gen("dent '%pd', mode %#hx in dir ino %llu", dentry, mode, dir->i_ino); inode = ubifs_new_inode(c, dir, mode, false); @@ -460,7 +460,7 @@ static int ubifs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, * be released via writeback. */ - dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dbg_gen("dent '%pd', mode %#hx in dir ino %llu", dentry, mode, dir->i_ino); err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); @@ -589,7 +589,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) bool encrypted = IS_ENCRYPTED(dir); struct ubifs_dir_data *data = file->private_data; - dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos); + dbg_gen("dir ino %llu, f_pos %#llx", dir->i_ino, ctx->pos); if (ctx->pos > UBIFS_S_KEY_HASH_MASK || ctx->pos == 2) /* @@ -764,7 +764,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, * changing the parent inode. */ - dbg_gen("dent '%pd' to ino %lu (nlink %d) in dir ino %lu", + dbg_gen("dent '%pd' to ino %llu (nlink %d) in dir ino %llu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); ubifs_assert(c, inode_is_locked(dir)); @@ -836,7 +836,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) * deletions. */ - dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu", + dbg_gen("dent '%pd' from ino %llu (nlink %d) in dir ino %llu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); @@ -941,7 +941,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) * because we have extra space reserved for deletions. */ - dbg_gen("directory '%pd', ino %lu in dir ino %lu", dentry, + dbg_gen("directory '%pd', ino %llu in dir ino %llu", dentry, inode->i_ino, dir->i_ino); ubifs_assert(c, inode_is_locked(dir)); ubifs_assert(c, inode_is_locked(inode)); @@ -1018,7 +1018,7 @@ static struct dentry *ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, * directory inode. */ - dbg_gen("dent '%pd', mode %#hx in dir ino %lu", + dbg_gen("dent '%pd', mode %#hx in dir ino %llu", dentry, mode, dir->i_ino); err = ubifs_budget_space(c, &req); @@ -1096,7 +1096,7 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, * directory inode. */ - dbg_gen("dent '%pd' in dir ino %lu", dentry, dir->i_ino); + dbg_gen("dent '%pd' in dir ino %llu", dentry, dir->i_ino); if (S_ISBLK(mode) || S_ISCHR(mode)) { dev = kmalloc_obj(union ubifs_dev_desc, GFP_NOFS); @@ -1183,7 +1183,7 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, .dirtied_ino = 1 }; struct fscrypt_name nm; - dbg_gen("dent '%pd', target '%s' in dir ino %lu", dentry, + dbg_gen("dent '%pd', target '%s' in dir ino %llu", dentry, symname, dir->i_ino); err = fscrypt_prepare_symlink(dir, symname, len, UBIFS_MAX_INO_DATA, @@ -1349,7 +1349,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, * ino_req: marks the target inode as dirty and does not write it. */ - dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu flags 0x%x", + dbg_gen("dent '%pd' ino %llu in dir ino %llu to dent '%pd' in dir ino %llu flags 0x%x", old_dentry, old_inode->i_ino, old_dir->i_ino, new_dentry, new_dir->i_ino, flags); @@ -1597,7 +1597,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, * parent directory inodes. */ - dbg_gen("dent '%pd' ino %lu in dir ino %lu exchange dent '%pd' ino %lu in dir ino %lu", + dbg_gen("dent '%pd' ino %llu in dir ino %llu exchange dent '%pd' ino %llu in dir ino %llu", old_dentry, fst_inode->i_ino, old_dir->i_ino, new_dentry, snd_inode->i_ino, new_dir->i_ino); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index cd04755e792a..e73c28b12f97 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -90,7 +90,7 @@ static int read_block(struct inode *inode, struct folio *folio, size_t offset, return 0; dump: - ubifs_err(c, "bad data node (block %u, inode %lu)", + ubifs_err(c, "bad data node (block %u, inode %llu)", block, inode->i_ino); ubifs_dump_node(c, dn, UBIFS_MAX_DATA_NODE_SZ); return -EINVAL; @@ -106,7 +106,7 @@ static int do_readpage(struct folio *folio) loff_t i_size = i_size_read(inode); size_t offset = 0; - dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", + dbg_gen("ino %llu, pg %lu, i_size %lld, flags %#lx", inode->i_ino, folio->index, i_size, folio->flags.f); ubifs_assert(c, !folio_test_checked(folio)); ubifs_assert(c, !folio->private); @@ -162,7 +162,7 @@ static int do_readpage(struct folio *folio) dbg_gen("hole"); err = 0; } else { - ubifs_err(c, "cannot read page %lu of inode %lu, error %d", + ubifs_err(c, "cannot read page %lu of inode %llu, error %d", folio->index, inode->i_ino, err); } } @@ -212,7 +212,7 @@ static int write_begin_slow(struct address_space *mapping, int err, appending = !!(pos + len > inode->i_size); struct folio *folio; - dbg_gen("ino %lu, pos %llu, len %u, i_size %lld", + dbg_gen("ino %llu, pos %llu, len %u, i_size %lld", inode->i_ino, pos, len, inode->i_size); /* @@ -526,7 +526,7 @@ static int ubifs_write_end(const struct kiocb *iocb, loff_t end_pos = pos + len; int appending = !!(end_pos > inode->i_size); - dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld", + dbg_gen("ino %llu, pos %llu, pg %lu, len %u, copied %d, i_size %lld", inode->i_ino, pos, folio->index, len, copied, inode->i_size); if (unlikely(copied < len && !folio_test_uptodate(folio))) { @@ -599,7 +599,7 @@ static int populate_page(struct ubifs_info *c, struct folio *folio, size_t offset = 0; pgoff_t end_index; - dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", + dbg_gen("ino %llu, pg %lu, i_size %lld, flags %#lx", inode->i_ino, folio->index, i_size, folio->flags.f); end_index = (i_size - 1) >> PAGE_SHIFT; @@ -680,7 +680,7 @@ out_hole: return 0; out_err: - ubifs_err(c, "bad data node (block %u, inode %lu)", + ubifs_err(c, "bad data node (block %u, inode %llu)", page_block, inode->i_ino); return -EINVAL; } @@ -913,7 +913,7 @@ static int do_writepage(struct folio *folio, size_t len) } if (err) { mapping_set_error(folio->mapping, err); - ubifs_err(c, "cannot write folio %lu of inode %lu, error %d", + ubifs_err(c, "cannot write folio %lu of inode %llu, error %d", folio->index, inode->i_ino, err); ubifs_ro_mode(c, err); } @@ -987,7 +987,7 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc) loff_t i_size = i_size_read(inode), synced_i_size; int err, len = folio_size(folio); - dbg_gen("ino %lu, pg %lu, pg flags %#lx", + dbg_gen("ino %llu, pg %lu, pg flags %#lx", inode->i_ino, folio->index, folio->flags.f); ubifs_assert(c, folio->private != NULL); @@ -1106,7 +1106,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1; struct ubifs_inode *ui = ubifs_inode(inode); - dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); + dbg_gen("ino %llu, size %lld -> %lld", inode->i_ino, old_size, new_size); memset(&req, 0, sizeof(struct ubifs_budget_req)); /* @@ -1258,7 +1258,7 @@ int ubifs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode = d_inode(dentry); struct ubifs_info *c = inode->i_sb->s_fs_info; - dbg_gen("ino %lu, mode %#x, ia_valid %#x", + dbg_gen("ino %llu, mode %#x, ia_valid %#x", inode->i_ino, inode->i_mode, attr->ia_valid); err = setattr_prepare(&nop_mnt_idmap, dentry, attr); if (err) @@ -1308,7 +1308,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct ubifs_info *c = inode->i_sb->s_fs_info; int err; - dbg_gen("syncing inode %lu", inode->i_ino); + dbg_gen("syncing inode %llu", inode->i_ino); if (c->ro_mount) /* @@ -1495,7 +1495,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) struct ubifs_budget_req req = { .new_page = 1 }; int err, update_time; - dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, folio->index, + dbg_gen("ino %llu, pg %lu, i_size %lld", inode->i_ino, folio->index, i_size_read(inode)); ubifs_assert(c, !c->ro_media && !c->ro_mount); @@ -1531,7 +1531,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) err = ubifs_budget_space(c, &req); if (unlikely(err)) { if (err == -ENOSPC) - ubifs_warn(c, "out of space for mmapped file (inode number %lu)", + ubifs_warn(c, "out of space for mmapped file (inode number %llu)", inode->i_ino); return VM_FAULT_SIGBUS; } diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index e28ab4395e5c..40a95a2fad50 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -982,7 +982,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) int kill_xattrs = ui->xattr_cnt && last_reference; u8 hash[UBIFS_HASH_ARR_SZ]; - dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink); + dbg_jnl("ino %llu, nlink %u", inode->i_ino, inode->i_nlink); if (kill_xattrs && ui->xattr_cnt > ubifs_xattr_max_cnt(c)) { ubifs_err(c, "Cannot delete inode, it has too many xattrs!"); @@ -1743,7 +1743,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, int dn_len = le32_to_cpu(dn->size); if (dn_len <= 0 || dn_len > UBIFS_BLOCK_SIZE) { - ubifs_err(c, "bad data node (block %u, inode %lu)", + ubifs_err(c, "bad data node (block %u, inode %llu)", blk, inode->i_ino); ubifs_dump_node(c, dn, dn_size); err = -EUCLEAN; @@ -1987,7 +1987,7 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, u8 hash_host[UBIFS_HASH_ARR_SZ]; u8 hash[UBIFS_HASH_ARR_SZ]; - dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino); + dbg_jnl("ino %llu, ino %llu", host->i_ino, inode->i_ino); ubifs_assert(c, inode->i_nlink > 0); ubifs_assert(c, mutex_is_locked(&host_ui->ui_mutex)); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 03bf924756ca..9a77d8b64ffa 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -92,7 +92,7 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode) return 5; if (!ubifs_compr_present(c, ui->compr_type)) { - ubifs_warn(c, "inode %lu uses '%s' compression, but it was not compiled in", + ubifs_warn(c, "inode %llu uses '%s' compression, but it was not compiled in", inode->i_ino, ubifs_compr_name(c, ui->compr_type)); } @@ -248,14 +248,14 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) return inode; out_invalid: - ubifs_err(c, "inode %lu validation failed, error %d", inode->i_ino, err); + ubifs_err(c, "inode %llu validation failed, error %d", inode->i_ino, err); ubifs_dump_node(c, ino, UBIFS_MAX_INO_NODE_SZ); ubifs_dump_inode(c, inode); err = -EINVAL; out_ino: kfree(ino); out: - ubifs_err(c, "failed to read inode %lu, error %d", inode->i_ino, err); + ubifs_err(c, "failed to read inode %llu, error %d", inode->i_ino, err); iget_failed(inode); return ERR_PTR(err); } @@ -316,12 +316,12 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) * As an optimization, do not write orphan inodes to the media just * because this is not needed. */ - dbg_gen("inode %lu, mode %#x, nlink %u", + dbg_gen("inode %llu, mode %#x, nlink %u", inode->i_ino, (int)inode->i_mode, inode->i_nlink); if (inode->i_nlink) { err = ubifs_jnl_write_inode(c, inode); if (err) - ubifs_err(c, "can't write inode %lu, error %d", + ubifs_err(c, "can't write inode %llu, error %d", inode->i_ino, err); else err = dbg_check_inode_size(c, inode, ui->ui_size); @@ -357,7 +357,7 @@ static void ubifs_evict_inode(struct inode *inode) */ goto out; - dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); + dbg_gen("inode %llu, mode %#x", inode->i_ino, (int)inode->i_mode); ubifs_assert(c, !icount_read(inode)); truncate_inode_pages_final(&inode->i_data); @@ -375,7 +375,7 @@ static void ubifs_evict_inode(struct inode *inode) * Worst case we have a lost orphan inode wasting space, so a * simple error message is OK here. */ - ubifs_err(c, "can't delete inode %lu, error %d", + ubifs_err(c, "can't delete inode %llu, error %d", inode->i_ino, err); out: @@ -399,7 +399,7 @@ static void ubifs_dirty_inode(struct inode *inode, int flags) ubifs_assert(c, mutex_is_locked(&ui->ui_mutex)); if (!ui->dirty) { ui->dirty = 1; - dbg_gen("inode %lu", inode->i_ino); + dbg_gen("inode %llu", inode->i_ino); } } diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 694b08d27d7d..c9d8935f6678 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -3561,8 +3561,8 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, out_dump: block = key_block(c, key); - ubifs_err(c, "inode %lu has size %lld, but there are data at offset %lld", - (unsigned long)inode->i_ino, size, + ubifs_err(c, "inode %llu has size %lld, but there are data at offset %lld", + inode->i_ino, size, ((loff_t)block) << UBIFS_BLOCK_SHIFT); mutex_unlock(&c->tnc_mutex); ubifs_dump_inode(c, inode); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index c21a0c2b3e90..b5a9ab9d8a10 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -76,7 +76,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; if (host_ui->xattr_cnt >= ubifs_xattr_max_cnt(c)) { - ubifs_err(c, "inode %lu already has too many xattrs (%d), cannot create more", + ubifs_err(c, "inode %llu already has too many xattrs (%d), cannot create more", host->i_ino, host_ui->xattr_cnt); return -ENOSPC; } @@ -88,7 +88,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, */ names_len = host_ui->xattr_names + host_ui->xattr_cnt + fname_len(nm) + 1; if (names_len > XATTR_LIST_MAX) { - ubifs_err(c, "cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d", + ubifs_err(c, "cannot add one more xattr name to inode %llu, total names length would become %d, max. is %d", host->i_ino, names_len, XATTR_LIST_MAX); return -ENOSPC; } @@ -390,7 +390,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) int err, len, written = 0; struct fscrypt_name nm = {0}; - dbg_gen("ino %lu ('%pd'), buffer size %zd", host->i_ino, + dbg_gen("ino %llu ('%pd'), buffer size %zd", host->i_ino, dentry, size); down_read(&host_ui->xattr_sem); @@ -498,7 +498,7 @@ int ubifs_purge_xattrs(struct inode *host) if (ubifs_inode(host)->xattr_cnt <= ubifs_xattr_max_cnt(c)) return 0; - ubifs_warn(c, "inode %lu has too many xattrs, doing a non-atomic deletion", + ubifs_warn(c, "inode %llu has too many xattrs, doing a non-atomic deletion", host->i_ino); down_write(&ubifs_inode(host)->xattr_sem); @@ -641,7 +641,7 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode, &init_xattrs, NULL); if (err) { struct ubifs_info *c = dentry->i_sb->s_fs_info; - ubifs_err(c, "cannot initialize security for inode %lu, error %d", + ubifs_err(c, "cannot initialize security for inode %llu, error %d", inode->i_ino, err); } return err; @@ -652,7 +652,7 @@ static int xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { - dbg_gen("xattr '%s', ino %lu ('%pd'), buf size %zd", name, + dbg_gen("xattr '%s', ino %llu ('%pd'), buf size %zd", name, inode->i_ino, dentry, size); name = xattr_full_name(handler, name); @@ -665,7 +665,7 @@ static int xattr_set(const struct xattr_handler *handler, const char *name, const void *value, size_t size, int flags) { - dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", + dbg_gen("xattr '%s', host ino %llu ('%pd'), size %zd", name, inode->i_ino, dentry, size); name = xattr_full_name(handler, name); diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 632453aa3893..f5c81e13eacb 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -22,7 +22,7 @@ static int udf_verify_fi(struct udf_fileident_iter *iter) if (iter->fi.descTag.tagIdent != cpu_to_le16(TAG_IDENT_FID)) { udf_err(iter->dir->i_sb, - "directory (ino %lu) has entry at pos %llu with incorrect tag %x\n", + "directory (ino %llu) has entry at pos %llu with incorrect tag %x\n", iter->dir->i_ino, (unsigned long long)iter->pos, le16_to_cpu(iter->fi.descTag.tagIdent)); return -EFSCORRUPTED; @@ -30,7 +30,7 @@ static int udf_verify_fi(struct udf_fileident_iter *iter) len = udf_dir_entry_len(&iter->fi); if (le16_to_cpu(iter->fi.lengthOfImpUse) & 3) { udf_err(iter->dir->i_sb, - "directory (ino %lu) has entry at pos %llu with unaligned length of impUse field\n", + "directory (ino %llu) has entry at pos %llu with unaligned length of impUse field\n", iter->dir->i_ino, (unsigned long long)iter->pos); return -EFSCORRUPTED; } @@ -41,20 +41,20 @@ static int udf_verify_fi(struct udf_fileident_iter *iter) */ if (len > 1 << iter->dir->i_blkbits) { udf_err(iter->dir->i_sb, - "directory (ino %lu) has too big (%u) entry at pos %llu\n", + "directory (ino %llu) has too big (%u) entry at pos %llu\n", iter->dir->i_ino, len, (unsigned long long)iter->pos); return -EFSCORRUPTED; } if (iter->pos + len > iter->dir->i_size) { udf_err(iter->dir->i_sb, - "directory (ino %lu) has entry past directory size at pos %llu\n", + "directory (ino %llu) has entry past directory size at pos %llu\n", iter->dir->i_ino, (unsigned long long)iter->pos); return -EFSCORRUPTED; } if (udf_dir_entry_len(&iter->fi) != sizeof(struct tag) + le16_to_cpu(iter->fi.descTag.descCRCLength)) { udf_err(iter->dir->i_sb, - "directory (ino %lu) has entry where CRC length (%u) does not match entry length (%u)\n", + "directory (ino %llu) has entry where CRC length (%u) does not match entry length (%u)\n", iter->dir->i_ino, (unsigned)le16_to_cpu(iter->fi.descTag.descCRCLength), (unsigned)(udf_dir_entry_len(&iter->fi) - @@ -78,7 +78,7 @@ static int udf_copy_fi(struct udf_fileident_iter *iter) } if (iter->dir->i_size < iter->pos + sizeof(struct fileIdentDesc)) { udf_err(iter->dir->i_sb, - "directory (ino %lu) has entry straddling EOF\n", + "directory (ino %llu) has entry straddling EOF\n", iter->dir->i_ino); return -EFSCORRUPTED; } @@ -184,7 +184,7 @@ static int udf_fiiter_advance_blk(struct udf_fileident_iter *iter) return 0; } udf_err(iter->dir->i_sb, - "extent after position %llu not allocated in directory (ino %lu)\n", + "extent after position %llu not allocated in directory (ino %llu)\n", (unsigned long long)iter->pos, iter->dir->i_ino); return -EFSCORRUPTED; } @@ -272,7 +272,7 @@ int udf_fiiter_init(struct udf_fileident_iter *iter, struct inode *dir, if (pos == dir->i_size) return 0; udf_err(dir->i_sb, - "position %llu not allocated in directory (ino %lu)\n", + "position %llu not allocated in directory (ino %llu)\n", (unsigned long long)pos, dir->i_ino); err = -EFSCORRUPTED; goto out; @@ -483,7 +483,7 @@ int udf_fiiter_append_blk(struct udf_fileident_iter *iter) &iter->loffset, &etype); if (err <= 0 || etype != (EXT_RECORDED_ALLOCATED >> 30)) { udf_err(iter->dir->i_sb, - "block %llu not allocated in directory (ino %lu)\n", + "block %llu not allocated in directory (ino %llu)\n", (unsigned long long)block, iter->dir->i_ino); return -EFSCORRUPTED; } diff --git a/fs/udf/file.c b/fs/udf/file.c index 32ae7cfd72c5..b043fe10e5d6 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -133,7 +133,7 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) int result; if (file_permission(filp, MAY_READ) != 0) { - udf_debug("no permission to access inode %lu\n", inode->i_ino); + udf_debug("no permission to access inode %llu\n", inode->i_ino); return -EPERM; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7fae8002344a..902f81729bd8 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -147,7 +147,7 @@ void udf_evict_inode(struct inode *inode) if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && inode->i_size != iinfo->i_lenExtents) { udf_warn(inode->i_sb, - "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n", + "Inode %llu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n", inode->i_ino, inode->i_mode, (unsigned long long)inode->i_size, (unsigned long long)iinfo->i_lenExtents); @@ -1386,13 +1386,13 @@ reread: */ bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident); if (!bh) { - udf_err(inode->i_sb, "(ino %lu) failed !bh\n", inode->i_ino); + udf_err(inode->i_sb, "(ino %llu) failed !bh\n", inode->i_ino); return -EIO; } if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && ident != TAG_IDENT_USE) { - udf_err(inode->i_sb, "(ino %lu) failed ident=%u\n", + udf_err(inode->i_sb, "(ino %llu) failed ident=%u\n", inode->i_ino, ident); goto out; } @@ -1641,7 +1641,7 @@ reread: udf_debug("METADATA BITMAP FILE-----\n"); break; default: - udf_err(inode->i_sb, "(ino %lu) failed unknown file type=%u\n", + udf_err(inode->i_sb, "(ino %llu) failed unknown file type=%u\n", inode->i_ino, fe->icbTag.fileType); goto out; } @@ -1942,7 +1942,7 @@ finish: if (do_sync) { sync_dirty_buffer(bh); if (buffer_write_io_error(bh)) { - udf_warn(inode->i_sb, "IO error syncing udf inode [%08lx]\n", + udf_warn(inode->i_sb, "IO error syncing udf inode [%08llx]\n", inode->i_ino); err = -EIO; } @@ -2224,7 +2224,7 @@ int udf_next_aext(struct inode *inode, struct extent_position *epos, if (++indirections > UDF_MAX_INDIR_EXTS) { udf_err(inode->i_sb, - "too many indirect extents in inode %lu\n", + "too many indirect extents in inode %llu\n", inode->i_ino); return -EFSCORRUPTED; } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 5f2e9a892bff..ccafcaa96809 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -550,7 +550,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) goto end_unlink; if (!inode->i_nlink) { - udf_debug("Deleting nonexistent file (%lu), %u\n", + udf_debug("Deleting nonexistent file (%llu), %u\n", inode->i_ino, inode->i_nlink); set_nlink(inode, 1); } @@ -809,7 +809,7 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, &diriter); if (retval == -ENOENT) { udf_err(old_inode->i_sb, - "directory (ino %lu) has no '..' entry\n", + "directory (ino %llu) has no '..' entry\n", old_inode->i_ino); retval = -EFSCORRUPTED; } @@ -821,7 +821,7 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, old_dir->i_ino) { retval = -EFSCORRUPTED; udf_err(old_inode->i_sb, - "directory (ino %lu) has parent entry pointing to another inode (%lu != %u)\n", + "directory (ino %llu) has parent entry pointing to another inode (%llu != %u)\n", old_inode->i_ino, old_dir->i_ino, udf_get_lb_pblock(old_inode->i_sb, &tloc, 0)); goto out_oiter; @@ -869,7 +869,7 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, retval = udf_fiiter_find_entry(old_dir, &old_dentry->d_name, &oiter); if (retval) { udf_err(old_dir->i_sb, - "failed to find renamed entry again in directory (ino %lu)\n", + "failed to find renamed entry again in directory (ino %llu)\n", old_dir->i_ino); } else { udf_fiiter_delete_entry(&oiter); diff --git a/fs/udf/super.c b/fs/udf/super.c index 27f463fd1d89..3a2d66c7e856 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1166,7 +1166,7 @@ static int udf_fill_partdesc_info(struct super_block *sb, } map->s_uspace.s_table = inode; map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; - udf_debug("unallocSpaceTable (part %d) @ %lu\n", + udf_debug("unallocSpaceTable (part %d) @ %llu\n", p_index, map->s_uspace.s_table->i_ino); } diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 194ed3ab945e..628edfde3a9f 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -245,7 +245,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, sector_t end, i; struct buffer_head *head, *bh; - UFSD("ENTER, ino %lu, count %u, oldb %llu, newb %llu\n", + UFSD("ENTER, ino %llu, count %u, oldb %llu, newb %llu\n", inode->i_ino, count, (unsigned long long)oldb, (unsigned long long)newb); @@ -340,7 +340,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, unsigned cgno, oldcount, newcount; u64 tmp, request, result; - UFSD("ENTER, ino %lu, fragment %llu, goal %llu, count %u\n", + UFSD("ENTER, ino %llu, fragment %llu, goal %llu, count %u\n", inode->i_ino, (unsigned long long)fragment, (unsigned long long)goal, count); @@ -583,7 +583,7 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno, unsigned oldcg, i, j, k, allocsize; u64 result; - UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", + UFSD("ENTER, ino %llu, cgno %u, goal %llu, count %u\n", inode->i_ino, cgno, (unsigned long long)goal, count); sb = inode->i_sb; diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 43f1578ab866..f10a50f7e78b 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -150,7 +150,7 @@ out: Ebadsize: ufs_error(sb, __func__, - "size of directory #%lu is not a multiple of chunk size", + "size of directory #%llu is not a multiple of chunk size", dir->i_ino ); goto fail; @@ -169,7 +169,7 @@ Espan: Einumber: error = "inode out of bounds"; bad_entry: - ufs_error(sb, __func__, "bad entry in directory #%lu: %s - " + ufs_error(sb, __func__, "bad entry in directory #%llu: %s - " "offset=%llu, rec_len=%d, name_len=%d", dir->i_ino, error, folio_pos(folio) + offs, rec_len, ufs_get_de_namlen(sb, p)); @@ -177,7 +177,7 @@ bad_entry: Eend: p = (struct ufs_dir_entry *)(kaddr + offs); ufs_error(sb, __func__, - "entry in directory #%lu spans the page boundary" + "entry in directory #%llu spans the page boundary" "offset=%llu", dir->i_ino, folio_pos(folio) + offs); fail: @@ -258,7 +258,7 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, struct ufs_inode_info *ui = UFS_I(dir); struct ufs_dir_entry *de; - UFSD("ENTER, dir_ino %lu, name %s, namlen %u\n", dir->i_ino, name, namelen); + UFSD("ENTER, dir_ino %llu, name %s, namlen %u\n", dir->i_ino, name, namelen); if (npages == 0 || namelen > UFS_MAXNAMLEN) goto out; @@ -434,7 +434,7 @@ ufs_readdir(struct file *file, struct dir_context *ctx) if (IS_ERR(kaddr)) { ufs_error(sb, __func__, - "bad page in #%lu", + "bad page in #%llu", inode->i_ino); ctx->pos += PAGE_SIZE - offset; return PTR_ERR(kaddr); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 73531827ecee..8e51f4630d18 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -63,7 +63,7 @@ void ufs_free_inode (struct inode * inode) int is_directory; unsigned ino, cg, bit; - UFSD("ENTER, ino %lu\n", inode->i_ino); + UFSD("ENTER, ino %llu\n", inode->i_ino); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; @@ -317,7 +317,7 @@ cg_found: bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino)); if (!bh) { ufs_warning(sb, "ufs_read_inode", - "unable to read inode %lu\n", + "unable to read inode %llu\n", inode->i_ino); err = -EIO; goto fail_remove_inode; @@ -336,7 +336,7 @@ cg_found: } mutex_unlock(&sbi->s_lock); - UFSD("allocating inode %lu\n", inode->i_ino); + UFSD("allocating inode %llu\n", inode->i_ino); UFSD("EXIT\n"); return inode; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index e2b0a35de2a7..2a8728c87979 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -400,7 +400,7 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff mutex_lock(&UFS_I(inode)->truncate_mutex); - UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); + UFSD("ENTER, ino %llu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); if (unlikely(!depth)) { ufs_warning(sb, "ufs_get_block", "block > big"); err = -EIO; @@ -595,7 +595,7 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) struct super_block *sb = inode->i_sb; umode_t mode; - UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino); + UFSD("Reading ufs2 inode, ino %llu\n", inode->i_ino); /* * Copy data to the in-core inode. */ @@ -662,7 +662,7 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino) bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino)); if (!bh) { - ufs_warning(sb, "ufs_read_inode", "unable to read inode %lu\n", + ufs_warning(sb, "ufs_read_inode", "unable to read inode %llu\n", inode->i_ino); goto bad_inode; } @@ -793,17 +793,17 @@ static int ufs_update_inode(struct inode * inode, int do_sync) struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct buffer_head * bh; - UFSD("ENTER, ino %lu\n", inode->i_ino); + UFSD("ENTER, ino %llu\n", inode->i_ino); if (inode->i_ino < UFS_ROOTINO || inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) { - ufs_warning (sb, "ufs_read_inode", "bad inode number (%lu)\n", inode->i_ino); + ufs_warning (sb, "ufs_read_inode", "bad inode number (%llu)\n", inode->i_ino); return -1; } bh = sb_bread(sb, ufs_inotofsba(inode->i_ino)); if (!bh) { - ufs_warning (sb, "ufs_read_inode", "unable to read inode %lu\n", inode->i_ino); + ufs_warning (sb, "ufs_read_inode", "unable to read inode %llu\n", inode->i_ino); return -1; } if (uspi->fs_magic == UFS2_MAGIC) { @@ -891,7 +891,7 @@ static void ufs_trunc_direct(struct inode *inode) unsigned int old_tail, new_tail; struct to_free ctx = {.inode = inode}; - UFSD("ENTER: ino %lu\n", inode->i_ino); + UFSD("ENTER: ino %llu\n", inode->i_ino); new_frags = DIRECT_FRAGMENT; // new_frags = first fragment past the new EOF @@ -956,7 +956,7 @@ static void ufs_trunc_direct(struct inode *inode) } } done: - UFSD("EXIT: ino %lu\n", inode->i_ino); + UFSD("EXIT: ino %llu\n", inode->i_ino); } static void free_full_branch(struct inode *inode, u64 ind_block, int depth) @@ -1169,7 +1169,7 @@ static int ufs_truncate(struct inode *inode, loff_t size) { int err = 0; - UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n", + UFSD("ENTER: ino %llu, i_size: %llu, old_i_size: %llu\n", inode->i_ino, (unsigned long long)size, (unsigned long long)i_size_read(inode)); diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h index 0905f9a16b91..b8dc354ae90f 100644 --- a/fs/ufs/ufs_fs.h +++ b/fs/ufs/ufs_fs.h @@ -226,10 +226,10 @@ typedef __u16 __bitwise __fs16; * inode number to cylinder group number. * inode number to file system block address. */ -#define ufs_inotocg(x) ((x) / uspi->s_ipg) -#define ufs_inotocgoff(x) ((x) % uspi->s_ipg) +#define ufs_inotocg(x) ((unsigned int)(x) / uspi->s_ipg) +#define ufs_inotocgoff(x) ((unsigned int)(x) % uspi->s_ipg) #define ufs_inotofsba(x) (((u64)ufs_cgimin(ufs_inotocg(x))) + ufs_inotocgoff(x) / uspi->s_inopf) -#define ufs_inotofsbo(x) ((x) % uspi->s_inopf) +#define ufs_inotofsbo(x) ((unsigned int)(x) % uspi->s_inopf) /* * Compute the cylinder and rotational position of a cyl block addr. diff --git a/fs/ufs/util.c b/fs/ufs/util.c index 034b1d82c355..dff6f74618de 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -203,7 +203,7 @@ struct folio *ufs_get_locked_folio(struct address_space *mapping, folio = read_mapping_folio(mapping, index, NULL); if (IS_ERR(folio)) { - printk(KERN_ERR "ufs_change_blocknr: read_mapping_folio error: ino %lu, index: %lu\n", + printk(KERN_ERR "ufs_change_blocknr: read_mapping_folio error: ino %llu, index: %lu\n", mapping->host->i_ino, index); return folio; } diff --git a/fs/verity/init.c b/fs/verity/init.c index d65206608583..3aa55dec88fc 100644 --- a/fs/verity/init.c +++ b/fs/verity/init.c @@ -50,7 +50,7 @@ void fsverity_msg(const struct inode *inode, const char *level, vaf.fmt = fmt; vaf.va = &args; if (inode) - printk("%sfs-verity (%s, inode %lu): %pV\n", + printk("%sfs-verity (%s, inode %llu): %pV\n", level, inode->i_sb->s_id, inode->i_ino, &vaf); else printk("%sfs-verity: %pV\n", level, &vaf); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index e83b2ec5e49f..9b646cb5335d 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -297,7 +297,7 @@ static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone, */ if (isize != data_size) zonefs_warn(sb, - "inode %lu: invalid size %lld (should be %lld)\n", + "inode %llu: invalid size %lld (should be %lld)\n", inode->i_ino, isize, data_size); /* @@ -308,7 +308,7 @@ static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone, */ if ((z->z_flags & ZONEFS_ZONE_OFFLINE) || (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)) { - zonefs_warn(sb, "inode %lu: read/write access disabled\n", + zonefs_warn(sb, "inode %llu: read/write access disabled\n", inode->i_ino); if (!(z->z_flags & ZONEFS_ZONE_OFFLINE)) z->z_flags |= ZONEFS_ZONE_OFFLINE; @@ -316,7 +316,7 @@ static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone, data_size = 0; } else if ((z->z_flags & ZONEFS_ZONE_READONLY) || (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)) { - zonefs_warn(sb, "inode %lu: write access disabled\n", + zonefs_warn(sb, "inode %llu: write access disabled\n", inode->i_ino); if (!(z->z_flags & ZONEFS_ZONE_READONLY)) z->z_flags |= ZONEFS_ZONE_READONLY; @@ -402,7 +402,7 @@ void __zonefs_io_error(struct inode *inode, bool write) memalloc_noio_restore(noio_flag); if (ret != 1) { - zonefs_err(sb, "Get inode %lu zone information failed %d\n", + zonefs_err(sb, "Get inode %llu zone information failed %d\n", inode->i_ino, ret); zonefs_warn(sb, "remounting filesystem read-only\n"); sb->s_flags |= SB_RDONLY; diff --git a/include/linux/fs.h b/include/linux/fs.h index b4f5e5fdbe4b..e820c14f9c5a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -783,7 +783,7 @@ struct inode { #endif /* Stat data, not accessed from path walking */ - unsigned long i_ino; + u64 i_ino; /* * Filesystems may only read i_nlink directly. They shall use the * following functions for modification: diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 923b24b321cc..4084e926e284 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -344,7 +344,7 @@ out: static void update_ref_ctr_warn(struct uprobe *uprobe, struct mm_struct *mm, short d) { - pr_warn("ref_ctr %s failed for inode: 0x%lx offset: " + pr_warn("ref_ctr %s failed for inode: 0x%llx offset: " "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n", d > 0 ? "increment" : "decrement", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, @@ -982,7 +982,7 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) static void ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe) { - pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx " + pr_warn("ref_ctr_offset mismatch. inode: 0x%llx offset: 0x%llx " "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n", uprobe->inode->i_ino, (unsigned long long) uprobe->offset, (unsigned long long) cur_uprobe->ref_ctr_offset, diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index b816c56124ab..5fc54836dfa8 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1305,7 +1305,7 @@ static int nr_info_show(struct seq_file *seq, void *v) seq_printf(seq, "%-9s ", ax2asc(buf, &nr->user_addr)); seq_printf(seq, "%-9s ", ax2asc(buf, &nr->dest_addr)); seq_printf(seq, -"%-9s %-3s %02X/%02X %02X/%02X %2d %3d %3d %3d %3lu/%03lu %2lu/%02lu %3lu/%03lu %3lu/%03lu %2d/%02d %3d %5d %5d %ld\n", +"%-9s %-3s %02X/%02X %02X/%02X %2d %3d %3d %3d %3lu/%03lu %2lu/%02lu %3lu/%03lu %3lu/%03lu %2d/%02d %3d %5d %5d %llu\n", ax2asc(buf, &nr->source_addr), devname, nr->my_index, @@ -1329,7 +1329,7 @@ static int nr_info_show(struct seq_file *seq, void *v) nr->window, sk_wmem_alloc_get(s), sk_rmem_alloc_get(s), - s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0L); + s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : (u64)0); bh_unlock_sock(s); } diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 841d62481048..53557176b41e 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1479,7 +1479,7 @@ static int rose_info_show(struct seq_file *seq, void *v) callsign = ax2asc(buf, &rose->source_call); seq_printf(seq, - "%-10s %-9s %-5s %3.3X %05d %d %d %d %d %3lu %3lu %3lu %3lu %3lu %3lu/%03lu %5d %5d %ld\n", + "%-10s %-9s %-5s %3.3X %05d %d %d %d %d %3lu %3lu %3lu %3lu %3lu %3lu/%03lu %5d %5d %llu\n", rose2asc(rsbuf, &rose->source_addr), callsign, devname, @@ -1498,7 +1498,7 @@ static int rose_info_show(struct seq_file *seq, void *v) rose->idle / (60 * HZ), sk_wmem_alloc_get(s), sk_rmem_alloc_get(s), - s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0L); + s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : (u64)0); } return 0; diff --git a/net/socket.c b/net/socket.c index 136b98c54fb3..395c271afb1d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -373,7 +373,7 @@ static const struct super_operations sockfs_ops = { */ static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen) { - return dynamic_dname(buffer, buflen, "socket:[%lu]", + return dynamic_dname(buffer, buflen, "socket:[%llu]", d_inode(dentry)->i_ino); } diff --git a/net/x25/x25_proc.c b/net/x25/x25_proc.c index 0412814a2295..7e0dbff8f538 100644 --- a/net/x25/x25_proc.c +++ b/net/x25/x25_proc.c @@ -96,7 +96,7 @@ static int x25_seq_socket_show(struct seq_file *seq, void *v) devname = x25->neighbour->dev->name; seq_printf(seq, "%-10s %-10s %-5s %3.3X %d %d %d %d %3lu %3lu " - "%3lu %3lu %3lu %5d %5d %ld\n", + "%3lu %3lu %3lu %5d %5d %llu\n", !x25->dest_addr.x25_addr[0] ? "*" : x25->dest_addr.x25_addr, !x25->source_addr.x25_addr[0] ? "*" : x25->source_addr.x25_addr, devname, x25->lci & 0x0FFF, x25->state, x25->vs, x25->vr, @@ -104,7 +104,7 @@ static int x25_seq_socket_show(struct seq_file *seq, void *v) x25->t21 / HZ, x25->t22 / HZ, x25->t23 / HZ, sk_wmem_alloc_get(s), sk_rmem_alloc_get(s), - s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0L); + s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : (u64)0); out: return 0; } diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c index 2f84bd23edb6..7b645f40e71c 100644 --- a/security/apparmor/apparmorfs.c +++ b/security/apparmor/apparmorfs.c @@ -149,7 +149,7 @@ static int aafs_count; static int aafs_show_path(struct seq_file *seq, struct dentry *dentry) { - seq_printf(seq, "%s:[%lu]", AAFS_NAME, d_inode(dentry)->i_ino); + seq_printf(seq, "%s:[%llu]", AAFS_NAME, d_inode(dentry)->i_ino); return 0; } @@ -2644,7 +2644,7 @@ static int policy_readlink(struct dentry *dentry, char __user *buffer, char name[32]; int res; - res = snprintf(name, sizeof(name), "%s:[%lu]", AAFS_NAME, + res = snprintf(name, sizeof(name), "%s:[%llu]", AAFS_NAME, d_inode(dentry)->i_ino); if (res > 0 && res < sizeof(name)) res = readlink_copy(buffer, buflen, name, strlen(name)); diff --git a/security/integrity/integrity_audit.c b/security/integrity/integrity_audit.c index 0ec5e4c22cb2..d8d9e5ff1cd2 100644 --- a/security/integrity/integrity_audit.c +++ b/security/integrity/integrity_audit.c @@ -62,7 +62,7 @@ void integrity_audit_message(int audit_msgno, struct inode *inode, if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); + audit_log_format(ab, " ino=%llu", inode->i_ino); } audit_log_format(ab, " res=%d errno=%d", !result, errno); audit_log_end(ab); diff --git a/security/ipe/audit.c b/security/ipe/audit.c index 3f0deeb54912..93fb59fbddd6 100644 --- a/security/ipe/audit.c +++ b/security/ipe/audit.c @@ -153,7 +153,7 @@ void ipe_audit_match(const struct ipe_eval_ctx *const ctx, if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); + audit_log_format(ab, " ino=%llu", inode->i_ino); } else { audit_log_format(ab, " dev=? ino=?"); } diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 7d623b00495c..737f5a263a8f 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -202,7 +202,7 @@ void audit_log_lsm_data(struct audit_buffer *ab, if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); + audit_log_format(ab, " ino=%llu", inode->i_ino); } break; } @@ -215,7 +215,7 @@ void audit_log_lsm_data(struct audit_buffer *ab, if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); + audit_log_format(ab, " ino=%llu", inode->i_ino); } break; } @@ -228,7 +228,7 @@ void audit_log_lsm_data(struct audit_buffer *ab, if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); + audit_log_format(ab, " ino=%llu", inode->i_ino); } audit_log_format(ab, " ioctlcmd=0x%hx", a->u.op->cmd); @@ -246,7 +246,7 @@ void audit_log_lsm_data(struct audit_buffer *ab, if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); + audit_log_format(ab, " ino=%llu", inode->i_ino); } break; } @@ -265,7 +265,7 @@ void audit_log_lsm_data(struct audit_buffer *ab, } audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); - audit_log_format(ab, " ino=%lu", inode->i_ino); + audit_log_format(ab, " ino=%llu", inode->i_ino); rcu_read_unlock(); break; } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index d8224ea113d1..8f38de4d223e 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1400,7 +1400,7 @@ static int inode_doinit_use_xattr(struct inode *inode, struct dentry *dentry, if (rc < 0) { kfree(context); if (rc != -ENODATA) { - pr_warn("SELinux: %s: getxattr returned %d for dev=%s ino=%ld\n", + pr_warn("SELinux: %s: getxattr returned %d for dev=%s ino=%llu\n", __func__, -rc, inode->i_sb->s_id, inode->i_ino); return rc; } @@ -1412,13 +1412,13 @@ static int inode_doinit_use_xattr(struct inode *inode, struct dentry *dentry, def_sid, GFP_NOFS); if (rc) { char *dev = inode->i_sb->s_id; - unsigned long ino = inode->i_ino; + u64 ino = inode->i_ino; if (rc == -EINVAL) { - pr_notice_ratelimited("SELinux: inode=%lu on dev=%s was found to have an invalid context=%s. This indicates you may need to relabel the inode or the filesystem in question.\n", + pr_notice_ratelimited("SELinux: inode=%llu on dev=%s was found to have an invalid context=%s. This indicates you may need to relabel the inode or the filesystem in question.\n", ino, dev, context); } else { - pr_warn("SELinux: %s: context_to_sid(%s) returned %d for dev=%s ino=%ld\n", + pr_warn("SELinux: %s: context_to_sid(%s) returned %d for dev=%s ino=%llu\n", __func__, context, -rc, dev, ino); } } @@ -3477,7 +3477,7 @@ static void selinux_inode_post_setxattr(struct dentry *dentry, const char *name, &newsid); if (rc) { pr_err("SELinux: unable to map context to SID" - "for (%s, %lu), rc=%d\n", + "for (%s, %llu), rc=%d\n", inode->i_sb->s_id, inode->i_ino, -rc); return; } diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 98af9d7b9434..2eb3368a3632 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -182,7 +182,7 @@ static int smk_bu_inode(struct inode *inode, int mode, int rc) char acc[SMK_NUM_ACCESS_TYPE + 1]; if (isp->smk_flags & SMK_INODE_IMPURE) - pr_info("Smack Unconfined Corruption: inode=(%s %ld) %s\n", + pr_info("Smack Unconfined Corruption: inode=(%s %llu) %s\n", inode->i_sb->s_id, inode->i_ino, current->comm); if (rc <= 0) @@ -195,7 +195,7 @@ static int smk_bu_inode(struct inode *inode, int mode, int rc) smk_bu_mode(mode, acc); - pr_info("Smack %s: (%s %s %s) inode=(%s %ld) %s\n", smk_bu_mess[rc], + pr_info("Smack %s: (%s %s %s) inode=(%s %llu) %s\n", smk_bu_mess[rc], tsp->smk_task->smk_known, isp->smk_inode->smk_known, acc, inode->i_sb->s_id, inode->i_ino, current->comm); return 0; @@ -214,7 +214,7 @@ static int smk_bu_file(struct file *file, int mode, int rc) char acc[SMK_NUM_ACCESS_TYPE + 1]; if (isp->smk_flags & SMK_INODE_IMPURE) - pr_info("Smack Unconfined Corruption: inode=(%s %ld) %s\n", + pr_info("Smack Unconfined Corruption: inode=(%s %llu) %s\n", inode->i_sb->s_id, inode->i_ino, current->comm); if (rc <= 0) @@ -223,7 +223,7 @@ static int smk_bu_file(struct file *file, int mode, int rc) rc = 0; smk_bu_mode(mode, acc); - pr_info("Smack %s: (%s %s %s) file=(%s %ld %pD) %s\n", smk_bu_mess[rc], + pr_info("Smack %s: (%s %s %s) file=(%s %llu %pD) %s\n", smk_bu_mess[rc], sskp->smk_known, smk_of_inode(inode)->smk_known, acc, inode->i_sb->s_id, inode->i_ino, file, current->comm); @@ -244,7 +244,7 @@ static int smk_bu_credfile(const struct cred *cred, struct file *file, char acc[SMK_NUM_ACCESS_TYPE + 1]; if (isp->smk_flags & SMK_INODE_IMPURE) - pr_info("Smack Unconfined Corruption: inode=(%s %ld) %s\n", + pr_info("Smack Unconfined Corruption: inode=(%s %llu) %s\n", inode->i_sb->s_id, inode->i_ino, current->comm); if (rc <= 0) @@ -253,7 +253,7 @@ static int smk_bu_credfile(const struct cred *cred, struct file *file, rc = 0; smk_bu_mode(mode, acc); - pr_info("Smack %s: (%s %s %s) file=(%s %ld %pD) %s\n", smk_bu_mess[rc], + pr_info("Smack %s: (%s %s %s) file=(%s %llu %pD) %s\n", smk_bu_mess[rc], sskp->smk_known, smk_of_inode(inode)->smk_known, acc, inode->i_sb->s_id, inode->i_ino, file, current->comm); -- cgit v1.2.3 From ebeca1f930eac8f11f815d58eb38fa5d07e7c16e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2026 07:58:03 -1000 Subject: sched_ext: Introduce cgroup sub-sched support A system often runs multiple workloads especially in multi-tenant server environments where a system is split into partitions servicing separate more-or-less independent workloads each requiring an application-specific scheduler. To support such and other use cases, sched_ext is in the process of growing multiple scheduler support. When partitioning a system in terms of CPUs for such use cases, an oft-taken approach is hard partitioning the system using cpuset. While it would be possible to tie sched_ext multiple scheduler support to cpuset partitions, such an approach would have fundamental limitations stemming from the lack of dynamism and flexibility. Users often don't care which specific CPUs are assigned to which workload and want to take advantage of optimizations which are enabled by running workloads on a larger machine - e.g. opportunistic over-commit, improving latency critical workload characteristics while maintaining bandwidth fairness, employing control mechanisms based on different criteria than on-CPU time for e.g. flexible memory bandwidth isolation, packing similar parts from different workloads on same L3s to improve cache efficiency, and so on. As this sort of dynamic behaviors are impossible or difficult to implement with hard partitioning, sched_ext is implementing cgroup sub-sched support where schedulers can be attached to the cgroup hierarchy and a parent scheduler is responsible for controlling the CPUs that each child can use at any given moment. This makes CPU distribution dynamically controlled by BPF allowing high flexibility. This patch adds the skeletal sched_ext cgroup sub-sched support: - sched_ext_ops.sub_cgroup_id and .sub_attach/detach() are added. Non-zero sub_cgroup_id indicates that the scheduler is to be attached to the identified cgroup. A sub-sched is attached to the cgroup iff the nearest ancestor scheduler implements .sub_attach() and grants the attachment. Max nesting depth is limited by SCX_SUB_MAX_DEPTH. - When a scheduler exits, all its descendant schedulers are exited together. Also, cgroup.scx_sched added which points to the effective scheduler instance for the cgroup. This is updated on scheduler init/exit and inherited on cgroup online. When a cgroup is offlined, the attached scheduler is automatically exited. - Sub-sched support is gated on CONFIG_EXT_SUB_SCHED which is automatically enabled if both SCX and cgroups are enabled. Sub-sched support is not tied to the CPU controller but rather the cgroup hierarchy itself. This is intentional as the support for cpu.weight and cpu.max based resource control is orthogonal to sub-sched support. Note that CONFIG_CGROUPS around cgroup subtree iteration support for scx_task_iter is replaced with CONFIG_EXT_SUB_SCHED for consistency. - This allows loading sub-scheds and most framework operations such as propagating disable down the hierarchy work. However, sub-scheds are not operational yet and all tasks stay with the root sched. This will serve as the basis for building up full sub-sched support. - DSQs point to the scx_sched they belong to. - scx_qmap is updated to allow attachment of sub-scheds and also serving as sub-scheds. - scx_is_descendant() is added but not yet used in this patch. It is used by later changes in the series and placed here as this is where the function belongs. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/cgroup-defs.h | 4 + include/linux/sched/ext.h | 3 + init/Kconfig | 4 + kernel/sched/ext.c | 532 ++++++++++++++++++++++++++++++++++++++--- kernel/sched/ext_internal.h | 67 +++++- tools/sched_ext/scx_qmap.bpf.c | 9 +- tools/sched_ext/scx_qmap.c | 13 +- 7 files changed, 596 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index bb92f5c169ca..dd61767cf9bb 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -624,6 +625,9 @@ struct cgroup { #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *bpf_cgrp_storage; #endif +#ifdef CONFIG_EXT_SUB_SCHED + struct scx_sched __rcu *scx_sched; +#endif /* All ancestors including self */ union { diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 0150b3fe6230..fa4349b319e6 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -78,6 +78,7 @@ struct scx_dispatch_q { u64 id; struct rhash_head hash_node; struct llist_node free_node; + struct scx_sched *sched; struct rcu_head rcu; }; @@ -157,6 +158,8 @@ struct scx_dsq_list_node { .priv = (__priv), \ } +struct scx_sched; + /* * The following is embedded in task_struct and contains all fields necessary * for a task to be scheduled by SCX. diff --git a/init/Kconfig b/init/Kconfig index b55deae9256c..06abd8e272cb 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1176,6 +1176,10 @@ config EXT_GROUP_SCHED endif #CGROUP_SCHED +config EXT_SUB_SCHED + def_bool y + depends on SCHED_CLASS_EXT + config SCHED_MM_CID def_bool y depends on SMP && RSEQ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 142845bcddaa..bb3e33b660da 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,6 +9,8 @@ #include #include "ext_idle.h" +static DEFINE_RAW_SPINLOCK(scx_sched_lock); + /* * NOTE: sched_ext is in the process of growing multiple scheduler support and * scx_root usage is in a transitional state. Naked dereferences are safe if the @@ -19,6 +21,12 @@ */ static struct scx_sched __rcu *scx_root; +/* + * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock. + * Readers can hold either or rcu_read_lock(). + */ +static LIST_HEAD(scx_sched_all); + /* * During exit, a task may schedule after losing its PIDs. When disabling the * BPF scheduler, we need to be able to iterate tasks in every state to @@ -197,6 +205,7 @@ static void process_ddsp_deferred_locals(struct rq *rq); static bool task_dead_and_done(struct task_struct *p); static u32 reenq_local(struct rq *rq); static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); +static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind); static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, va_list args); @@ -245,6 +254,88 @@ static bool u32_before(u32 a, u32 b) return (s32)(a - b) < 0; } +#ifdef CONFIG_EXT_SUB_SCHED +/** + * scx_parent - Find the parent sched + * @sch: sched to find the parent of + * + * Returns the parent scheduler or %NULL if @sch is root. + */ +static struct scx_sched *scx_parent(struct scx_sched *sch) +{ + if (sch->level) + return sch->ancestors[sch->level - 1]; + else + return NULL; +} + +/** + * scx_next_descendant_pre - find the next descendant for pre-order walk + * @pos: the current position (%NULL to initiate traversal) + * @root: sched whose descendants to walk + * + * To be used by scx_for_each_descendant_pre(). Find the next descendant to + * visit for pre-order traversal of @root's descendants. @root is included in + * the iteration and the first node to be visited. + */ +static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, + struct scx_sched *root) +{ + struct scx_sched *next; + + lockdep_assert(lockdep_is_held(&scx_enable_mutex) || + lockdep_is_held(&scx_sched_lock)); + + /* if first iteration, visit @root */ + if (!pos) + return root; + + /* visit the first child if exists */ + next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling); + if (next) + return next; + + /* no child, visit my or the closest ancestor's next sibling */ + while (pos != root) { + if (!list_is_last(&pos->sibling, &scx_parent(pos)->children)) + return list_next_entry(pos, sibling); + pos = scx_parent(pos); + } + + return NULL; +} +#else /* CONFIG_EXT_SUB_SCHED */ +static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } +static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; } +#endif /* CONFIG_EXT_SUB_SCHED */ + +/** + * scx_is_descendant - Test whether sched is a descendant + * @sch: sched to test + * @ancestor: ancestor sched to test against + * + * Test whether @sch is a descendant of @ancestor. + */ +static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor) +{ + if (sch->level < ancestor->level) + return false; + return sch->ancestors[ancestor->level] == ancestor; +} + +/** + * scx_for_each_descendant_pre - pre-order walk of a sched's descendants + * @pos: iteration cursor + * @root: sched to walk the descendants of + * + * Walk @root's descendants. @root is included in the iteration and the first + * node to be visited. Must be called with either scx_enable_mutex or + * scx_sched_lock held. + */ +#define scx_for_each_descendant_pre(pos, root) \ + for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos); \ + (pos) = scx_next_descendant_pre((pos), (root))) + static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, struct task_struct *p) { @@ -514,7 +605,7 @@ struct scx_task_iter { struct rq_flags rf; u32 cnt; bool list_locked; -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_EXT_SUB_SCHED struct cgroup *cgrp; struct cgroup_subsys_state *css_pos; struct css_task_iter css_iter; @@ -553,7 +644,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp) { memset(iter, 0, sizeof(*iter)); -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_EXT_SUB_SCHED if (cgrp) { lockdep_assert_held(&cgroup_mutex); iter->cgrp = cgrp; @@ -614,7 +705,7 @@ static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) */ static void scx_task_iter_stop(struct scx_task_iter *iter) { -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_EXT_SUB_SCHED if (iter->cgrp) { if (iter->css_pos) css_task_iter_end(&iter->css_iter); @@ -645,7 +736,7 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) cond_resched(); } -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_EXT_SUB_SCHED if (iter->cgrp) { while (iter->css_pos) { struct task_struct *p; @@ -3032,7 +3123,10 @@ static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork scx_set_task_state(p, SCX_TASK_INIT); if (p->scx.disallow) { - if (unlikely(fork)) { + if (unlikely(scx_parent(sch))) { + scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]", + p->comm, p->pid); + } else if (unlikely(fork)) { scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", p->comm, p->pid); } else { @@ -3555,25 +3649,51 @@ void scx_group_set_bandwidth(struct task_group *tg, percpu_up_read(&scx_cgroup_ops_rwsem); } +#endif /* CONFIG_EXT_GROUP_SCHED */ + +#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) +static struct cgroup *root_cgroup(void) +{ + return &cgrp_dfl_root.cgrp; +} + +static struct cgroup *sch_cgroup(struct scx_sched *sch) +{ + return sch->cgrp; +} + +/* for each descendant of @cgrp including self, set ->scx_sched to @sch */ +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) +{ + struct cgroup *pos; + struct cgroup_subsys_state *css; + + cgroup_for_each_live_descendant_pre(pos, css, cgrp) + rcu_assign_pointer(pos->scx_sched, sch); +} static void scx_cgroup_lock(void) { +#ifdef CONFIG_EXT_GROUP_SCHED percpu_down_write(&scx_cgroup_ops_rwsem); +#endif cgroup_lock(); } static void scx_cgroup_unlock(void) { cgroup_unlock(); +#ifdef CONFIG_EXT_GROUP_SCHED percpu_up_write(&scx_cgroup_ops_rwsem); +#endif } - -#else /* CONFIG_EXT_GROUP_SCHED */ - +#else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ +static struct cgroup *root_cgroup(void) { return NULL; } +static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} static void scx_cgroup_lock(void) {} static void scx_cgroup_unlock(void) {} - -#endif /* CONFIG_EXT_GROUP_SCHED */ +#endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ /* * Omitted operations: @@ -3622,13 +3742,15 @@ DEFINE_SCHED_CLASS(ext) = { #endif }; -static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) +static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, + struct scx_sched *sch) { memset(dsq, 0, sizeof(*dsq)); raw_spin_lock_init(&dsq->lock); INIT_LIST_HEAD(&dsq->list); dsq->id = dsq_id; + dsq->sched = sch; } static void free_dsq_irq_workfn(struct irq_work *irq_work) @@ -3826,6 +3948,12 @@ static void scx_sched_free_rcu_work(struct work_struct *work) irq_work_sync(&sch->error_irq_work); kthread_destroy_worker(sch->helper); +#ifdef CONFIG_EXT_SUB_SCHED + kfree(sch->cgrp_path); + if (sch_cgroup(sch)) + cgroup_put(sch_cgroup(sch)); +#endif /* CONFIG_EXT_SUB_SCHED */ + free_percpu(sch->pcpu); for_each_node_state(node, N_POSSIBLE) @@ -4405,6 +4533,8 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) return "unregistered from the main kernel"; case SCX_EXIT_SYSRQ: return "disabled by sysrq-S"; + case SCX_EXIT_PARENT: + return "parent exiting"; case SCX_EXIT_ERROR: return "runtime error"; case SCX_EXIT_ERROR_BPF: @@ -4430,6 +4560,69 @@ static void free_kick_syncs(void) } } +#ifdef CONFIG_EXT_SUB_SCHED +static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq); + +static void drain_descendants(struct scx_sched *sch) +{ + /* + * Child scheds that finished the critical part of disabling will take + * themselves off @sch->children. Wait for it to drain. As propagation + * is recursive, empty @sch->children means that all proper descendant + * scheds reached unlinking stage. + */ + wait_event(scx_unlink_waitq, list_empty(&sch->children)); +} + +static void scx_sub_disable(struct scx_sched *sch) +{ + struct scx_sched *parent = scx_parent(sch); + + drain_descendants(sch); + + mutex_lock(&scx_enable_mutex); + percpu_down_write(&scx_fork_rwsem); + scx_cgroup_lock(); + + set_cgroup_sched(sch_cgroup(sch), parent); + + /* TODO - perform actual disabling here */ + + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); + + raw_spin_lock_irq(&scx_sched_lock); + list_del_init(&sch->sibling); + list_del_rcu(&sch->all); + raw_spin_unlock_irq(&scx_sched_lock); + + mutex_unlock(&scx_enable_mutex); + + /* + * @sch is now unlinked from the parent's children list. Notify and call + * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called + * after unlinking and releasing all locks. See scx_claim_exit(). + */ + wake_up_all(&scx_unlink_waitq); + + if (sch->ops.sub_detach && sch->sub_attached) { + struct scx_sub_detach_args sub_detach_args = { + .ops = &sch->ops, + .cgroup_path = sch->cgrp_path, + }; + SCX_CALL_OP(parent, SCX_KF_UNLOCKED, sub_detach, NULL, + &sub_detach_args); + } + + if (sch->ops.exit) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, sch->exit_info); + kobject_del(&sch->kobj); +} +#else /* CONFIG_EXT_SUB_SCHED */ +static void drain_descendants(struct scx_sched *sch) { } +static void scx_sub_disable(struct scx_sched *sch) { } +#endif /* CONFIG_EXT_SUB_SCHED */ + static void scx_root_disable(struct scx_sched *sch) { struct scx_exit_info *ei = sch->exit_info; @@ -4437,9 +4630,10 @@ static void scx_root_disable(struct scx_sched *sch) struct task_struct *p; int cpu; - /* guarantee forward progress by bypassing scx_ops */ + /* guarantee forward progress and wait for descendants to be disabled */ scx_bypass(true); WRITE_ONCE(scx_aborting, false); + drain_descendants(sch); switch (scx_set_enable_state(SCX_DISABLING)) { case SCX_DISABLING: @@ -4498,6 +4692,11 @@ static void scx_root_disable(struct scx_sched *sch) scx_exit_task(p); } scx_task_iter_stop(&sti); + + scx_cgroup_lock(); + set_cgroup_sched(sch_cgroup(sch), NULL); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); /* @@ -4534,6 +4733,10 @@ static void scx_root_disable(struct scx_sched *sch) cancel_delayed_work_sync(&scx_watchdog_work); + raw_spin_lock_irq(&scx_sched_lock); + list_del_rcu(&sch->all); + raw_spin_unlock_irq(&scx_sched_lock); + /* * scx_root clearing must be inside cpus_read_lock(). See * handle_hotplug(). @@ -4591,6 +4794,24 @@ static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) * successfully reach scx_bypass(). */ WRITE_ONCE(scx_aborting, true); + + /* + * Propagate exits to descendants immediately. Each has a dedicated + * helper kthread and can run in parallel. While most of disabling is + * serialized, running them in separate threads allows parallelizing + * ops.exit(), which can take arbitrarily long prolonging bypass mode. + * + * This doesn't cause recursions as propagation only takes place for + * non-propagation exits. + */ + if (kind != SCX_EXIT_PARENT) { + scoped_guard (raw_spinlock_irqsave, &scx_sched_lock) { + struct scx_sched *pos; + scx_for_each_descendant_pre(pos, sch) + scx_disable(pos, SCX_EXIT_PARENT); + } + } + return true; } @@ -4611,7 +4832,10 @@ static void scx_disable_workfn(struct kthread_work *work) ei->kind = kind; ei->reason = scx_exit_reason(ei->kind); - scx_root_disable(sch); + if (scx_parent(sch)) + scx_sub_disable(sch); + else + scx_root_disable(sch); } static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind) @@ -4987,12 +5211,15 @@ static int alloc_kick_syncs(void) return 0; } -static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) +static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, + struct cgroup *cgrp, + struct scx_sched *parent) { struct scx_sched *sch; + s32 level = parent ? parent->level + 1 : 0; int node, ret; - sch = kzalloc_obj(*sch); + sch = kzalloc_flex(*sch, ancestors, level); if (!sch) return ERR_PTR(-ENOMEM); @@ -5021,7 +5248,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) goto err_free_gdsqs; } - init_dsq(dsq, SCX_DSQ_GLOBAL); + init_dsq(dsq, SCX_DSQ_GLOBAL, sch); sch->global_dsqs[node] = dsq; } @@ -5039,6 +5266,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) sched_set_fifo(sch->helper->task); + if (parent) + memcpy(sch->ancestors, parent->ancestors, + level * sizeof(parent->ancestors[0])); + sch->ancestors[level] = sch; + sch->level = level; + atomic_set(&sch->exit_kind, SCX_EXIT_NONE); init_irq_work(&sch->error_irq_work, scx_error_irq_workfn); kthread_init_work(&sch->disable_work, scx_disable_workfn); @@ -5046,10 +5279,46 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) ops->priv = sch; sch->kobj.kset = scx_kset; + +#ifdef CONFIG_EXT_SUB_SCHED + char *buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + goto err_stop_helper; + cgroup_path(cgrp, buf, PATH_MAX); + sch->cgrp_path = kstrdup(buf, GFP_KERNEL); + kfree(buf); + if (!sch->cgrp_path) + goto err_stop_helper; + + sch->cgrp = cgrp; + INIT_LIST_HEAD(&sch->children); + INIT_LIST_HEAD(&sch->sibling); + + if (parent) + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, + &parent->sub_kset->kobj, + "sub-%llu", cgroup_id(cgrp)); + else + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); + + if (ret < 0) { + kfree(sch->cgrp_path); + goto err_stop_helper; + } + + if (ops->sub_attach) { + sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj); + if (!sch->sub_kset) { + kobject_put(&sch->kobj); + return ERR_PTR(-ENOMEM); + } + } + +#else /* CONFIG_EXT_SUB_SCHED */ ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); if (ret < 0) goto err_stop_helper; - +#endif /* CONFIG_EXT_SUB_SCHED */ return sch; err_stop_helper: @@ -5157,7 +5426,7 @@ static void scx_root_enable_workfn(struct kthread_work *work) if (ret) goto err_unlock; - sch = scx_alloc_and_add_sched(ops); + sch = scx_alloc_and_add_sched(ops, root_cgroup(), NULL); if (IS_ERR(sch)) { ret = PTR_ERR(sch); goto err_free_ksyncs; @@ -5174,8 +5443,13 @@ static void scx_root_enable_workfn(struct kthread_work *work) atomic_long_set(&scx_nr_rejected, 0); - for_each_possible_cpu(cpu) - cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + rq->scx.local_dsq.sched = sch; + rq->scx.bypass_dsq.sched = sch; + rq->scx.cpuperf_target = SCX_CPUPERF_ONE; + } /* * Keep CPUs stable during enable so that the BPF scheduler can track @@ -5189,6 +5463,10 @@ static void scx_root_enable_workfn(struct kthread_work *work) */ rcu_assign_pointer(scx_root, sch); + raw_spin_lock_irq(&scx_sched_lock); + list_add_tail_rcu(&sch->all, &scx_sched_all); + raw_spin_unlock_irq(&scx_sched_lock); + scx_idle_enable(ops); if (sch->ops.init) { @@ -5278,6 +5556,7 @@ static void scx_root_enable_workfn(struct kthread_work *work) * never sees uninitialized tasks. */ scx_cgroup_lock(); + set_cgroup_sched(sch_cgroup(sch), sch); ret = scx_cgroup_init(sch); if (ret) goto err_disable_unlock_all; @@ -5392,6 +5671,185 @@ err_disable: cmd->ret = 0; } +#ifdef CONFIG_EXT_SUB_SCHED +/* verify that a scheduler can be attached to @cgrp and return the parent */ +static struct scx_sched *find_parent_sched(struct cgroup *cgrp) +{ + struct scx_sched *parent = cgrp->scx_sched; + struct scx_sched *pos; + + lockdep_assert_held(&scx_sched_lock); + + /* can't attach twice to the same cgroup */ + if (parent->cgrp == cgrp) + return ERR_PTR(-EBUSY); + + /* does $parent allow sub-scheds? */ + if (!parent->ops.sub_attach) + return ERR_PTR(-EOPNOTSUPP); + + /* can't insert between $parent and its exiting children */ + list_for_each_entry(pos, &parent->children, sibling) + if (cgroup_is_descendant(pos->cgrp, cgrp)) + return ERR_PTR(-EBUSY); + + return parent; +} + +static void scx_sub_enable_workfn(struct kthread_work *work) +{ + struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); + struct sched_ext_ops *ops = cmd->ops; + struct cgroup *cgrp; + struct scx_sched *parent, *sch; + s32 ret; + + mutex_lock(&scx_enable_mutex); + + if (!scx_enabled()) { + ret = -ENODEV; + goto out_unlock; + } + + cgrp = cgroup_get_from_id(ops->sub_cgroup_id); + if (IS_ERR(cgrp)) { + ret = PTR_ERR(cgrp); + goto out_unlock; + } + + raw_spin_lock_irq(&scx_sched_lock); + parent = find_parent_sched(cgrp); + if (IS_ERR(parent)) { + raw_spin_unlock_irq(&scx_sched_lock); + ret = PTR_ERR(parent); + goto out_put_cgrp; + } + kobject_get(&parent->kobj); + raw_spin_unlock_irq(&scx_sched_lock); + + sch = scx_alloc_and_add_sched(ops, cgrp, parent); + kobject_put(&parent->kobj); + if (IS_ERR(sch)) { + ret = PTR_ERR(sch); + goto out_put_cgrp; + } + + raw_spin_lock_irq(&scx_sched_lock); + list_add_tail(&sch->sibling, &parent->children); + list_add_tail_rcu(&sch->all, &scx_sched_all); + raw_spin_unlock_irq(&scx_sched_lock); + + if (sch->level >= SCX_SUB_MAX_DEPTH) { + scx_error(sch, "max nesting depth %d violated", + SCX_SUB_MAX_DEPTH); + goto err_disable; + } + + if (sch->ops.init) { + ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL); + if (ret) { + ret = ops_sanitize_err(sch, "init", ret); + scx_error(sch, "ops.init() failed (%d)", ret); + goto err_disable; + } + sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; + } + + if (validate_ops(sch, ops)) + goto err_disable; + + struct scx_sub_attach_args sub_attach_args = { + .ops = &sch->ops, + .cgroup_path = sch->cgrp_path, + }; + + ret = SCX_CALL_OP_RET(parent, SCX_KF_UNLOCKED, sub_attach, NULL, + &sub_attach_args); + if (ret) { + ret = ops_sanitize_err(sch, "sub_attach", ret); + scx_error(sch, "parent rejected (%d)", ret); + goto err_disable; + } + sch->sub_attached = true; + + percpu_down_write(&scx_fork_rwsem); + scx_cgroup_lock(); + + /* + * Set cgroup->scx_sched's and check CSS_ONLINE. Either we see + * !CSS_ONLINE or scx_cgroup_lifetime_notify() sees and shoots us down. + */ + set_cgroup_sched(sch_cgroup(sch), sch); + if (!(cgrp->self.flags & CSS_ONLINE)) { + scx_error(sch, "cgroup is not online"); + goto err_unlock_and_disable; + } + + /* TODO - perform actual enabling here */ + + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); + + pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name); + kobject_uevent(&sch->kobj, KOBJ_ADD); + ret = 0; + goto out_unlock; + +out_put_cgrp: + cgroup_put(cgrp); +out_unlock: + mutex_unlock(&scx_enable_mutex); + cmd->ret = ret; + return; + +err_unlock_and_disable: + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); +err_disable: + mutex_unlock(&scx_enable_mutex); + kthread_flush_work(&sch->disable_work); + cmd->ret = 0; +} + +static s32 scx_cgroup_lifetime_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct cgroup *cgrp = data; + struct cgroup *parent = cgroup_parent(cgrp); + + if (!cgroup_on_dfl(cgrp)) + return NOTIFY_OK; + + switch (action) { + case CGROUP_LIFETIME_ONLINE: + /* inherit ->scx_sched from $parent */ + if (parent) + rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched); + break; + case CGROUP_LIFETIME_OFFLINE: + /* if there is a sched attached, shoot it down */ + if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp) + scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN, + SCX_ECODE_RSN_CGROUP_OFFLINE, + "cgroup %llu going offline", cgroup_id(cgrp)); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block scx_cgroup_lifetime_nb = { + .notifier_call = scx_cgroup_lifetime_notify, +}; + +static s32 __init scx_cgroup_lifetime_notifier_init(void) +{ + return blocking_notifier_chain_register(&cgroup_lifetime_notifier, + &scx_cgroup_lifetime_nb); +} +core_initcall(scx_cgroup_lifetime_notifier_init); +#endif /* CONFIG_EXT_SUB_SCHED */ + static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) { static struct kthread_worker *helper; @@ -5418,7 +5876,12 @@ static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) mutex_unlock(&helper_mutex); } - kthread_init_work(&cmd.work, scx_root_enable_workfn); +#ifdef CONFIG_EXT_SUB_SCHED + if (ops->sub_cgroup_id > 1) + kthread_init_work(&cmd.work, scx_sub_enable_workfn); + else +#endif /* CONFIG_EXT_SUB_SCHED */ + kthread_init_work(&cmd.work, scx_root_enable_workfn); cmd.ops = ops; kthread_queue_work(READ_ONCE(helper), &cmd.work); @@ -5520,6 +5983,11 @@ static int bpf_scx_init_member(const struct btf_type *t, case offsetof(struct sched_ext_ops, hotplug_seq): ops->hotplug_seq = *(u64 *)(udata + moff); return 1; +#ifdef CONFIG_EXT_SUB_SCHED + case offsetof(struct sched_ext_ops, sub_cgroup_id): + ops->sub_cgroup_id = *(u64 *)(udata + moff); + return 1; +#endif /* CONFIG_EXT_SUB_SCHED */ } return 0; @@ -5542,6 +6010,8 @@ static int bpf_scx_check_member(const struct btf_type *t, case offsetof(struct sched_ext_ops, cpu_offline): case offsetof(struct sched_ext_ops, init): case offsetof(struct sched_ext_ops, exit): + case offsetof(struct sched_ext_ops, sub_attach): + case offsetof(struct sched_ext_ops, sub_detach): break; default: if (prog->sleepable) @@ -5619,7 +6089,9 @@ static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgro static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {} -#endif +#endif /* CONFIG_EXT_GROUP_SCHED */ +static s32 sched_ext_ops__sub_attach(struct scx_sub_attach_args *args) { return -EINVAL; } +static void sched_ext_ops__sub_detach(struct scx_sub_detach_args *args) {} static void sched_ext_ops__cpu_online(s32 cpu) {} static void sched_ext_ops__cpu_offline(s32 cpu) {} static s32 sched_ext_ops__init(void) { return -EINVAL; } @@ -5659,6 +6131,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, #endif + .sub_attach = sched_ext_ops__sub_attach, + .sub_detach = sched_ext_ops__sub_detach, .cpu_online = sched_ext_ops__cpu_online, .cpu_offline = sched_ext_ops__cpu_offline, .init = sched_ext_ops__init, @@ -5941,8 +6415,10 @@ void __init init_sched_ext_class(void) struct rq *rq = cpu_rq(cpu); int n = cpu_to_node(cpu); - init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); - init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS); + /* local/bypass dsq's sch will be set during scx_root_enable() */ + init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL); + init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS, NULL); + INIT_LIST_HEAD(&rq->scx.runnable_list); INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); @@ -6598,16 +7074,16 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) if (!dsq) return -ENOMEM; - init_dsq(dsq, dsq_id); - rcu_read_lock(); sch = rcu_dereference(scx_root); - if (sch) + if (sch) { + init_dsq(dsq, dsq_id, sch); ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, dsq_hash_params); - else + } else { ret = -ENODEV; + } rcu_read_unlock(); if (ret) diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index 417d3c6f02fe..75b7f57e20ab 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -28,6 +28,8 @@ enum scx_consts { SCX_BYPASS_LB_DONOR_PCT = 125, SCX_BYPASS_LB_MIN_DELTA_DIV = 4, SCX_BYPASS_LB_BATCH = 256, + + SCX_SUB_MAX_DEPTH = 4, }; enum scx_exit_kind { @@ -38,6 +40,7 @@ enum scx_exit_kind { SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ + SCX_EXIT_PARENT, /* parent exiting */ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ @@ -62,6 +65,7 @@ enum scx_exit_kind { enum scx_exit_code { /* Reasons */ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + SCX_ECODE_RSN_CGROUP_OFFLINE = 2LLU << 32, /* Actions */ SCX_ECODE_ACT_RESTART = 1LLU << 48, @@ -213,7 +217,7 @@ struct scx_exit_task_args { bool cancelled; }; -/* argument container for ops->cgroup_init() */ +/* argument container for ops.cgroup_init() */ struct scx_cgroup_init_args { /* the weight of the cgroup [1..10000] */ u32 weight; @@ -236,12 +240,12 @@ enum scx_cpu_preempt_reason { }; /* - * Argument container for ops->cpu_acquire(). Currently empty, but may be + * Argument container for ops.cpu_acquire(). Currently empty, but may be * expanded in the future. */ struct scx_cpu_acquire_args {}; -/* argument container for ops->cpu_release() */ +/* argument container for ops.cpu_release() */ struct scx_cpu_release_args { /* the reason the CPU was preempted */ enum scx_cpu_preempt_reason reason; @@ -250,9 +254,7 @@ struct scx_cpu_release_args { struct task_struct *task; }; -/* - * Informational context provided to dump operations. - */ +/* informational context provided to dump operations */ struct scx_dump_ctx { enum scx_exit_kind kind; s64 exit_code; @@ -261,6 +263,18 @@ struct scx_dump_ctx { u64 at_jiffies; }; +/* argument container for ops.sub_attach() */ +struct scx_sub_attach_args { + struct sched_ext_ops *ops; + char *cgroup_path; +}; + +/* argument container for ops.sub_detach() */ +struct scx_sub_detach_args { + struct sched_ext_ops *ops; + char *cgroup_path; +}; + /** * struct sched_ext_ops - Operation table for BPF scheduler implementation * @@ -721,6 +735,20 @@ struct sched_ext_ops { #endif /* CONFIG_EXT_GROUP_SCHED */ + /** + * @sub_attach: Attach a sub-scheduler + * @args: argument container, see the struct definition + * + * Return 0 to accept the sub-scheduler. -errno to reject. + */ + s32 (*sub_attach)(struct scx_sub_attach_args *args); + + /** + * @sub_detach: Detach a sub-scheduler + * @args: argument container, see the struct definition + */ + void (*sub_detach)(struct scx_sub_detach_args *args); + /* * All online ops must come before ops.cpu_online(). */ @@ -762,6 +790,10 @@ struct sched_ext_ops { */ void (*exit)(struct scx_exit_info *info); + /* + * Data fields must comes after all ops fields. + */ + /** * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch */ @@ -796,6 +828,12 @@ struct sched_ext_ops { */ u64 hotplug_seq; + /** + * @cgroup_id: When >1, attach the scheduler as a sub-scheduler on the + * specified cgroup. + */ + u64 sub_cgroup_id; + /** * @name: BPF scheduler's name * @@ -900,6 +938,8 @@ struct scx_sched { struct scx_dispatch_q **global_dsqs; struct scx_sched_pcpu __percpu *pcpu; + s32 level; + /* * Updates to the following warned bitfields can race causing RMW issues * but it doesn't really matter. @@ -907,6 +947,18 @@ struct scx_sched { bool warned_zero_slice:1; bool warned_deprecated_rq:1; + struct list_head all; + +#ifdef CONFIG_EXT_SUB_SCHED + struct list_head children; + struct list_head sibling; + struct cgroup *cgrp; + char *cgrp_path; + struct kset *sub_kset; + + bool sub_attached; +#endif /* CONFIG_EXT_SUB_SCHED */ + atomic_t exit_kind; struct scx_exit_info *exit_info; @@ -916,6 +968,9 @@ struct scx_sched { struct irq_work error_irq_work; struct kthread_work disable_work; struct rcu_work rcu_work; + + /* all ancestors including self */ + struct scx_sched *ancestors[]; }; enum scx_wake_flags { diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index d51d8c38f1cf..ff6ff34177ab 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -41,6 +41,7 @@ const volatile u32 dsp_batch; const volatile bool highpri_boosting; const volatile bool print_dsqs_and_events; const volatile bool print_msgs; +const volatile u64 sub_cgroup_id; const volatile s32 disallow_tgid; const volatile bool suppress_dump; @@ -862,7 +863,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) struct bpf_timer *timer; s32 ret; - if (print_msgs) + if (print_msgs && !sub_cgroup_id) print_cpus(); ret = scx_bpf_create_dsq(SHARED_DSQ, -1); @@ -892,6 +893,11 @@ void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) UEI_RECORD(uei, ei); } +s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args) +{ + return 0; +} + SCX_OPS_DEFINE(qmap_ops, .select_cpu = (void *)qmap_select_cpu, .enqueue = (void *)qmap_enqueue, @@ -907,6 +913,7 @@ SCX_OPS_DEFINE(qmap_ops, .cgroup_init = (void *)qmap_cgroup_init, .cgroup_set_weight = (void *)qmap_cgroup_set_weight, .cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth, + .sub_attach = (void *)qmap_sub_attach, .cpu_online = (void *)qmap_cpu_online, .cpu_offline = (void *)qmap_cpu_offline, .init = (void *)qmap_init, diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index ef701d45ba43..5d762d10f4db 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include "scx_qmap.bpf.skel.h" @@ -67,7 +68,7 @@ int main(int argc, char **argv) skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); - while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHd:D:Spvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHc:d:D:Spvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -96,6 +97,16 @@ int main(int argc, char **argv) case 'H': skel->rodata->highpri_boosting = true; break; + case 'c': { + struct stat st; + if (stat(optarg, &st) < 0) { + perror("stat"); + return 1; + } + skel->struct_ops.qmap_ops->sub_cgroup_id = st.st_ino; + skel->rodata->sub_cgroup_id = st.st_ino; + break; + } case 'd': skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); if (skel->rodata->disallow_tgid < 0) -- cgit v1.2.3 From 88234b075c3fc23d57406e1867523b6aba783ebf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2026 07:58:03 -1000 Subject: sched_ext: Introduce scx_task_sched[_rcu]() In preparation of multiple scheduler support, add p->scx.sched which points to the scx_sched instance that the task is scheduled by, which is currently always scx_root. Add scx_task_sched[_rcu]() accessors which return the associated scx_sched of the specified task and replace the raw scx_root dereferences with it where applicable. scx_task_on_sched() is also added to test whether a given task is on the specified sched. As scx_root is still the only scheduler, this shouldn't introduce user-visible behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 7 +++++ kernel/sched/ext.c | 63 ++++++++++++++++++++++++++++----------------- kernel/sched/ext_internal.h | 59 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index fa4349b319e6..3213e31c7979 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -165,6 +165,13 @@ struct scx_sched; * for a task to be scheduled by SCX. */ struct sched_ext_entity { +#ifdef CONFIG_CGROUPS + /* + * Associated scx_sched. Updated either during fork or while holding + * both p->pi_lock and rq lock. + */ + struct scx_sched __rcu *sched; +#endif struct scx_dispatch_q *dsq; atomic_long_t ops_state; u64 ddsp_dsq_id; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index bb3e33b660da..d56539449f26 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -19,7 +19,7 @@ static DEFINE_RAW_SPINLOCK(scx_sched_lock); * are used as temporary markers to indicate that the dereferences need to be * updated to point to the associated scheduler instances rather than scx_root. */ -static struct scx_sched __rcu *scx_root; +struct scx_sched __rcu *scx_root; /* * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock. @@ -304,9 +304,15 @@ static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, return NULL; } + +static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) +{ + rcu_assign_pointer(p->scx.sched, sch); +} #else /* CONFIG_EXT_SUB_SCHED */ static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; } +static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {} #endif /* CONFIG_EXT_SUB_SCHED */ /** @@ -1542,7 +1548,7 @@ static bool scx_rq_online(struct rq *rq) static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, int sticky_cpu) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); struct task_struct **ddsp_taskp; struct scx_dispatch_q *dsq; unsigned long qseq; @@ -1672,7 +1678,7 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); int sticky_cpu = p->scx.sticky_cpu; if (enq_flags & ENQUEUE_WAKEUP) @@ -1723,7 +1729,7 @@ out: static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); unsigned long opss; u64 op_deq_flags = deq_flags; @@ -1794,7 +1800,7 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); if (!(p->scx.flags & SCX_TASK_QUEUED)) { WARN_ON_ONCE(task_runnable(p)); @@ -1838,8 +1844,8 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags static void yield_task_scx(struct rq *rq) { - struct scx_sched *sch = scx_root; struct task_struct *p = rq->donor; + struct scx_sched *sch = scx_task_sched(p); if (SCX_HAS_OP(sch, yield)) SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL); @@ -1849,10 +1855,10 @@ static void yield_task_scx(struct rq *rq) static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) { - struct scx_sched *sch = scx_root; struct task_struct *from = rq->donor; + struct scx_sched *sch = scx_task_sched(from); - if (SCX_HAS_OP(sch, yield)) + if (SCX_HAS_OP(sch, yield) && sch == scx_task_sched(to)) return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, from, to); else @@ -2517,7 +2523,7 @@ static void process_ddsp_deferred_locals(struct rq *rq) */ while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, struct task_struct, scx.dsq_list.node))) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); struct scx_dispatch_q *dsq; list_del_init(&p->scx.dsq_list.node); @@ -2531,7 +2537,7 @@ static void process_ddsp_deferred_locals(struct rq *rq) static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); if (p->scx.flags & SCX_TASK_QUEUED) { /* @@ -2628,7 +2634,7 @@ static void switch_class(struct rq *rq, struct task_struct *next) static void put_prev_task_scx(struct rq *rq, struct task_struct *p, struct task_struct *next) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); /* see kick_cpus_irq_workfn() */ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); @@ -2722,14 +2728,14 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) if (keep_prev) { p = prev; if (!p->scx.slice) - refill_task_slice_dfl(rcu_dereference_sched(scx_root), p); + refill_task_slice_dfl(scx_task_sched(p), p); } else { p = first_local_task(rq); if (!p) return NULL; if (unlikely(!p->scx.slice)) { - struct scx_sched *sch = rcu_dereference_sched(scx_root); + struct scx_sched *sch = scx_task_sched(p); if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) { printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", @@ -2817,7 +2823,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); bool rq_bypass; /* @@ -2878,7 +2884,7 @@ static void task_woken_scx(struct rq *rq, struct task_struct *p) static void set_cpus_allowed_scx(struct task_struct *p, struct affinity_context *ac) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); set_cpus_allowed_common(p, ac); @@ -3022,7 +3028,7 @@ void scx_tick(struct rq *rq) static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(curr); update_curr_scx(rq); @@ -3212,11 +3218,12 @@ static void scx_disable_task(struct task_struct *p) static void scx_exit_task(struct task_struct *p) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); struct scx_exit_task_args args = { .cancelled = false, }; + lockdep_assert_held(&p->pi_lock); lockdep_assert_rq_held(task_rq(p)); switch (scx_get_task_state(p)) { @@ -3238,6 +3245,7 @@ static void scx_exit_task(struct task_struct *p) if (SCX_HAS_OP(sch, exit_task)) SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p), p, &args); + scx_set_task_sched(p, NULL); scx_set_task_state(p, SCX_TASK_NONE); } @@ -3267,12 +3275,18 @@ void scx_pre_fork(struct task_struct *p) int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) { + s32 ret; + percpu_rwsem_assert_held(&scx_fork_rwsem); - if (scx_init_task_enabled) - return scx_init_task(p, task_group(p), true); - else - return 0; + if (scx_init_task_enabled) { + ret = scx_init_task(p, task_group(p), true); + if (!ret) + scx_set_task_sched(p, scx_root); + return ret; + } + + return 0; } void scx_post_fork(struct task_struct *p) @@ -3377,7 +3391,7 @@ void sched_ext_dead(struct task_struct *p) static void reweight_task_scx(struct rq *rq, struct task_struct *p, const struct load_weight *lw) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); lockdep_assert_rq_held(task_rq(p)); @@ -3396,7 +3410,7 @@ static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) static void switching_to_scx(struct rq *rq, struct task_struct *p) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); if (task_dead_and_done(p)) return; @@ -4062,7 +4076,7 @@ bool scx_allow_ttwu_queue(const struct task_struct *p) if (!scx_enabled()) return true; - sch = rcu_dereference_sched(scx_root); + sch = scx_task_sched(p); if (unlikely(!sch)) return true; @@ -5582,6 +5596,7 @@ static void scx_root_enable_workfn(struct kthread_work *work) goto err_disable_unlock_all; } + scx_set_task_sched(p, sch); scx_set_task_state(p, SCX_TASK_READY); put_task_struct(p); diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index 75b7f57e20ab..0612006019da 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -1223,6 +1223,7 @@ enum scx_ops_state { #define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) #define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) +extern struct scx_sched __rcu *scx_root; DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); /* @@ -1243,3 +1244,61 @@ static inline bool scx_rq_bypassing(struct rq *rq) { return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); } + +#ifdef CONFIG_EXT_SUB_SCHED +/** + * scx_task_sched - Find scx_sched scheduling a task + * @p: task of interest + * + * Return @p's scheduler instance. Must be called with @p's pi_lock or rq lock + * held. + */ +static inline struct scx_sched *scx_task_sched(const struct task_struct *p) +{ + return rcu_dereference_protected(p->scx.sched, + lockdep_is_held(&p->pi_lock) || + lockdep_is_held(__rq_lockp(task_rq(p)))); +} + +/** + * scx_task_sched_rcu - Find scx_sched scheduling a task + * @p: task of interest + * + * Return @p's scheduler instance. The returned scx_sched is RCU protected. + */ +static inline struct scx_sched *scx_task_sched_rcu(const struct task_struct *p) +{ + return rcu_dereference_all(p->scx.sched); +} + +/** + * scx_task_on_sched - Is a task on the specified sched? + * @sch: sched to test against + * @p: task of interest + * + * Returns %true if @p is on @sch, %false otherwise. + */ +static inline bool scx_task_on_sched(struct scx_sched *sch, + const struct task_struct *p) +{ + return rcu_access_pointer(p->scx.sched) == sch; +} +#else /* CONFIG_EXT_SUB_SCHED */ +static inline struct scx_sched *scx_task_sched(const struct task_struct *p) +{ + return rcu_dereference_protected(scx_root, + lockdep_is_held(&p->pi_lock) || + lockdep_is_held(__rq_lockp(task_rq(p)))); +} + +static inline struct scx_sched *scx_task_sched_rcu(const struct task_struct *p) +{ + return rcu_dereference_all(scx_root); +} + +static inline bool scx_task_on_sched(struct scx_sched *sch, + const struct task_struct *p) +{ + return true; +} +#endif /* CONFIG_EXT_SUB_SCHED */ -- cgit v1.2.3 From 337ec00b1d9c676f637651c2cefddb8612b867ee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2026 07:58:04 -1000 Subject: sched_ext: Implement cgroup sub-sched enabling and disabling The preceding changes implemented the framework to support cgroup sub-scheds and updated scheduling paths and kfuncs so that they have minimal but working support for sub-scheds. However, actual sub-sched enabling/disabling hasn't been implemented yet and all tasks stayed on scx_root. Implement cgroup sub-sched enabling and disabling to actually activate sub-scheds: - Both enable and disable operations bypass only the tasks in the subtree of the child being enabled or disabled to limit disruptions. - When enabling, all candidate tasks are first initialized for the child sched. Once that succeeds, the tasks are exited for the parent and then switched over to the child. This adds a bit of complication but guarantees that child scheduler failures are always contained. - Disabling works the same way in the other direction. However, when the parent may fail to initialize a task, disabling is propagated up to the parent. While this means that a parent sched fail due to a child sched event, the failure can only originate from the parent itself (its ops.init_task()). The only effect a malfunctioning child can have on the parent is attempting to move the tasks back to the parent. After this change, although not all the necessary mechanisms are in place yet, sub-scheds can take control of their tasks and schedule them. v2: Fix missing scx_cgroup_unlock()/percpu_up_write() in abort path (Cheng-Yang Chou). Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 1 + kernel/sched/ext.c | 285 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 280 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 3213e31c7979..f354d7d34306 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -88,6 +88,7 @@ enum scx_ent_flags { SCX_TASK_IN_CUSTODY = 1 << 1, /* in custody, needs ops.dequeue() when leaving */ SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ + SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ SCX_TASK_STATE_BITS = 2, diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 3f237b9da970..70d0f9e8ef61 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -51,6 +51,17 @@ DEFINE_STATIC_KEY_FALSE(__scx_switched_all); static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); +#ifdef CONFIG_EXT_SUB_SCHED +/* + * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit + * tasks for the sub-sched being enabled. Use a global variable instead of a + * per-task field as all enables are serialized. + */ +static struct scx_sched *scx_enabling_sub_sched; +#else +#define scx_enabling_sub_sched (struct scx_sched *)NULL +#endif /* CONFIG_EXT_SUB_SCHED */ + /* * A monotically increasing sequence number that is incremented every time a * scheduler is enabled. This can be used by to check if any custom sched_ext @@ -3342,6 +3353,17 @@ static void scx_disable_and_exit_task(struct scx_sched *sch, { __scx_disable_and_exit_task(sch, p); + /* + * If set, @p exited between __scx_init_task() and scx_enable_task() in + * scx_sub_enable() and is initialized for both the associated sched and + * its parent. Disable and exit for the child too. + */ + if ((p->scx.flags & SCX_TASK_SUB_INIT) && + !WARN_ON_ONCE(!scx_enabling_sub_sched)) { + __scx_disable_and_exit_task(scx_enabling_sub_sched, p); + p->scx.flags &= ~SCX_TASK_SUB_INIT; + } + scx_set_task_sched(p, NULL); scx_set_task_state(p, SCX_TASK_NONE); } @@ -3377,9 +3399,14 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) percpu_rwsem_assert_held(&scx_fork_rwsem); if (scx_init_task_enabled) { - ret = scx_init_task(scx_root, p, true); +#ifdef CONFIG_EXT_SUB_SCHED + struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched; +#else + struct scx_sched *sch = scx_root; +#endif + ret = scx_init_task(sch, p, true); if (!ret) - scx_set_task_sched(p, scx_root); + scx_set_task_sched(p, sch); return ret; } @@ -4643,9 +4670,9 @@ static void scx_bypass(struct scx_sched *sch, bool bypass) struct rq *rq = cpu_rq(cpu); struct task_struct *p, *n; + raw_spin_lock(&scx_sched_lock); raw_spin_rq_lock(rq); - raw_spin_lock(&scx_sched_lock); scx_for_each_descendant_pre(pos, sch) { struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu); @@ -4654,6 +4681,7 @@ static void scx_bypass(struct scx_sched *sch, bool bypass) else pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING; } + raw_spin_unlock(&scx_sched_lock); /* @@ -4798,23 +4826,139 @@ static void drain_descendants(struct scx_sched *sch) wait_event(scx_unlink_waitq, list_empty(&sch->children)); } +static void scx_fail_parent(struct scx_sched *sch, + struct task_struct *failed, s32 fail_code) +{ + struct scx_sched *parent = scx_parent(sch); + struct scx_task_iter sti; + struct task_struct *p; + + scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler", + fail_code, failed->comm, failed->pid); + + /* + * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into + * it. This may cause downstream failures on the BPF side but $parent is + * dying anyway. + */ + scx_bypass(parent, true); + + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + if (scx_task_on_sched(parent, p)) + continue; + + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + scx_disable_and_exit_task(sch, p); + rcu_assign_pointer(p->scx.sched, parent); + } + } + scx_task_iter_stop(&sti); +} + static void scx_sub_disable(struct scx_sched *sch) { struct scx_sched *parent = scx_parent(sch); + struct scx_task_iter sti; + struct task_struct *p; + int ret; + /* + * Guarantee forward progress and wait for descendants to be disabled. + * To limit + * disruptions, $parent is not bypassed. Tasks are fully prepped and + * then inserted back into $parent. + */ + scx_bypass(sch, true); drain_descendants(sch); + /* + * Here, every runnable task is guaranteed to make forward progress and + * we can safely use blocking synchronization constructs. Actually + * disable ops. + */ mutex_lock(&scx_enable_mutex); percpu_down_write(&scx_fork_rwsem); scx_cgroup_lock(); set_cgroup_sched(sch_cgroup(sch), parent); - /* TODO - perform actual disabling here */ + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + struct rq *rq; + struct rq_flags rf; + + /* filter out duplicate visits */ + if (scx_task_on_sched(parent, p)) + continue; + + /* + * By the time control reaches here, all descendant schedulers + * should already have been disabled. + */ + WARN_ON_ONCE(!scx_task_on_sched(sch, p)); + + /* + * If $p is about to be freed, nothing prevents $sch from + * unloading before $p reaches sched_ext_free(). Disable and + * exit $p right away. + */ + if (!tryget_task_struct(p)) { + scx_disable_and_exit_task(sch, p); + continue; + } + + scx_task_iter_unlock(&sti); + + /* + * $p is READY or ENABLED on @sch. Initialize for $parent, + * disable and exit from @sch, and then switch over to $parent. + * + * If a task fails to initialize for $parent, the only available + * action is disabling $parent too. While this allows disabling + * of a child sched to cause the parent scheduler to fail, the + * failure can only originate from ops.init_task() of the + * parent. A child can't directly affect the parent through its + * own failures. + */ + ret = __scx_init_task(parent, p, false); + if (ret) { + scx_fail_parent(sch, p, ret); + put_task_struct(p); + break; + } + + rq = task_rq_lock(p, &rf); + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + /* + * $p is initialized for $parent and still attached to + * @sch. Disable and exit for @sch, switch over to + * $parent, override the state to READY to account for + * $p having already been initialized, and then enable. + */ + scx_disable_and_exit_task(sch, p); + scx_set_task_state(p, SCX_TASK_INIT); + rcu_assign_pointer(p->scx.sched, parent); + scx_set_task_state(p, SCX_TASK_READY); + scx_enable_task(parent, p); + } + task_rq_unlock(rq, p, &rf); + + put_task_struct(p); + } + scx_task_iter_stop(&sti); scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); + /* + * All tasks are moved off of @sch but there may still be on-going + * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use + * the expedited version as ancestors may be waiting in bypass mode. + * Also, tell the parent that there is no need to keep running bypass + * DSQs for us. + */ + synchronize_rcu_expedited(); disable_bypass_dsp(sch); raw_spin_lock_irq(&scx_sched_lock); @@ -5933,13 +6077,30 @@ static struct scx_sched *find_parent_sched(struct cgroup *cgrp) return parent; } +static bool assert_task_ready_or_enabled(struct task_struct *p) +{ + enum scx_task_state state = scx_get_task_state(p); + + switch (state) { + case SCX_TASK_READY: + case SCX_TASK_ENABLED: + return true; + default: + WARN_ONCE(true, "sched_ext: Invalid task state %d for %s[%d] during enabling sub sched", + state, p->comm, p->pid); + return false; + } +} + static void scx_sub_enable_workfn(struct kthread_work *work) { struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); struct sched_ext_ops *ops = cmd->ops; struct cgroup *cgrp; struct scx_sched *parent, *sch; - s32 ret; + struct scx_task_iter sti; + struct task_struct *p; + s32 i, ret; mutex_lock(&scx_enable_mutex); @@ -6011,6 +6172,12 @@ static void scx_sub_enable_workfn(struct kthread_work *work) } sch->sub_attached = true; + scx_bypass(sch, true); + + for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) + if (((void (**)(void))ops)[i]) + set_bit(i, sch->has_op); + percpu_down_write(&scx_fork_rwsem); scx_cgroup_lock(); @@ -6024,16 +6191,121 @@ static void scx_sub_enable_workfn(struct kthread_work *work) goto err_unlock_and_disable; } - /* TODO - perform actual enabling here */ + /* + * Initialize tasks for the new child $sch without exiting them for + * $parent so that the tasks can always be reverted back to $parent + * sched on child init failure. + */ + WARN_ON_ONCE(scx_enabling_sub_sched); + scx_enabling_sub_sched = sch; + + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + struct rq *rq; + struct rq_flags rf; + + /* + * Task iteration may visit the same task twice when racing + * against exiting. Use %SCX_TASK_SUB_INIT to mark tasks which + * finished __scx_init_task() and skip if set. + * + * A task may exit and get freed between __scx_init_task() + * completion and scx_enable_task(). In such cases, + * scx_disable_and_exit_task() must exit the task for both the + * parent and child scheds. + */ + if (p->scx.flags & SCX_TASK_SUB_INIT) + continue; + + /* see scx_root_enable() */ + if (!tryget_task_struct(p)) + continue; + + if (!assert_task_ready_or_enabled(p)) { + ret = -EINVAL; + goto abort; + } + + scx_task_iter_unlock(&sti); + + /* + * As $p is still on $parent, it can't be transitioned to INIT. + * Let's worry about task state later. Use __scx_init_task(). + */ + ret = __scx_init_task(sch, p, false); + if (ret) + goto abort; + + rq = task_rq_lock(p, &rf); + p->scx.flags |= SCX_TASK_SUB_INIT; + task_rq_unlock(rq, p, &rf); + + put_task_struct(p); + } + scx_task_iter_stop(&sti); + + /* + * All tasks are prepped. Disable/exit tasks for $parent and enable for + * the new @sch. + */ + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + /* + * Use clearing of %SCX_TASK_SUB_INIT to detect and skip + * duplicate iterations. + */ + if (!(p->scx.flags & SCX_TASK_SUB_INIT)) + continue; + + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + /* + * $p must be either READY or ENABLED. If ENABLED, + * __scx_disabled_and_exit_task() first disables and + * makes it READY. However, after exiting $p, it will + * leave $p as READY. + */ + assert_task_ready_or_enabled(p); + __scx_disable_and_exit_task(parent, p); + + /* + * $p is now only initialized for @sch and READY, which + * is what we want. Assign it to @sch and enable. + */ + rcu_assign_pointer(p->scx.sched, sch); + scx_enable_task(sch, p); + + p->scx.flags &= ~SCX_TASK_SUB_INIT; + } + } + scx_task_iter_stop(&sti); + + scx_enabling_sub_sched = NULL; scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); + scx_bypass(sch, false); + pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name); kobject_uevent(&sch->kobj, KOBJ_ADD); ret = 0; goto out_unlock; +abort: + put_task_struct(p); + scx_task_iter_stop(&sti); + scx_enabling_sub_sched = NULL; + + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + if (p->scx.flags & SCX_TASK_SUB_INIT) { + __scx_disable_and_exit_task(sch, p); + p->scx.flags &= ~SCX_TASK_SUB_INIT; + } + } + scx_task_iter_stop(&sti); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); out_put_cgrp: cgroup_put(cgrp); out_unlock: @@ -6042,6 +6314,7 @@ out_unlock: return; err_unlock_and_disable: + /* we'll soon enter disable path, keep bypass on */ scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); err_disable: -- cgit v1.2.3 From 3cd963fa915c494a6d0da0287bd10cb6f2204f9e Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 5 Mar 2026 10:42:57 +0000 Subject: net: stmmac: mdio_bus_data->default_an_inband is boolean default_an_inband is declared as an unsigned int, but is set to true/ false and is assigned to phylink_config's member of the same name which is a bool. Declare this also as a bool for consistency. Signed-off-by: Russell King (Oracle) Reviewed-by: Maxime Chevallier Tested-by: Maxime Chevallier Link: https://patch.msgid.link/E1vy6AT-0000000BtxD-2qm7@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 2fc169c7117e..678d03d6d3bd 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -86,10 +86,10 @@ struct stmmac_priv; struct stmmac_mdio_bus_data { unsigned int phy_mask; unsigned int pcs_mask; - unsigned int default_an_inband; int *irqs; int probed_phy_irq; bool needs_reset; + bool default_an_inband; }; struct stmmac_dma_cfg { -- cgit v1.2.3 From e4fd855c52ec5af34d920206190be29919fadca3 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 5 Mar 2026 10:43:02 +0000 Subject: net: stmmac: make pcs_mask and phy_mask u32 The PCS and PHY masks are passed to the mdio bus layer as phy_mask to prevent bus addresses between 0 and 31 inclusive being scanned, and this is declared as u32. Also declare these as u32 in stmmac for type consistency. Since this is a u32, use BIT_U32() rather than BIT() to generate values for these fields. Signed-off-by: Russell King (Oracle) Reviewed-by: Maxime Chevallier Tested-by: Maxime Chevallier Link: https://patch.msgid.link/E1vy6AY-0000000BtxJ-3smT@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c | 2 +- include/linux/stmmac.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index ece2a0c38562..fc13bfb47783 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -699,7 +699,7 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, /* Intel mgbe SGMII interface uses pcs-xcps */ if (plat->phy_interface == PHY_INTERFACE_MODE_SGMII || plat->phy_interface == PHY_INTERFACE_MODE_1000BASEX) { - plat->mdio_bus_data->pcs_mask = BIT(INTEL_MGBE_XPCS_ADDR); + plat->mdio_bus_data->pcs_mask = BIT_U32(INTEL_MGBE_XPCS_ADDR); plat->mdio_bus_data->default_an_inband = true; plat->select_pcs = intel_mgbe_select_pcs; } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index b913fe5af488..ada6c6ef1f5c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -168,7 +168,7 @@ static int loongson_gnet_data(struct pci_dev *pdev, loongson_default_data(pdev, plat); plat->phy_interface = PHY_INTERFACE_MODE_GMII; - plat->mdio_bus_data->phy_mask = ~(u32)BIT(2); + plat->mdio_bus_data->phy_mask = ~BIT_U32(2); plat->fix_mac_speed = loongson_gnet_fix_speed; return 0; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 678d03d6d3bd..965ada809fdf 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -84,8 +84,8 @@ struct stmmac_priv; /* Platfrom data for platform device structure's platform_data field */ struct stmmac_mdio_bus_data { - unsigned int phy_mask; - unsigned int pcs_mask; + u32 phy_mask; + u32 pcs_mask; int *irqs; int probed_phy_irq; bool needs_reset; -- cgit v1.2.3 From 260d27b3aec9f30d68f9f3cacc674655897eb745 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 4 Mar 2026 21:17:28 +0100 Subject: net: phy: remove phy_attach 378e6523ebb1 ("net: bcmgenet: remove unused platform code") removed the last user of phy_attach(). So remove this function. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/8812176a-e319-4e9f-815d-99ea339df8b2@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 38 -------------------------------------- include/linux/phy.h | 2 -- 2 files changed, 40 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 3bd415710bf3..d1cbcfc3d2a6 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1895,44 +1895,6 @@ error_put_device: } EXPORT_SYMBOL(phy_attach_direct); -/** - * phy_attach - attach a network device to a particular PHY device - * @dev: network device to attach - * @bus_id: Bus ID of PHY device to attach - * @interface: PHY device's interface - * - * Description: Same as phy_attach_direct() except that a PHY bus_id - * string is passed instead of a pointer to a struct phy_device. - */ -struct phy_device *phy_attach(struct net_device *dev, const char *bus_id, - phy_interface_t interface) -{ - struct phy_device *phydev; - struct device *d; - int rc; - - if (!dev) - return ERR_PTR(-EINVAL); - - /* Search the list of PHY devices on the mdio bus for the - * PHY with the requested name - */ - d = bus_find_device_by_name(&mdio_bus_type, NULL, bus_id); - if (!d) { - pr_err("PHY %s not found\n", bus_id); - return ERR_PTR(-ENODEV); - } - phydev = to_phy_device(d); - - rc = phy_attach_direct(dev, phydev, phydev->dev_flags, interface); - put_device(d); - if (rc) - return ERR_PTR(rc); - - return phydev; -} -EXPORT_SYMBOL(phy_attach); - /** * phy_detach - detach a PHY device from its network device * @phydev: target phy_device struct diff --git a/include/linux/phy.h b/include/linux/phy.h index 6f9979a26892..e9b0d7427b0e 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2152,8 +2152,6 @@ int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); int __phy_resume(struct phy_device *phydev); int phy_loopback(struct phy_device *phydev, bool enable, int speed); -struct phy_device *phy_attach(struct net_device *dev, const char *bus_id, - phy_interface_t interface); struct phy_device *phy_find_next(struct mii_bus *bus, struct phy_device *pos); int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, u32 flags, phy_interface_t interface); -- cgit v1.2.3 From c9429bf56405a326845a8a35357b5bdf1dc4558c Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 24 Feb 2026 19:29:54 +0000 Subject: rhashtable: consolidate hash computation in rht_key_get_hash() The else-if and else branches in rht_key_get_hash() both compute a hash using either params.hashfn or jhash, differing only in the source of key_len (params.key_len vs ht->p.key_len). Merge the two branches into one by using the ternary `params.key_len ?: ht->p.key_len` to select the key length, removing the duplicated logic. This also improves the performance of the else branch which previously always used jhash and never fell through to jhash2. This branch is going to be used by BPF resizable hashmap, which wraps rhashtable: https://lore.kernel.org/bpf/20260205-rhash-v1-0-30dd6d63c462@meta.com/ Signed-off-by: Mykyta Yatsenko Signed-off-by: Herbert Xu --- include/linux/rhashtable.h | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 133ccb39137a..0480509a6339 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -129,10 +129,10 @@ static __always_inline unsigned int rht_key_get_hash(struct rhashtable *ht, unsigned int hash; /* params must be equal to ht->p if it isn't constant. */ - if (!__builtin_constant_p(params.key_len)) + if (!__builtin_constant_p(params.key_len)) { hash = ht->p.hashfn(key, ht->key_len, hash_rnd); - else if (params.key_len) { - unsigned int key_len = params.key_len; + } else { + unsigned int key_len = params.key_len ? : ht->p.key_len; if (params.hashfn) hash = params.hashfn(key, key_len, hash_rnd); @@ -140,13 +140,6 @@ static __always_inline unsigned int rht_key_get_hash(struct rhashtable *ht, hash = jhash(key, key_len, hash_rnd); else hash = jhash2(key, key_len / sizeof(u32), hash_rnd); - } else { - unsigned int key_len = ht->p.key_len; - - if (params.hashfn) - hash = params.hashfn(key, key_len, hash_rnd); - else - hash = jhash(key, key_len, hash_rnd); } return hash; -- cgit v1.2.3 From 30b0515342db48ac9ffd9999648de0f7ca1d6a87 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2026 05:29:50 -1000 Subject: sched_ext: Add per-CPU data to DSQs Add per-CPU data structure to dispatch queues. Each DSQ now has a percpu scx_dsq_pcpu which contains a back-pointer to the DSQ. This will be used by future changes to implement per-CPU reenqueue tracking for user DSQs. init_dsq() now allocates the percpu data and can fail, so it returns an error code. All callers are updated to handle failures. exit_dsq() is added to free the percpu data and is called from all DSQ cleanup paths. In scx_bpf_create_dsq(), init_dsq() is called before rcu_read_lock() since alloc_percpu() requires GFP_KERNEL context, and dsq->sched is set afterwards. v2: Fix err_free_pcpu to only exit_dsq() initialized bypass DSQs (Andrea Righi). Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 5 +++ kernel/sched/ext.c | 87 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 77 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index f354d7d34306..98cc1f41b91e 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -62,6 +62,10 @@ enum scx_dsq_id_flags { SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, }; +struct scx_dsq_pcpu { + struct scx_dispatch_q *dsq; +}; + /* * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to @@ -79,6 +83,7 @@ struct scx_dispatch_q { struct rhash_head hash_node; struct llist_node free_node; struct scx_sched *sched; + struct scx_dsq_pcpu __percpu *pcpu; struct rcu_head rcu; }; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index d8ea12ddc206..aea09eb36873 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4020,15 +4020,42 @@ DEFINE_SCHED_CLASS(ext) = { #endif }; -static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, - struct scx_sched *sch) +static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, + struct scx_sched *sch) { + s32 cpu; + memset(dsq, 0, sizeof(*dsq)); raw_spin_lock_init(&dsq->lock); INIT_LIST_HEAD(&dsq->list); dsq->id = dsq_id; dsq->sched = sch; + + dsq->pcpu = alloc_percpu(struct scx_dsq_pcpu); + if (!dsq->pcpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); + + pcpu->dsq = dsq; + } + + return 0; +} + +static void exit_dsq(struct scx_dispatch_q *dsq) +{ + free_percpu(dsq->pcpu); +} + +static void free_dsq_rcufn(struct rcu_head *rcu) +{ + struct scx_dispatch_q *dsq = container_of(rcu, struct scx_dispatch_q, rcu); + + exit_dsq(dsq); + kfree(dsq); } static void free_dsq_irq_workfn(struct irq_work *irq_work) @@ -4037,7 +4064,7 @@ static void free_dsq_irq_workfn(struct irq_work *irq_work) struct scx_dispatch_q *dsq, *tmp_dsq; llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) - kfree_rcu(dsq, rcu); + call_rcu(&dsq->rcu, free_dsq_rcufn); } static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); @@ -4234,15 +4261,17 @@ static void scx_sched_free_rcu_work(struct work_struct *work) cgroup_put(sch_cgroup(sch)); #endif /* CONFIG_EXT_SUB_SCHED */ - /* - * $sch would have entered bypass mode before the RCU grace period. As - * that blocks new deferrals, all deferred_reenq_local_node's must be - * off-list by now. - */ for_each_possible_cpu(cpu) { struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); + /* + * $sch would have entered bypass mode before the RCU grace + * period. As that blocks new deferrals, all + * deferred_reenq_local_node's must be off-list by now. + */ WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local.node)); + + exit_dsq(bypass_dsq(sch, cpu)); } free_percpu(sch->pcpu); @@ -5787,6 +5816,9 @@ static int alloc_kick_syncs(void) static void free_pnode(struct scx_sched_pnode *pnode) { + if (!pnode) + return; + exit_dsq(&pnode->global_dsq); kfree(pnode); } @@ -5798,7 +5830,10 @@ static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node) if (!pnode) return NULL; - init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch); + if (init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch)) { + kfree(pnode); + return NULL; + } return pnode; } @@ -5809,7 +5844,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, { struct scx_sched *sch; s32 level = parent ? parent->level + 1 : 0; - s32 node, cpu, ret; + s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids; sch = kzalloc_flex(*sch, ancestors, level); if (!sch) @@ -5848,8 +5883,13 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, goto err_free_pnode; } - for_each_possible_cpu(cpu) - init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch); + for_each_possible_cpu(cpu) { + ret = init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch); + if (ret) { + bypass_fail_cpu = cpu; + goto err_free_pcpu; + } + } for_each_possible_cpu(cpu) { struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); @@ -5931,6 +5971,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, err_stop_helper: kthread_destroy_worker(sch->helper); err_free_pcpu: + for_each_possible_cpu(cpu) { + if (cpu == bypass_fail_cpu) + break; + exit_dsq(bypass_dsq(sch, cpu)); + } free_percpu(sch->pcpu); err_free_pnode: for_each_node_state(node, N_POSSIBLE) @@ -7173,7 +7218,7 @@ void __init init_sched_ext_class(void) int n = cpu_to_node(cpu); /* local_dsq's sch will be set during scx_root_enable() */ - init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL); + BUG_ON(init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL)); INIT_LIST_HEAD(&rq->scx.runnable_list); INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); @@ -7872,11 +7917,21 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node, const struct bpf_prog_a if (!dsq) return -ENOMEM; + /* + * init_dsq() must be called in GFP_KERNEL context. Init it with NULL + * @sch and update afterwards. + */ + ret = init_dsq(dsq, dsq_id, NULL); + if (ret) { + kfree(dsq); + return ret; + } + rcu_read_lock(); sch = scx_prog_sched(aux); if (sch) { - init_dsq(dsq, dsq_id, sch); + dsq->sched = sch; ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, dsq_hash_params); } else { @@ -7884,8 +7939,10 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node, const struct bpf_prog_a } rcu_read_unlock(); - if (ret) + if (ret) { + exit_dsq(dsq); kfree(dsq); + } return ret; } -- cgit v1.2.3 From 35250720d6ed1e83e0d1e12b7e8bf7b8316d7d58 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2026 05:29:50 -1000 Subject: sched_ext: Factor out nldsq_cursor_next_task() and nldsq_cursor_lost_task() Factor out cursor-based DSQ iteration from bpf_iter_scx_dsq_next() into nldsq_cursor_next_task() and the task-lost check from scx_dsq_move() into nldsq_cursor_lost_task() to prepare for reuse. As ->priv is only used to record dsq->seq for cursors, update INIT_DSQ_LIST_CURSOR() to take the DSQ pointer and set ->priv from dsq->seq so that users don't have to read it manually. Move scx_dsq_iter_flags enum earlier so nldsq_cursor_next_task() can use SCX_DSQ_ITER_REV. bypass_lb_cpu() now sets cursor.priv to dsq->seq but doesn't use it. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 6 +- kernel/sched/ext.c | 154 +++++++++++++++++++++++++++++----------------- 2 files changed, 102 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 98cc1f41b91e..303f57dfb947 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -157,11 +157,11 @@ struct scx_dsq_list_node { u32 priv; /* can be used by iter cursor */ }; -#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv) \ +#define INIT_DSQ_LIST_CURSOR(__cursor, __dsq, __flags) \ (struct scx_dsq_list_node) { \ - .node = LIST_HEAD_INIT((__node).node), \ + .node = LIST_HEAD_INIT((__cursor).node), \ .flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags), \ - .priv = (__priv), \ + .priv = READ_ONCE((__dsq)->seq), \ } struct scx_sched; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index aea09eb36873..f51e4c20cd95 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -570,9 +570,22 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch, return true; } +enum scx_dsq_iter_flags { + /* iterate in the reverse dispatch order */ + SCX_DSQ_ITER_REV = 1U << 16, + + __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, + __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, + + __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, + __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | + __SCX_DSQ_ITER_HAS_SLICE | + __SCX_DSQ_ITER_HAS_VTIME, +}; + /** * nldsq_next_task - Iterate to the next task in a non-local DSQ - * @dsq: user dsq being iterated + * @dsq: non-local dsq being iterated * @cur: current position, %NULL to start iteration * @rev: walk backwards * @@ -612,6 +625,85 @@ static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ (p) = nldsq_next_task((dsq), (p), false)) +/** + * nldsq_cursor_next_task - Iterate to the next task given a cursor in a non-local DSQ + * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR() + * @dsq: non-local dsq being iterated + * + * Find the next task in a cursor based iteration. The caller must have + * initialized @cursor using INIT_DSQ_LIST_CURSOR() and can release the DSQ lock + * between the iteration steps. + * + * Only tasks which were queued before @cursor was initialized are visible. This + * bounds the iteration and guarantees that vtime never jumps in the other + * direction while iterating. + */ +static struct task_struct *nldsq_cursor_next_task(struct scx_dsq_list_node *cursor, + struct scx_dispatch_q *dsq) +{ + bool rev = cursor->flags & SCX_DSQ_ITER_REV; + struct task_struct *p; + + lockdep_assert_held(&dsq->lock); + BUG_ON(!(cursor->flags & SCX_DSQ_LNODE_ITER_CURSOR)); + + if (list_empty(&cursor->node)) + p = NULL; + else + p = container_of(cursor, struct task_struct, scx.dsq_list); + + /* skip cursors and tasks that were queued after @cursor init */ + do { + p = nldsq_next_task(dsq, p, rev); + } while (p && unlikely(u32_before(cursor->priv, p->scx.dsq_seq))); + + if (p) { + if (rev) + list_move_tail(&cursor->node, &p->scx.dsq_list.node); + else + list_move(&cursor->node, &p->scx.dsq_list.node); + } else { + list_del_init(&cursor->node); + } + + return p; +} + +/** + * nldsq_cursor_lost_task - Test whether someone else took the task since iteration + * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR() + * @rq: rq @p was on + * @dsq: dsq @p was on + * @p: target task + * + * @p is a task returned by nldsq_cursor_next_task(). The locks may have been + * dropped and re-acquired inbetween. Verify that no one else took or is in the + * process of taking @p from @dsq. + * + * On %false return, the caller can assume full ownership of @p. + */ +static bool nldsq_cursor_lost_task(struct scx_dsq_list_node *cursor, + struct rq *rq, struct scx_dispatch_q *dsq, + struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + lockdep_assert_held(&dsq->lock); + + /* + * @p could have already left $src_dsq, got re-enqueud, or be in the + * process of being consumed by someone else. + */ + if (unlikely(p->scx.dsq != dsq || + u32_before(cursor->priv, p->scx.dsq_seq) || + p->scx.holding_cpu >= 0)) + return true; + + /* if @p has stayed on @dsq, its rq couldn't have changed */ + if (WARN_ON_ONCE(rq != task_rq(p))) + return true; + + return false; +} /* * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse] @@ -619,19 +711,6 @@ static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, * changes without breaking backward compatibility. Can be used with * bpf_for_each(). See bpf_iter_scx_dsq_*(). */ -enum scx_dsq_iter_flags { - /* iterate in the reverse dispatch order */ - SCX_DSQ_ITER_REV = 1U << 16, - - __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, - __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, - - __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, - __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | - __SCX_DSQ_ITER_HAS_SLICE | - __SCX_DSQ_ITER_HAS_VTIME, -}; - struct bpf_iter_scx_dsq_kern { struct scx_dsq_list_node cursor; struct scx_dispatch_q *dsq; @@ -4497,7 +4576,7 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, s32 donor, struct rq *donor_rq = cpu_rq(donor); struct scx_dispatch_q *donor_dsq = bypass_dsq(sch, donor); struct task_struct *p, *n; - struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0); + struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, donor_dsq, 0); s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target; u32 nr_balanced = 0, min_delta_us; @@ -7542,14 +7621,8 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, locked_rq = src_rq; raw_spin_lock(&src_dsq->lock); - /* - * Did someone else get to it? @p could have already left $src_dsq, got - * re-enqueud, or be in the process of being consumed by someone else. - */ - if (unlikely(p->scx.dsq != src_dsq || - u32_before(kit->cursor.priv, p->scx.dsq_seq) || - p->scx.holding_cpu >= 0) || - WARN_ON_ONCE(src_rq != task_rq(p))) { + /* did someone else get to it while we dropped the locks? */ + if (nldsq_cursor_lost_task(&kit->cursor, src_rq, src_dsq, p)) { raw_spin_unlock(&src_dsq->lock); goto out; } @@ -8188,8 +8261,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, if (!kit->dsq) return -ENOENT; - kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, flags, - READ_ONCE(kit->dsq->seq)); + kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, kit->dsq, flags); return 0; } @@ -8203,41 +8275,13 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, __bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) { struct bpf_iter_scx_dsq_kern *kit = (void *)it; - bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV; - struct task_struct *p; - unsigned long flags; if (!kit->dsq) return NULL; - raw_spin_lock_irqsave(&kit->dsq->lock, flags); + guard(raw_spinlock_irqsave)(&kit->dsq->lock); - if (list_empty(&kit->cursor.node)) - p = NULL; - else - p = container_of(&kit->cursor, struct task_struct, scx.dsq_list); - - /* - * Only tasks which were queued before the iteration started are - * visible. This bounds BPF iterations and guarantees that vtime never - * jumps in the other direction while iterating. - */ - do { - p = nldsq_next_task(kit->dsq, p, rev); - } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq))); - - if (p) { - if (rev) - list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node); - else - list_move(&kit->cursor.node, &p->scx.dsq_list.node); - } else { - list_del_init(&kit->cursor.node); - } - - raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); - - return p; + return nldsq_cursor_next_task(&kit->cursor, kit->dsq); } /** -- cgit v1.2.3 From 84b1a0ea0b7c23dec240783a592e480780efe459 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2026 05:29:50 -1000 Subject: sched_ext: Implement scx_bpf_dsq_reenq() for user DSQs scx_bpf_dsq_reenq() currently only supports local DSQs. Extend it to support user-defined DSQs by adding a deferred re-enqueue mechanism similar to the local DSQ handling. Add per-cpu deferred_reenq_user_node/flags to scx_dsq_pcpu and deferred_reenq_users list to scx_rq. When scx_bpf_dsq_reenq() is called on a user DSQ, the DSQ's per-cpu node is added to the current rq's deferred list. process_deferred_reenq_users() then iterates the DSQ using the cursor helpers and re-enqueues each task. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 6 ++ kernel/sched/ext.c | 128 +++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 1 + tools/sched_ext/scx_qmap.bpf.c | 57 +++++++++++++++++- 4 files changed, 190 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 303f57dfb947..e77504faa0bc 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -62,8 +62,14 @@ enum scx_dsq_id_flags { SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, }; +struct scx_deferred_reenq_user { + struct list_head node; + u64 flags; +}; + struct scx_dsq_pcpu { struct scx_dispatch_q *dsq; + struct scx_deferred_reenq_user deferred_reenq_user; }; /* diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f51e4c20cd95..805c6689c99a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1180,6 +1180,18 @@ static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq drl->flags |= reenq_flags; } + schedule_deferred(rq); + } else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) { + struct rq *rq = this_rq(); + struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq)); + struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user; + + scoped_guard (raw_spinlock_irqsave, &rq->scx.deferred_reenq_lock) { + if (list_empty(&dru->node)) + list_move_tail(&dru->node, &rq->scx.deferred_reenq_users); + dru->flags |= reenq_flags; + } + schedule_deferred(rq); } else { scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id); @@ -3784,12 +3796,108 @@ static void process_deferred_reenq_locals(struct rq *rq) } } +static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags) +{ + struct rq *locked_rq = rq; + struct scx_sched *sch = dsq->sched; + struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0); + struct task_struct *p; + s32 nr_enqueued = 0; + + lockdep_assert_rq_held(rq); + + raw_spin_lock(&dsq->lock); + + while (likely(!READ_ONCE(sch->bypass_depth))) { + struct rq *task_rq; + + p = nldsq_cursor_next_task(&cursor, dsq); + if (!p) + break; + + if (!task_should_reenq(p, reenq_flags)) + continue; + + task_rq = task_rq(p); + + if (locked_rq != task_rq) { + if (locked_rq) + raw_spin_rq_unlock(locked_rq); + if (unlikely(!raw_spin_rq_trylock(task_rq))) { + raw_spin_unlock(&dsq->lock); + raw_spin_rq_lock(task_rq); + raw_spin_lock(&dsq->lock); + } + locked_rq = task_rq; + + /* did we lose @p while switching locks? */ + if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p)) + continue; + } + + /* @p is on @dsq, its rq and @dsq are locked */ + dispatch_dequeue_locked(p, dsq); + raw_spin_unlock(&dsq->lock); + do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1); + + if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) { + raw_spin_rq_unlock(locked_rq); + locked_rq = NULL; + cpu_relax(); + } + + raw_spin_lock(&dsq->lock); + } + + list_del_init(&cursor.node); + raw_spin_unlock(&dsq->lock); + + if (locked_rq != rq) { + if (locked_rq) + raw_spin_rq_unlock(locked_rq); + raw_spin_rq_lock(rq); + } +} + +static void process_deferred_reenq_users(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + + while (true) { + struct scx_dispatch_q *dsq; + u64 reenq_flags = 0; + + scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { + struct scx_deferred_reenq_user *dru = + list_first_entry_or_null(&rq->scx.deferred_reenq_users, + struct scx_deferred_reenq_user, + node); + struct scx_dsq_pcpu *dsq_pcpu; + + if (!dru) + return; + + dsq_pcpu = container_of(dru, struct scx_dsq_pcpu, + deferred_reenq_user); + dsq = dsq_pcpu->dsq; + swap(dru->flags, reenq_flags); + list_del_init(&dru->node); + } + + BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN); + reenq_user(rq, dsq, reenq_flags); + } +} + static void run_deferred(struct rq *rq) { process_ddsp_deferred_locals(rq); if (!list_empty(&rq->scx.deferred_reenq_locals)) process_deferred_reenq_locals(rq); + + if (!list_empty(&rq->scx.deferred_reenq_users)) + process_deferred_reenq_users(rq); } #ifdef CONFIG_NO_HZ_FULL @@ -4119,6 +4227,7 @@ static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); pcpu->dsq = dsq; + INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node); } return 0; @@ -4126,6 +4235,23 @@ static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, static void exit_dsq(struct scx_dispatch_q *dsq) { + s32 cpu; + + for_each_possible_cpu(cpu) { + struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); + struct scx_deferred_reenq_user *dru = &pcpu->deferred_reenq_user; + struct rq *rq = cpu_rq(cpu); + + /* + * There must have been a RCU grace period since the last + * insertion and @dsq should be off the deferred list by now. + */ + if (WARN_ON_ONCE(!list_empty(&dru->node))) { + guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); + list_del_init(&dru->node); + } + } + free_percpu(dsq->pcpu); } @@ -7308,6 +7434,7 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); raw_spin_lock_init(&rq->scx.deferred_reenq_lock); INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals); + INIT_LIST_HEAD(&rq->scx.deferred_reenq_users); rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); @@ -8354,6 +8481,7 @@ __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id, * supported: * * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu) + * - User DSQs * * Re-enqueues are performed asynchronously. Can be called from anywhere. */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0794852524e7..893f89ce2a77 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -810,6 +810,7 @@ struct scx_rq { raw_spinlock_t deferred_reenq_lock; struct list_head deferred_reenq_locals; /* scheds requesting reenq of local DSQ */ + struct list_head deferred_reenq_users; /* user DSQs requesting reenq */ struct balance_callback deferred_bal_cb; struct irq_work deferred_irq_work; struct irq_work kick_cpus_irq_work; diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 83e8289e8c0c..a4a1b84fe359 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -26,8 +26,11 @@ enum consts { ONE_SEC_IN_NS = 1000000000, + ONE_MSEC_IN_NS = 1000000, + LOWPRI_INTV_NS = 10 * ONE_MSEC_IN_NS, SHARED_DSQ = 0, HIGHPRI_DSQ = 1, + LOWPRI_DSQ = 2, HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */ }; @@ -172,6 +175,9 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, if (!(tctx = lookup_task_ctx(p))) return -ESRCH; + if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD)) + return prev_cpu; + cpu = pick_direct_dispatch_cpu(p, prev_cpu); if (cpu >= 0) { @@ -242,6 +248,13 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } + /* see lowpri_timerfn() */ + if (__COMPAT_has_generic_reenq() && + p->scx.weight < 2 && !(p->flags & PF_KTHREAD) && !(enq_flags & SCX_ENQ_REENQ)) { + scx_bpf_dsq_insert(p, LOWPRI_DSQ, slice_ns, enq_flags); + return; + } + /* if select_cpu() wasn't called, try direct dispatch */ if (!__COMPAT_is_enq_cpu_selected(enq_flags) && (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { @@ -873,6 +886,28 @@ static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) return 0; } +struct lowpri_timer { + struct bpf_timer timer; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, u32); + __type(value, struct lowpri_timer); +} lowpri_timer SEC(".maps"); + +/* + * Nice 19 tasks are put into the lowpri DSQ. Every 10ms, reenq is triggered and + * the tasks are transferred to SHARED_DSQ. + */ +static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer) +{ + scx_bpf_dsq_reenq(LOWPRI_DSQ, 0); + bpf_timer_start(timer, LOWPRI_INTV_NS, 0); + return 0; +} + s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) { u32 key = 0; @@ -894,14 +929,32 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) return ret; } + ret = scx_bpf_create_dsq(LOWPRI_DSQ, -1); + if (ret) + return ret; + timer = bpf_map_lookup_elem(&monitor_timer, &key); if (!timer) return -ESRCH; - bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC); bpf_timer_set_callback(timer, monitor_timerfn); + ret = bpf_timer_start(timer, ONE_SEC_IN_NS, 0); + if (ret) + return ret; - return bpf_timer_start(timer, ONE_SEC_IN_NS, 0); + if (__COMPAT_has_generic_reenq()) { + /* see lowpri_timerfn() */ + timer = bpf_map_lookup_elem(&lowpri_timer, &key); + if (!timer) + return -ESRCH; + bpf_timer_init(timer, &lowpri_timer, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, lowpri_timerfn); + ret = bpf_timer_start(timer, LOWPRI_INTV_NS, 0); + if (ret) + return ret; + } + + return 0; } void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) -- cgit v1.2.3 From 7203d77d6e04f83f7b78838eed099d9cac31700b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2026 05:29:50 -1000 Subject: sched_ext: Simplify task state handling Task states (NONE, INIT, READY, ENABLED) were defined in a separate enum with unshifted values and then shifted when stored in scx_entity.flags. Simplify by defining them as pre-shifted values directly in scx_ent_flags and removing the separate scx_task_state enum. This removes the need for shifting when reading/writing state values. scx_get_task_state() now returns the masked flags value directly. scx_set_task_state() accepts the pre-shifted state value. scx_dump_task() shifts down for display to maintain readable output. No functional changes. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 28 ++++++++++++++++------------ kernel/sched/ext.c | 19 +++++++++---------- 2 files changed, 25 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index e77504faa0bc..e822b374b17f 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -93,7 +93,7 @@ struct scx_dispatch_q { struct rcu_head rcu; }; -/* scx_entity.flags */ +/* sched_ext_entity.flags */ enum scx_ent_flags { SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ SCX_TASK_IN_CUSTODY = 1 << 1, /* in custody, needs ops.dequeue() when leaving */ @@ -101,21 +101,25 @@ enum scx_ent_flags { SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ - SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ + /* + * Bits 8 and 9 are used to carry task state: + * + * NONE ops.init_task() not called yet + * INIT ops.init_task() succeeded, but task can be cancelled + * READY fully initialized, but not in sched_ext + * ENABLED fully initialized and in sched_ext + */ + SCX_TASK_STATE_SHIFT = 8, /* bits 8 and 9 are used to carry task state */ SCX_TASK_STATE_BITS = 2, SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, - SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ -}; - -/* scx_entity.flags & SCX_TASK_STATE_MASK */ -enum scx_task_state { - SCX_TASK_NONE, /* ops.init_task() not called yet */ - SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */ - SCX_TASK_READY, /* fully initialized, but not in sched_ext */ - SCX_TASK_ENABLED, /* fully initialized and in sched_ext */ + SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT, + SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT, + SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, + SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, - SCX_TASK_NR_STATES, + /* iteration cursor, not a task */ + SCX_TASK_CURSOR = 1 << 31, }; /* scx_entity.dsq_flags */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index ee756f1a70e1..f55e1603fc8c 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3284,18 +3284,16 @@ static struct cgroup *tg_cgrp(struct task_group *tg) #endif /* CONFIG_EXT_GROUP_SCHED */ -static enum scx_task_state scx_get_task_state(const struct task_struct *p) +static u32 scx_get_task_state(const struct task_struct *p) { - return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; + return p->scx.flags & SCX_TASK_STATE_MASK; } -static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) +static void scx_set_task_state(struct task_struct *p, u32 state) { - enum scx_task_state prev_state = scx_get_task_state(p); + u32 prev_state = scx_get_task_state(p); bool warn = false; - BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS)); - switch (state) { case SCX_TASK_NONE: break; @@ -3313,11 +3311,11 @@ static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) return; } - WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]", + WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", prev_state, state, p->comm, p->pid); p->scx.flags &= ~SCX_TASK_STATE_MASK; - p->scx.flags |= state << SCX_TASK_STATE_SHIFT; + p->scx.flags |= state; } static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) @@ -5794,7 +5792,8 @@ static void scx_dump_task(struct scx_sched *sch, own_marker, sch_id_buf, jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", - scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, + scx_get_task_state(p) >> SCX_TASK_STATE_SHIFT, + p->scx.flags & ~SCX_TASK_STATE_MASK, p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, ops_state >> SCX_OPSS_QSEQ_SHIFT); dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", @@ -6558,7 +6557,7 @@ static struct scx_sched *find_parent_sched(struct cgroup *cgrp) static bool assert_task_ready_or_enabled(struct task_struct *p) { - enum scx_task_state state = scx_get_task_state(p); + u32 state = scx_get_task_state(p); switch (state) { case SCX_TASK_READY: -- cgit v1.2.3 From ce897abc21b2d5e74981ff2b848f3a08a580d50a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 7 Mar 2026 05:29:50 -1000 Subject: sched_ext: Add SCX_TASK_REENQ_REASON flags SCX_ENQ_REENQ indicates that a task is being re-enqueued but doesn't tell the BPF scheduler why. Add SCX_TASK_REENQ_REASON flags using bits 12-13 of p->scx.flags to communicate the reason during ops.enqueue(): - NONE: Not being reenqueued - KFUNC: Reenqueued by scx_bpf_dsq_reenq() and friends More reasons will be added. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 15 +++++++++++++++ kernel/sched/ext.c | 25 ++++++++++++++++++++++--- kernel/sched/ext_internal.h | 10 +++------- 3 files changed, 40 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index e822b374b17f..60a4f65d0174 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -118,6 +118,21 @@ enum scx_ent_flags { SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT, SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT, + /* + * Bits 12 and 13 are used to carry reenqueue reason. In addition to + * %SCX_ENQ_REENQ flag, ops.enqueue() can also test for + * %SCX_TASK_REENQ_REASON_NONE to distinguish reenqueues. + * + * NONE not being reenqueued + * KFUNC reenqueued by scx_bpf_dsq_reenq() and friends + */ + SCX_TASK_REENQ_REASON_SHIFT = 12, + SCX_TASK_REENQ_REASON_BITS = 2, + SCX_TASK_REENQ_REASON_MASK = ((1 << SCX_TASK_REENQ_REASON_BITS) - 1) << SCX_TASK_REENQ_REASON_SHIFT, + + SCX_TASK_REENQ_NONE = 0 << SCX_TASK_REENQ_REASON_SHIFT, + SCX_TASK_REENQ_KFUNC = 1 << SCX_TASK_REENQ_REASON_SHIFT, + /* iteration cursor, not a task */ SCX_TASK_CURSOR = 1 << 31, }; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f55e1603fc8c..d5849ed4cd3e 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3728,8 +3728,10 @@ static void process_ddsp_deferred_locals(struct rq *rq) } } -static bool task_should_reenq(struct task_struct *p, u64 reenq_flags) +static bool task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason) { + *reason = SCX_TASK_REENQ_KFUNC; + if (reenq_flags & SCX_REENQ_ANY) return true; return false; @@ -3751,6 +3753,7 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags) list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list, scx.dsq_list.node) { struct scx_sched *task_sch = scx_task_sched(p); + u32 reason; /* * If @p is being migrated, @p's current CPU may not agree with @@ -3769,16 +3772,24 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags) if (!scx_is_descendant(task_sch, sch)) continue; - if (!task_should_reenq(p, reenq_flags)) + if (!task_should_reenq(p, reenq_flags, &reason)) continue; dispatch_dequeue(rq, p); + + if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK)) + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + p->scx.flags |= reason; + list_add_tail(&p->scx.dsq_list.node, &tasks); } list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) { list_del_init(&p->scx.dsq_list.node); + do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); + + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; nr_enqueued++; } @@ -3832,12 +3843,13 @@ static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flag while (likely(!READ_ONCE(sch->bypass_depth))) { struct rq *task_rq; + u32 reason; p = nldsq_cursor_next_task(&cursor, dsq); if (!p) break; - if (!task_should_reenq(p, reenq_flags)) + if (!task_should_reenq(p, reenq_flags, &reason)) continue; task_rq = task_rq(p); @@ -3860,8 +3872,15 @@ static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flag /* @p is on @dsq, its rq and @dsq are locked */ dispatch_dequeue_locked(p, dsq); raw_spin_unlock(&dsq->lock); + + if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK)) + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + p->scx.flags |= reason; + do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1); + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) { raw_spin_rq_unlock(locked_rq); locked_rq = NULL; diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index d9eda2e8701c..f8df73044515 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -1080,13 +1080,9 @@ enum scx_enq_flags { SCX_ENQ_PREEMPT = 1LLU << 32, /* - * The task being enqueued was previously enqueued on the current CPU's - * %SCX_DSQ_LOCAL, but was removed from it in a call to the - * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was - * invoked in a ->cpu_release() callback, and the task is again - * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the - * task will not be scheduled on the CPU until at least the next invocation - * of the ->cpu_acquire() callback. + * The task being enqueued was previously enqueued on a DSQ, but was + * removed and is being re-enqueued. See SCX_TASK_REENQ_* flags to find + * out why a given task is being reenqueued. */ SCX_ENQ_REENQ = 1LLU << 40, -- cgit v1.2.3 From 1ea4b473504b6dc6a0d21c298519aff2d52433c9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 Mar 2026 19:55:41 +0000 Subject: locking/rwsem: Remove the list_head from struct rw_semaphore Instead of embedding a list_head in struct rw_semaphore, store a pointer to the first waiter. The list of waiters remains a doubly linked list so we can efficiently add to the tail of the list, remove from the front (or middle) of the list. Some of the list manipulation becomes more complicated, but it's a reasonable tradeoff on the slow paths to shrink some core data structures like struct inode. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260305195545.3707590-2-willy@infradead.org --- include/linux/rwsem.h | 8 ++--- kernel/locking/rwsem.c | 90 ++++++++++++++++++++++++++++++++------------------ 2 files changed, 62 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 9bf1d93d3d7b..e7829531c4ba 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -57,7 +57,7 @@ context_lock_struct(rw_semaphore) { struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif raw_spinlock_t wait_lock; - struct list_head wait_list; + struct rwsem_waiter *first_waiter; #ifdef CONFIG_DEBUG_RWSEMS void *magic; #endif @@ -106,7 +106,7 @@ static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore * .owner = ATOMIC_LONG_INIT(0), \ __RWSEM_OPT_INIT(name) \ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\ - .wait_list = LIST_HEAD_INIT((name).wait_list), \ + .first_waiter = NULL, \ __RWSEM_DEBUG_INIT(name) \ __RWSEM_DEP_MAP_INIT(name) } @@ -129,9 +129,9 @@ do { \ * rwsem to see if somebody from an incompatible type is wanting access to the * lock. */ -static inline int rwsem_is_contended(struct rw_semaphore *sem) +static inline bool rwsem_is_contended(struct rw_semaphore *sem) { - return !list_empty(&sem->wait_list); + return sem->first_waiter != NULL; } #if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 24df4d98f7d2..e66f37ebc6f6 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -72,7 +72,7 @@ #c, atomic_long_read(&(sem)->count), \ (unsigned long) sem->magic, \ atomic_long_read(&(sem)->owner), (long)current, \ - list_empty(&(sem)->wait_list) ? "" : "not ")) \ + (sem)->first_waiter ? "" : "not ")) \ debug_locks_off(); \ } while (0) #else @@ -321,7 +321,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, #endif atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); raw_spin_lock_init(&sem->wait_lock); - INIT_LIST_HEAD(&sem->wait_list); + sem->first_waiter = NULL; atomic_long_set(&sem->owner, 0L); #ifdef CONFIG_RWSEM_SPIN_ON_OWNER osq_lock_init(&sem->osq); @@ -341,8 +341,6 @@ struct rwsem_waiter { unsigned long timeout; bool handoff_set; }; -#define rwsem_first_waiter(sem) \ - list_first_entry(&sem->wait_list, struct rwsem_waiter, list) enum rwsem_wake_type { RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ @@ -365,12 +363,21 @@ enum rwsem_wake_type { */ #define MAX_READERS_WAKEUP 0x100 -static inline void -rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +static inline +bool __rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) { - lockdep_assert_held(&sem->wait_lock); - list_add_tail(&waiter->list, &sem->wait_list); - /* caller will set RWSEM_FLAG_WAITERS */ + if (list_empty(&waiter->list)) { + sem->first_waiter = NULL; + return true; + } + + if (sem->first_waiter == waiter) { + sem->first_waiter = list_first_entry(&waiter->list, + struct rwsem_waiter, list); + } + list_del(&waiter->list); + + return false; } /* @@ -385,14 +392,23 @@ static inline bool rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) { lockdep_assert_held(&sem->wait_lock); - list_del(&waiter->list); - if (likely(!list_empty(&sem->wait_list))) + if (__rwsem_del_waiter(sem, waiter)) return true; - atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count); return false; } +static inline +struct rwsem_waiter *next_waiter(const struct rw_semaphore *sem, + const struct rwsem_waiter *waiter) +{ + struct rwsem_waiter *next = list_first_entry(&waiter->list, + struct rwsem_waiter, list); + if (next == sem->first_waiter) + return NULL; + return next; +} + /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must @@ -411,7 +427,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) { - struct rwsem_waiter *waiter, *tmp; + struct rwsem_waiter *waiter, *next; long oldcount, woken = 0, adjustment = 0; struct list_head wlist; @@ -421,7 +437,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * Take a peek at the queue head waiter such that we can determine * the wakeup(s) to perform. */ - waiter = rwsem_first_waiter(sem); + waiter = sem->first_waiter; if (waiter->type == RWSEM_WAITING_FOR_WRITE) { if (wake_type == RWSEM_WAKE_ANY) { @@ -506,25 +522,28 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * put them into wake_q to be woken up later. */ INIT_LIST_HEAD(&wlist); - list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { + do { + next = next_waiter(sem, waiter); if (waiter->type == RWSEM_WAITING_FOR_WRITE) continue; woken++; list_move_tail(&waiter->list, &wlist); + if (sem->first_waiter == waiter) + sem->first_waiter = next; /* * Limit # of readers that can be woken up per wakeup call. */ if (unlikely(woken >= MAX_READERS_WAKEUP)) break; - } + } while ((waiter = next) != NULL); adjustment = woken * RWSEM_READER_BIAS - adjustment; lockevent_cond_inc(rwsem_wake_reader, woken); oldcount = atomic_long_read(&sem->count); - if (list_empty(&sem->wait_list)) { + if (!sem->first_waiter) { /* * Combined with list_move_tail() above, this implies * rwsem_del_waiter(). @@ -545,7 +564,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, atomic_long_add(adjustment, &sem->count); /* 2nd pass */ - list_for_each_entry_safe(waiter, tmp, &wlist, list) { + list_for_each_entry_safe(waiter, next, &wlist, list) { struct task_struct *tsk; tsk = waiter->task; @@ -577,7 +596,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter, struct wake_q_head *wake_q) __releases(&sem->wait_lock) { - bool first = rwsem_first_waiter(sem) == waiter; + bool first = sem->first_waiter == waiter; wake_q_init(wake_q); @@ -603,7 +622,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter, static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, struct rwsem_waiter *waiter) { - struct rwsem_waiter *first = rwsem_first_waiter(sem); + struct rwsem_waiter *first = sem->first_waiter; long count, new; lockdep_assert_held(&sem->wait_lock); @@ -639,7 +658,7 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, new |= RWSEM_WRITER_LOCKED; new &= ~RWSEM_FLAG_HANDOFF; - if (list_is_singular(&sem->wait_list)) + if (list_empty(&first->list)) new &= ~RWSEM_FLAG_WAITERS; } } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)); @@ -659,7 +678,8 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on * success. */ - list_del(&waiter->list); + __rwsem_del_waiter(sem, waiter); + rwsem_set_owner(sem); return true; } @@ -994,7 +1014,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat { long adjustment = -RWSEM_READER_BIAS; long rcnt = (count >> RWSEM_READER_SHIFT); - struct rwsem_waiter waiter; + struct rwsem_waiter waiter, *first; DEFINE_WAKE_Q(wake_q); /* @@ -1019,7 +1039,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat */ if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) { raw_spin_lock_irq(&sem->wait_lock); - if (!list_empty(&sem->wait_list)) + if (sem->first_waiter) rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); @@ -1035,7 +1055,8 @@ queue: waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); - if (list_empty(&sem->wait_list)) { + first = sem->first_waiter; + if (!first) { /* * In case the wait queue is empty and the lock isn't owned * by a writer, this reader can exit the slowpath and return @@ -1051,8 +1072,11 @@ queue: return sem; } adjustment += RWSEM_FLAG_WAITERS; + INIT_LIST_HEAD(&waiter.list); + sem->first_waiter = &waiter; + } else { + list_add_tail(&waiter.list, &first->list); } - rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); @@ -1110,7 +1134,7 @@ out_nolock: static struct rw_semaphore __sched * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { - struct rwsem_waiter waiter; + struct rwsem_waiter waiter, *first; DEFINE_WAKE_Q(wake_q); /* do optimistic spinning and steal lock if possible */ @@ -1129,10 +1153,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); - rwsem_add_waiter(sem, &waiter); - /* we're now waiting on the lock */ - if (rwsem_first_waiter(sem) != &waiter) { + first = sem->first_waiter; + if (first) { + list_add_tail(&waiter.list, &first->list); rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count), &wake_q); if (!wake_q_empty(&wake_q)) { @@ -1145,6 +1169,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) raw_spin_lock_irq(&sem->wait_lock); } } else { + INIT_LIST_HEAD(&waiter.list); + sem->first_waiter = &waiter; atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); } @@ -1218,7 +1244,7 @@ static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (!list_empty(&sem->wait_list)) + if (sem->first_waiter) rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -1239,7 +1265,7 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (!list_empty(&sem->wait_list)) + if (sem->first_waiter) rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -- cgit v1.2.3 From b9bdd4b6840454ef87f61b6506c9635c57a81650 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 Mar 2026 19:55:42 +0000 Subject: locking/semaphore: Remove the list_head from struct semaphore Instead of embedding a list_head in struct semaphore, store a pointer to the first waiter. The list of waiters remains a doubly linked list so we can efficiently add to the tail of the list and remove from the front (or middle) of the list. Some of the list manipulation becomes more complicated, but it's a reasonable tradeoff on the slow paths to shrink data structures which embed a semaphore. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260305195545.3707590-3-willy@infradead.org --- drivers/acpi/osl.c | 2 +- include/linux/semaphore.h | 4 ++-- kernel/locking/semaphore.c | 41 +++++++++++++++++++++++++++++++---------- 3 files changed, 34 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 5b777316b9ac..2af0db9210fe 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -1257,7 +1257,7 @@ acpi_status acpi_os_delete_semaphore(acpi_handle handle) ACPI_DEBUG_PRINT((ACPI_DB_MUTEX, "Deleting semaphore[%p].\n", handle)); - BUG_ON(!list_empty(&sem->wait_list)); + BUG_ON(sem->first_waiter); kfree(sem); sem = NULL; diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h index 89706157e622..a4c8651ef021 100644 --- a/include/linux/semaphore.h +++ b/include/linux/semaphore.h @@ -15,7 +15,7 @@ struct semaphore { raw_spinlock_t lock; unsigned int count; - struct list_head wait_list; + struct semaphore_waiter *first_waiter; #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER unsigned long last_holder; @@ -33,7 +33,7 @@ struct semaphore { { \ .lock = __RAW_SPIN_LOCK_UNLOCKED((name).lock), \ .count = n, \ - .wait_list = LIST_HEAD_INIT((name).wait_list) \ + .first_waiter = NULL \ __LAST_HOLDER_SEMAPHORE_INITIALIZER \ } diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 3ef032e22f7e..74d41433ba13 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -21,7 +21,7 @@ * too. * * The ->count variable represents how many more tasks can acquire this - * semaphore. If it's zero, there may be tasks waiting on the wait_list. + * semaphore. If it's zero, there may be waiters. */ #include @@ -226,7 +226,7 @@ void __sched up(struct semaphore *sem) hung_task_sem_clear_if_holder(sem); - if (likely(list_empty(&sem->wait_list))) + if (likely(!sem->first_waiter)) sem->count++; else __up(sem, &wake_q); @@ -244,6 +244,21 @@ struct semaphore_waiter { bool up; }; +static inline +void sem_del_waiter(struct semaphore *sem, struct semaphore_waiter *waiter) +{ + if (list_empty(&waiter->list)) { + sem->first_waiter = NULL; + return; + } + + if (sem->first_waiter == waiter) { + sem->first_waiter = list_first_entry(&waiter->list, + struct semaphore_waiter, list); + } + list_del(&waiter->list); +} + /* * Because this function is inlined, the 'state' parameter will be * constant, and thus optimised away by the compiler. Likewise the @@ -252,9 +267,15 @@ struct semaphore_waiter { static inline int __sched ___down_common(struct semaphore *sem, long state, long timeout) { - struct semaphore_waiter waiter; - - list_add_tail(&waiter.list, &sem->wait_list); + struct semaphore_waiter waiter, *first; + + first = sem->first_waiter; + if (first) { + list_add_tail(&waiter.list, &first->list); + } else { + INIT_LIST_HEAD(&waiter.list); + sem->first_waiter = &waiter; + } waiter.task = current; waiter.up = false; @@ -274,11 +295,11 @@ static inline int __sched ___down_common(struct semaphore *sem, long state, } timed_out: - list_del(&waiter.list); + sem_del_waiter(sem, &waiter); return -ETIME; interrupted: - list_del(&waiter.list); + sem_del_waiter(sem, &waiter); return -EINTR; } @@ -321,9 +342,9 @@ static noinline int __sched __down_timeout(struct semaphore *sem, long timeout) static noinline void __sched __up(struct semaphore *sem, struct wake_q_head *wake_q) { - struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, - struct semaphore_waiter, list); - list_del(&waiter->list); + struct semaphore_waiter *waiter = sem->first_waiter; + + sem_del_waiter(sem, waiter); waiter->up = true; wake_q_add(wake_q, waiter->task); } -- cgit v1.2.3 From 25500ba7e77ce9d3d9b5a1929d41a2ee2e23f6fe Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 5 Mar 2026 19:55:43 +0000 Subject: locking/mutex: Remove the list_head from struct mutex Instead of embedding a list_head in struct mutex, store a pointer to the first waiter. The list of waiters remains a doubly linked list so we can efficiently add to the tail of the list, remove from the front (or middle) of the list. Some of the list manipulation becomes more complicated, but it's a reasonable tradeoff on the slow paths to shrink data structures which embed a mutex like struct file. Some of the debug checks have to be deleted because there's no equivalent to checking them in the new scheme (eg an empty waiter->list now means that it is the only waiter, not that the waiter is no longer on the list). Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260305195545.3707590-4-willy@infradead.org --- include/linux/mutex.h | 2 +- include/linux/mutex_types.h | 2 +- kernel/locking/mutex-debug.c | 5 +---- kernel/locking/mutex.c | 49 ++++++++++++++++++++++++-------------------- kernel/locking/ww_mutex.h | 25 +++++++--------------- 5 files changed, 37 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 2f648ee204e7..c471b129f703 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -79,7 +79,7 @@ do { \ #define __MUTEX_INITIALIZER(lockname) \ { .owner = ATOMIC_LONG_INIT(0) \ , .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ - , .wait_list = LIST_HEAD_INIT(lockname.wait_list) \ + , .first_waiter = NULL \ __DEBUG_MUTEX_INITIALIZER(lockname) \ __DEP_MAP_MUTEX_INITIALIZER(lockname) } diff --git a/include/linux/mutex_types.h b/include/linux/mutex_types.h index 80975935ec48..a8f119f81177 100644 --- a/include/linux/mutex_types.h +++ b/include/linux/mutex_types.h @@ -44,7 +44,7 @@ context_lock_struct(mutex) { #ifdef CONFIG_MUTEX_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* Spinner MCS lock */ #endif - struct list_head wait_list; + struct mutex_waiter *first_waiter; #ifdef CONFIG_DEBUG_MUTEXES void *magic; #endif diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 2c6b02d4699b..94930d506bcf 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -37,9 +37,8 @@ void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) { lockdep_assert_held(&lock->wait_lock); - DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list)); + DEBUG_LOCKS_WARN_ON(!lock->first_waiter); DEBUG_LOCKS_WARN_ON(waiter->magic != waiter); - DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); } void debug_mutex_free_waiter(struct mutex_waiter *waiter) @@ -62,7 +61,6 @@ void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, { struct mutex *blocked_on = __get_task_blocked_on(task); - DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); DEBUG_LOCKS_WARN_ON(waiter->task != task); DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock); @@ -74,7 +72,6 @@ void debug_mutex_unlock(struct mutex *lock) { if (likely(debug_locks)) { DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); } } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index c867f6c15530..95f1822122a1 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -47,7 +47,7 @@ static void __mutex_init_generic(struct mutex *lock) { atomic_long_set(&lock->owner, 0); raw_spin_lock_init(&lock->wait_lock); - INIT_LIST_HEAD(&lock->wait_list); + lock->first_waiter = NULL; #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); #endif @@ -194,33 +194,42 @@ static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag) atomic_long_andnot(flag, &lock->owner); } -static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_waiter *waiter) -{ - return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter; -} - /* * Add @waiter to a given location in the lock wait_list and set the * FLAG_WAITERS flag if it's the first waiter. */ static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct list_head *list) + struct mutex_waiter *first) { hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX); debug_mutex_add_waiter(lock, waiter, current); - list_add_tail(&waiter->list, list); - if (__mutex_waiter_is_first(lock, waiter)) + if (!first) + first = lock->first_waiter; + + if (first) { + list_add_tail(&waiter->list, &first->list); + } else { + INIT_LIST_HEAD(&waiter->list); + lock->first_waiter = waiter; __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); + } } static void __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) { - list_del(&waiter->list); - if (likely(list_empty(&lock->wait_list))) + if (list_empty(&waiter->list)) { __mutex_clear_flag(lock, MUTEX_FLAGS); + lock->first_waiter = NULL; + } else { + if (lock->first_waiter == waiter) { + lock->first_waiter = list_first_entry(&waiter->list, + struct mutex_waiter, list); + } + list_del(&waiter->list); + } debug_mutex_remove_waiter(lock, waiter, current); hung_task_clear_blocker(); @@ -340,7 +349,7 @@ bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, * Similarly, stop spinning if we are no longer the * first waiter. */ - if (waiter && !__mutex_waiter_is_first(lock, waiter)) + if (waiter && lock->first_waiter != waiter) return false; return true; @@ -645,7 +654,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas if (!use_ww_ctx) { /* add waiting tasks to the end of the waitqueue (FIFO): */ - __mutex_add_waiter(lock, &waiter, &lock->wait_list); + __mutex_add_waiter(lock, &waiter, NULL); } else { /* * Add in stamp order, waking up waiters that must kill @@ -691,7 +700,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas schedule_preempt_disabled(); - first = __mutex_waiter_is_first(lock, &waiter); + first = lock->first_waiter == &waiter; /* * As we likely have been woken up by task @@ -734,8 +743,7 @@ acquired: * Wound-Wait; we stole the lock (!first_waiter), check the * waiters as anyone might want to wound us. */ - if (!ww_ctx->is_wait_die && - !__mutex_waiter_is_first(lock, &waiter)) + if (!ww_ctx->is_wait_die && lock->first_waiter != &waiter) __ww_mutex_check_waiters(lock, ww_ctx, &wake_q); } @@ -931,6 +939,7 @@ EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) { struct task_struct *next = NULL; + struct mutex_waiter *waiter; DEFINE_WAKE_Q(wake_q); unsigned long owner; unsigned long flags; @@ -962,12 +971,8 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne raw_spin_lock_irqsave(&lock->wait_lock, flags); debug_mutex_unlock(lock); - if (!list_empty(&lock->wait_list)) { - /* get the first entry from the wait-list: */ - struct mutex_waiter *waiter = - list_first_entry(&lock->wait_list, - struct mutex_waiter, list); - + waiter = lock->first_waiter; + if (waiter) { next = waiter->task; debug_mutex_wake_waiter(lock, waiter); diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 31a785afee6c..a0847e91ae04 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -8,20 +8,14 @@ static inline struct mutex_waiter * __ww_waiter_first(struct mutex *lock) { - struct mutex_waiter *w; - - w = list_first_entry(&lock->wait_list, struct mutex_waiter, list); - if (list_entry_is_head(w, &lock->wait_list, list)) - return NULL; - - return w; + return lock->first_waiter; } static inline struct mutex_waiter * __ww_waiter_next(struct mutex *lock, struct mutex_waiter *w) { w = list_next_entry(w, list); - if (list_entry_is_head(w, &lock->wait_list, list)) + if (lock->first_waiter == w) return NULL; return w; @@ -31,7 +25,7 @@ static inline struct mutex_waiter * __ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w) { w = list_prev_entry(w, list); - if (list_entry_is_head(w, &lock->wait_list, list)) + if (lock->first_waiter == w) return NULL; return w; @@ -40,22 +34,17 @@ __ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w) static inline struct mutex_waiter * __ww_waiter_last(struct mutex *lock) { - struct mutex_waiter *w; - - w = list_last_entry(&lock->wait_list, struct mutex_waiter, list); - if (list_entry_is_head(w, &lock->wait_list, list)) - return NULL; + struct mutex_waiter *w = lock->first_waiter; + if (w) + w = list_prev_entry(w, list); return w; } static inline void __ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *pos) { - struct list_head *p = &lock->wait_list; - if (pos) - p = &pos->list; - __mutex_add_waiter(lock, waiter, p); + __mutex_add_waiter(lock, waiter, pos); } static inline struct task_struct * -- cgit v1.2.3 From 07574b8ebaac7927e2355b4f343b03b50e04494c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Jan 2026 13:40:30 +0100 Subject: compiler-context-analysys: Add __cond_releases() Useful for things like unlock fastpaths, which on success release the lock. Suggested-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Marco Elver Link: https://patch.msgid.link/20260121111213.634625032@infradead.org --- include/linux/compiler-context-analysis.h | 32 +++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler-context-analysis.h b/include/linux/compiler-context-analysis.h index 00c074a2ccb0..a9317571e6af 100644 --- a/include/linux/compiler-context-analysis.h +++ b/include/linux/compiler-context-analysis.h @@ -320,6 +320,38 @@ static inline void _context_unsafe_alias(void **p) { } */ #define __releases(...) __releases_ctx_lock(__VA_ARGS__) +/* + * Clang's analysis does not care precisely about the value, only that it is + * either zero or non-zero. So the __cond_acquires() interface might be + * misleading if we say that @ret is the value returned if acquired. Instead, + * provide symbolic variants which we translate. + */ +#define __cond_acquires_impl_not_true(x, ...) __try_acquires##__VA_ARGS__##_ctx_lock(0, x) +#define __cond_acquires_impl_not_false(x, ...) __try_acquires##__VA_ARGS__##_ctx_lock(1, x) +#define __cond_acquires_impl_not_nonzero(x, ...) __try_acquires##__VA_ARGS__##_ctx_lock(0, x) +#define __cond_acquires_impl_not_0(x, ...) __try_acquires##__VA_ARGS__##_ctx_lock(1, x) +#define __cond_acquires_impl_not_nonnull(x, ...) __try_acquires##__VA_ARGS__##_ctx_lock(0, x) +#define __cond_acquires_impl_not_NULL(x, ...) __try_acquires##__VA_ARGS__##_ctx_lock(1, x) + +/** + * __cond_releases() - function attribute, function conditionally + * releases a context lock exclusively + * @ret: abstract value returned by function if context lock releases + * @x: context lock instance pointer + * + * Function attribute declaring that the function conditionally releases the + * given context lock instance @x exclusively. The associated context(s) must + * be active on entry. The function return value @ret denotes when the context + * lock is released. + * + * @ret may be one of: true, false, nonzero, 0, nonnull, NULL. + * + * NOTE: clang does not have a native attribute for this; instead implement + * it as an unconditional release and a conditional acquire for the + * inverted condition -- which is semantically equivalent. + */ +#define __cond_releases(ret, x) __releases(x) __cond_acquires_impl_not_##ret(x) + /** * __acquire() - function to acquire context lock exclusively * @x: context lock instance pointer -- cgit v1.2.3 From 5c4326231cde36fd5e90c41e403df9fac6238f4b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Jan 2026 10:06:08 +0100 Subject: locking/mutex: Add context analysis Add compiler context analysis annotations. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260121111213.745353747@infradead.org --- include/linux/mutex.h | 2 +- include/linux/mutex_types.h | 2 +- kernel/locking/Makefile | 2 ++ kernel/locking/mutex.c | 33 ++++++++++++++++++++++++++++----- kernel/locking/mutex.h | 1 + kernel/locking/ww_mutex.h | 12 ++++++++++++ 6 files changed, 45 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index c471b129f703..734048c02f4f 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -183,7 +183,7 @@ static inline int __must_check __devm_mutex_init(struct device *dev, struct mute */ #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass) __acquires(lock); -extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock); +extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock) __acquires(lock); extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) __cond_acquires(0, lock); extern int __must_check _mutex_lock_killable(struct mutex *lock, diff --git a/include/linux/mutex_types.h b/include/linux/mutex_types.h index a8f119f81177..24ed599fdda8 100644 --- a/include/linux/mutex_types.h +++ b/include/linux/mutex_types.h @@ -44,7 +44,7 @@ context_lock_struct(mutex) { #ifdef CONFIG_MUTEX_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* Spinner MCS lock */ #endif - struct mutex_waiter *first_waiter; + struct mutex_waiter *first_waiter __guarded_by(&wait_lock); #ifdef CONFIG_DEBUG_MUTEXES void *magic; #endif diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index a114949eeed5..264447d606a6 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -3,6 +3,8 @@ # and is generally not a function of system call inputs. KCOV_INSTRUMENT := n +CONTEXT_ANALYSIS_mutex.o := y + obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o # Avoid recursion lockdep -> sanitizer -> ... -> lockdep & improve performance. diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 95f1822122a1..427187ff02db 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -46,8 +46,9 @@ static void __mutex_init_generic(struct mutex *lock) { atomic_long_set(&lock->owner, 0); - raw_spin_lock_init(&lock->wait_lock); - lock->first_waiter = NULL; + scoped_guard (raw_spinlock_init, &lock->wait_lock) { + lock->first_waiter = NULL; + } #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); #endif @@ -150,6 +151,7 @@ EXPORT_SYMBOL(mutex_init_generic); * follow with a __mutex_trylock() before failing. */ static __always_inline bool __mutex_trylock_fast(struct mutex *lock) + __cond_acquires(true, lock) { unsigned long curr = (unsigned long)current; unsigned long zero = 0UL; @@ -163,6 +165,7 @@ static __always_inline bool __mutex_trylock_fast(struct mutex *lock) } static __always_inline bool __mutex_unlock_fast(struct mutex *lock) + __cond_releases(true, lock) { unsigned long curr = (unsigned long)current; @@ -201,6 +204,7 @@ static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag) static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *first) + __must_hold(&lock->wait_lock) { hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX); debug_mutex_add_waiter(lock, waiter, current); @@ -219,6 +223,7 @@ __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, static void __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) + __must_hold(&lock->wait_lock) { if (list_empty(&waiter->list)) { __mutex_clear_flag(lock, MUTEX_FLAGS); @@ -268,7 +273,8 @@ static void __mutex_handoff(struct mutex *lock, struct task_struct *task) * We also put the fastpath first in the kernel image, to make sure the * branch is predicted by the CPU as default-untaken. */ -static void __sched __mutex_lock_slowpath(struct mutex *lock); +static void __sched __mutex_lock_slowpath(struct mutex *lock) + __acquires(lock); /** * mutex_lock - acquire the mutex @@ -349,7 +355,7 @@ bool ww_mutex_spin_on_owner(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, * Similarly, stop spinning if we are no longer the * first waiter. */ - if (waiter && lock->first_waiter != waiter) + if (waiter && data_race(lock->first_waiter != waiter)) return false; return true; @@ -534,7 +540,8 @@ mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx, } #endif -static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip); +static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) + __releases(lock); /** * mutex_unlock - release the mutex @@ -574,6 +581,7 @@ EXPORT_SYMBOL(mutex_unlock); * of a unlocked mutex is not allowed. */ void __sched ww_mutex_unlock(struct ww_mutex *lock) + __no_context_analysis { __ww_mutex_unlock(lock); mutex_unlock(&lock->base); @@ -587,6 +595,7 @@ static __always_inline int __sched __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip, struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) + __cond_acquires(0, lock) { DEFINE_WAKE_Q(wake_q); struct mutex_waiter waiter; @@ -780,6 +789,7 @@ err_early_kill: static int __sched __mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip) + __cond_acquires(0, lock) { return __mutex_lock_common(lock, state, subclass, nest_lock, ip, NULL, false); } @@ -787,6 +797,7 @@ __mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, static int __sched __ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, unsigned long ip, struct ww_acquire_ctx *ww_ctx) + __cond_acquires(0, lock) { return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true); } @@ -834,6 +845,7 @@ void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { __mutex_lock(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); + __acquire(lock); } EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -842,6 +854,7 @@ void __sched _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) { __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); + __acquire(lock); } EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); @@ -870,12 +883,14 @@ mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) token = io_schedule_prepare(); __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_, NULL, 0); + __acquire(lock); io_schedule_finish(token); } EXPORT_SYMBOL_GPL(mutex_lock_io_nested); static inline int ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) + __cond_releases(nonzero, lock) { #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH unsigned tmp; @@ -937,6 +952,7 @@ EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); * Release the lock, slowpath: */ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) + __releases(lock) { struct task_struct *next = NULL; struct mutex_waiter *waiter; @@ -945,6 +961,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne unsigned long flags; mutex_release(&lock->dep_map, ip); + __release(lock); /* * Release the lock before (potentially) taking the spinlock such that @@ -1066,24 +1083,29 @@ EXPORT_SYMBOL_GPL(mutex_lock_io); static noinline void __sched __mutex_lock_slowpath(struct mutex *lock) + __acquires(lock) { __mutex_lock(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); + __acquire(lock); } static noinline int __sched __mutex_lock_killable_slowpath(struct mutex *lock) + __cond_acquires(0, lock) { return __mutex_lock(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); } static noinline int __sched __mutex_lock_interruptible_slowpath(struct mutex *lock) + __cond_acquires(0, lock) { return __mutex_lock(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); } static noinline int __sched __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) + __cond_acquires(0, lock) { return __ww_mutex_lock(&lock->base, TASK_UNINTERRUPTIBLE, 0, _RET_IP_, ctx); @@ -1092,6 +1114,7 @@ __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) static noinline int __sched __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) + __cond_acquires(0, lock) { return __ww_mutex_lock(&lock->base, TASK_INTERRUPTIBLE, 0, _RET_IP_, ctx); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 9ad4da8cea00..b94ef40c1f48 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -7,6 +7,7 @@ * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar */ #ifndef CONFIG_PREEMPT_RT +#include /* * This is the control structure for tasks blocked on mutex, which resides * on the blocked task's kernel stack: diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index a0847e91ae04..c50ea5dd3c44 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -7,12 +7,14 @@ static inline struct mutex_waiter * __ww_waiter_first(struct mutex *lock) + __must_hold(&lock->wait_lock) { return lock->first_waiter; } static inline struct mutex_waiter * __ww_waiter_next(struct mutex *lock, struct mutex_waiter *w) + __must_hold(&lock->wait_lock) { w = list_next_entry(w, list); if (lock->first_waiter == w) @@ -23,6 +25,7 @@ __ww_waiter_next(struct mutex *lock, struct mutex_waiter *w) static inline struct mutex_waiter * __ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w) + __must_hold(&lock->wait_lock) { w = list_prev_entry(w, list); if (lock->first_waiter == w) @@ -33,6 +36,7 @@ __ww_waiter_prev(struct mutex *lock, struct mutex_waiter *w) static inline struct mutex_waiter * __ww_waiter_last(struct mutex *lock) + __must_hold(&lock->wait_lock) { struct mutex_waiter *w = lock->first_waiter; @@ -43,6 +47,7 @@ __ww_waiter_last(struct mutex *lock) static inline void __ww_waiter_add(struct mutex *lock, struct mutex_waiter *waiter, struct mutex_waiter *pos) + __must_hold(&lock->wait_lock) { __mutex_add_waiter(lock, waiter, pos); } @@ -60,16 +65,19 @@ __ww_mutex_has_waiters(struct mutex *lock) } static inline void lock_wait_lock(struct mutex *lock, unsigned long *flags) + __acquires(&lock->wait_lock) { raw_spin_lock_irqsave(&lock->wait_lock, *flags); } static inline void unlock_wait_lock(struct mutex *lock, unsigned long *flags) + __releases(&lock->wait_lock) { raw_spin_unlock_irqrestore(&lock->wait_lock, *flags); } static inline void lockdep_assert_wait_lock_held(struct mutex *lock) + __must_hold(&lock->wait_lock) { lockdep_assert_held(&lock->wait_lock); } @@ -296,6 +304,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct ww_acquire_ctx *hold_ctx, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { struct task_struct *owner = __ww_mutex_owner(lock); @@ -360,6 +369,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, static void __ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { struct MUTEX_WAITER *cur; @@ -453,6 +463,7 @@ __ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) static inline int __ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter, struct ww_acquire_ctx *ctx) + __must_hold(&lock->wait_lock) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); @@ -503,6 +514,7 @@ __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { struct MUTEX_WAITER *cur, *pos = NULL; bool is_wait_die; -- cgit v1.2.3 From 90bb681dcdf7e69c90b56a18f06c0389a0810b92 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Jan 2026 18:17:50 +0100 Subject: locking/rtmutex: Add context analysis Add compiler context analysis annotations. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260121111213.851599178@infradead.org --- include/linux/rtmutex.h | 8 ++++---- kernel/locking/Makefile | 2 ++ kernel/locking/rtmutex.c | 18 +++++++++++++++++- kernel/locking/rtmutex_api.c | 2 ++ kernel/locking/rtmutex_common.h | 27 +++++++++++++++++++-------- kernel/locking/ww_mutex.h | 20 +++++++++++++++----- kernel/locking/ww_rt_mutex.c | 1 + scripts/context-analysis-suppression.txt | 1 + 8 files changed, 61 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index ede4c6bf6f22..78e7e588817c 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -22,8 +22,8 @@ extern int max_lock_depth; struct rt_mutex_base { raw_spinlock_t wait_lock; - struct rb_root_cached waiters; - struct task_struct *owner; + struct rb_root_cached waiters __guarded_by(&wait_lock); + struct task_struct *owner __guarded_by(&wait_lock); }; #define __RT_MUTEX_BASE_INITIALIZER(rtbasename) \ @@ -41,7 +41,7 @@ struct rt_mutex_base { */ static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock) { - return READ_ONCE(lock->owner) != NULL; + return data_race(READ_ONCE(lock->owner) != NULL); } #ifdef CONFIG_RT_MUTEXES @@ -49,7 +49,7 @@ static inline bool rt_mutex_base_is_locked(struct rt_mutex_base *lock) static inline struct task_struct *rt_mutex_owner(struct rt_mutex_base *lock) { - unsigned long owner = (unsigned long) READ_ONCE(lock->owner); + unsigned long owner = (unsigned long) data_race(READ_ONCE(lock->owner)); return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); } diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 264447d606a6..0c07de79388c 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -4,6 +4,8 @@ KCOV_INSTRUMENT := n CONTEXT_ANALYSIS_mutex.o := y +CONTEXT_ANALYSIS_rtmutex_api.o := y +CONTEXT_ANALYSIS_ww_rt_mutex.o := y obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index c80902eacd79..ccaba6148b61 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -94,6 +94,7 @@ static inline int __ww_mutex_check_kill(struct rt_mutex *lock, static __always_inline struct task_struct * rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner) + __must_hold(&lock->wait_lock) { unsigned long val = (unsigned long)owner; @@ -105,6 +106,7 @@ rt_mutex_owner_encode(struct rt_mutex_base *lock, struct task_struct *owner) static __always_inline void rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner) + __must_hold(&lock->wait_lock) { /* * lock->wait_lock is held but explicit acquire semantics are needed @@ -114,12 +116,14 @@ rt_mutex_set_owner(struct rt_mutex_base *lock, struct task_struct *owner) } static __always_inline void rt_mutex_clear_owner(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { /* lock->wait_lock is held so the unlock provides release semantics. */ WRITE_ONCE(lock->owner, rt_mutex_owner_encode(lock, NULL)); } static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); @@ -127,6 +131,7 @@ static __always_inline void clear_rt_mutex_waiters(struct rt_mutex_base *lock) static __always_inline void fixup_rt_mutex_waiters(struct rt_mutex_base *lock, bool acquire_lock) + __must_hold(&lock->wait_lock) { unsigned long owner, *p = (unsigned long *) &lock->owner; @@ -328,6 +333,7 @@ static __always_inline bool rt_mutex_cmpxchg_release(struct rt_mutex_base *lock, } static __always_inline void mark_rt_mutex_waiters(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { lock->owner = (struct task_struct *) ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); @@ -1206,6 +1212,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, struct ww_acquire_ctx *ww_ctx, enum rtmutex_chainwalk chwalk, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex_waiter *top_waiter = waiter; @@ -1249,6 +1256,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, /* Check whether the waiter should back out immediately */ rtm = container_of(lock, struct rt_mutex, rtmutex); + __assume_ctx_lock(&rtm->rtmutex.wait_lock); res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q); if (res) { raw_spin_lock(&task->pi_lock); @@ -1356,6 +1364,7 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, } static int __sched __rt_mutex_slowtrylock(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { int ret = try_to_take_rt_mutex(lock, current, NULL); @@ -1505,7 +1514,7 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, * - the VCPU on which owner runs is preempted */ if (!owner_on_cpu(owner) || need_resched() || - !rt_mutex_waiter_is_top_waiter(lock, waiter)) { + !data_race(rt_mutex_waiter_is_top_waiter(lock, waiter))) { res = false; break; } @@ -1538,6 +1547,7 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, */ static void __sched remove_waiter(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) + __must_hold(&lock->wait_lock) { bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); @@ -1613,6 +1623,8 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, struct task_struct *owner; int ret = 0; + __assume_ctx_lock(&rtm->rtmutex.wait_lock); + lockevent_inc(rtmutex_slow_block); for (;;) { /* Try to acquire the lock: */ @@ -1658,6 +1670,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock, struct rt_mutex_base *lock, struct rt_mutex_waiter *w) + __must_hold(&lock->wait_lock) { /* * If the result is not -EDEADLOCK or the caller requested @@ -1694,11 +1707,13 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, enum rtmutex_chainwalk chwalk, struct rt_mutex_waiter *waiter, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); struct ww_mutex *ww = ww_container_of(rtm); int ret; + __assume_ctx_lock(&rtm->rtmutex.wait_lock); lockdep_assert_held(&lock->wait_lock); lockevent_inc(rtmutex_slowlock); @@ -1750,6 +1765,7 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, struct ww_acquire_ctx *ww_ctx, unsigned int state, struct wake_q_head *wake_q) + __must_hold(&lock->wait_lock) { struct rt_mutex_waiter waiter; int ret; diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 59dbd29cb219..124219aea46e 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -526,6 +526,7 @@ static __always_inline int __mutex_lock_common(struct mutex *lock, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip) + __acquires(lock) __no_context_analysis { int ret; @@ -647,6 +648,7 @@ EXPORT_SYMBOL(mutex_trylock); #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ void __sched mutex_unlock(struct mutex *lock) + __releases(lock) __no_context_analysis { mutex_release(&lock->dep_map, _RET_IP_); __rt_mutex_unlock(&lock->rtmutex); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index cf6ddd1b23a2..c38b7bdea7b3 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -79,12 +79,18 @@ struct rt_wake_q_head { * PI-futex support (proxy locking functions, etc.): */ extern void rt_mutex_init_proxy_locked(struct rt_mutex_base *lock, - struct task_struct *proxy_owner); -extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock); + struct task_struct *proxy_owner) + __must_hold(&lock->wait_lock); + +extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock); + extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task, - struct wake_q_head *); + struct wake_q_head *) + __must_hold(&lock->wait_lock); + extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); @@ -94,8 +100,9 @@ extern int rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter); -extern int rt_mutex_futex_trylock(struct rt_mutex_base *l); -extern int __rt_mutex_futex_trylock(struct rt_mutex_base *l); +extern int rt_mutex_futex_trylock(struct rt_mutex_base *lock); +extern int __rt_mutex_futex_trylock(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock); extern void rt_mutex_futex_unlock(struct rt_mutex_base *lock); extern bool __rt_mutex_futex_unlock(struct rt_mutex_base *lock, @@ -109,6 +116,7 @@ extern void rt_mutex_postunlock(struct rt_wake_q_head *wqh); */ #ifdef CONFIG_RT_MUTEXES static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } @@ -120,6 +128,7 @@ static inline int rt_mutex_has_waiters(struct rt_mutex_base *lock) */ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) + __must_hold(&lock->wait_lock) { struct rb_node *leftmost = rb_first_cached(&lock->waiters); @@ -127,6 +136,7 @@ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock, } static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock) + __must_hold(&lock->wait_lock) { struct rb_node *leftmost = rb_first_cached(&lock->waiters); struct rt_mutex_waiter *w = NULL; @@ -170,9 +180,10 @@ enum rtmutex_chainwalk { static inline void __rt_mutex_base_init(struct rt_mutex_base *lock) { - raw_spin_lock_init(&lock->wait_lock); - lock->waiters = RB_ROOT_CACHED; - lock->owner = NULL; + scoped_guard (raw_spinlock_init, &lock->wait_lock) { + lock->waiters = RB_ROOT_CACHED; + lock->owner = NULL; + } } /* Debug functions */ diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index c50ea5dd3c44..b1834ab7e782 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -4,6 +4,7 @@ #define MUTEX mutex #define MUTEX_WAITER mutex_waiter +#define WAIT_LOCK wait_lock static inline struct mutex_waiter * __ww_waiter_first(struct mutex *lock) @@ -86,9 +87,11 @@ static inline void lockdep_assert_wait_lock_held(struct mutex *lock) #define MUTEX rt_mutex #define MUTEX_WAITER rt_mutex_waiter +#define WAIT_LOCK rtmutex.wait_lock static inline struct rt_mutex_waiter * __ww_waiter_first(struct rt_mutex *lock) + __must_hold(&lock->rtmutex.wait_lock) { struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root); if (!n) @@ -116,6 +119,7 @@ __ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w) static inline struct rt_mutex_waiter * __ww_waiter_last(struct rt_mutex *lock) + __must_hold(&lock->rtmutex.wait_lock) { struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root); if (!n) @@ -137,21 +141,25 @@ __ww_mutex_owner(struct rt_mutex *lock) static inline bool __ww_mutex_has_waiters(struct rt_mutex *lock) + __must_hold(&lock->rtmutex.wait_lock) { return rt_mutex_has_waiters(&lock->rtmutex); } static inline void lock_wait_lock(struct rt_mutex *lock, unsigned long *flags) + __acquires(&lock->rtmutex.wait_lock) { raw_spin_lock_irqsave(&lock->rtmutex.wait_lock, *flags); } static inline void unlock_wait_lock(struct rt_mutex *lock, unsigned long *flags) + __releases(&lock->rtmutex.wait_lock) { raw_spin_unlock_irqrestore(&lock->rtmutex.wait_lock, *flags); } static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock) + __must_hold(&lock->rtmutex.wait_lock) { lockdep_assert_held(&lock->rtmutex.wait_lock); } @@ -304,7 +312,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct ww_acquire_ctx *hold_ctx, struct wake_q_head *wake_q) - __must_hold(&lock->wait_lock) + __must_hold(&lock->WAIT_LOCK) { struct task_struct *owner = __ww_mutex_owner(lock); @@ -369,7 +377,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, static void __ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q) - __must_hold(&lock->wait_lock) + __must_hold(&lock->WAIT_LOCK) { struct MUTEX_WAITER *cur; @@ -396,6 +404,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { DEFINE_WAKE_Q(wake_q); unsigned long flags; + bool has_waiters; ww_mutex_lock_acquired(lock, ctx); @@ -417,7 +426,8 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) * __ww_mutex_add_waiter() and makes sure we either observe ww->ctx * and/or !empty list. */ - if (likely(!__ww_mutex_has_waiters(&lock->base))) + has_waiters = data_race(__ww_mutex_has_waiters(&lock->base)); + if (likely(!has_waiters)) return; /* @@ -463,7 +473,7 @@ __ww_mutex_kill(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) static inline int __ww_mutex_check_kill(struct MUTEX *lock, struct MUTEX_WAITER *waiter, struct ww_acquire_ctx *ctx) - __must_hold(&lock->wait_lock) + __must_hold(&lock->WAIT_LOCK) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); @@ -514,7 +524,7 @@ __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q) - __must_hold(&lock->wait_lock) + __must_hold(&lock->WAIT_LOCK) { struct MUTEX_WAITER *cur, *pos = NULL; bool is_wait_die; diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c index c7196de838ed..e07fb3b96bc3 100644 --- a/kernel/locking/ww_rt_mutex.c +++ b/kernel/locking/ww_rt_mutex.c @@ -90,6 +90,7 @@ ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) EXPORT_SYMBOL(ww_mutex_lock_interruptible); void __sched ww_mutex_unlock(struct ww_mutex *lock) + __no_context_analysis { struct rt_mutex *rtm = &lock->base; diff --git a/scripts/context-analysis-suppression.txt b/scripts/context-analysis-suppression.txt index fd8951d06706..1c51b6153f08 100644 --- a/scripts/context-analysis-suppression.txt +++ b/scripts/context-analysis-suppression.txt @@ -24,6 +24,7 @@ src:*include/linux/mutex*.h=emit src:*include/linux/rcupdate.h=emit src:*include/linux/refcount.h=emit src:*include/linux/rhashtable.h=emit +src:*include/linux/rtmutex*.h=emit src:*include/linux/rwlock*.h=emit src:*include/linux/rwsem.h=emit src:*include/linux/sched*=emit -- cgit v1.2.3 From 739690915ce1f017223ef4e6f3cc966ccfa3c861 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 6 Mar 2026 10:43:56 +0100 Subject: locking/rwsem: Add context analysis Add compiler context analysis annotations. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260306101417.GT1282955@noisy.programming.kicks-ass.net --- include/linux/rwsem.h | 4 ++-- kernel/locking/Makefile | 1 + kernel/locking/rwbase_rt.c | 1 + kernel/locking/rwsem.c | 27 ++++++++++++++++++++++++--- 4 files changed, 28 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index e7829531c4ba..6a1a7bae5f81 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -57,7 +57,7 @@ context_lock_struct(rw_semaphore) { struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif raw_spinlock_t wait_lock; - struct rwsem_waiter *first_waiter; + struct rwsem_waiter *first_waiter __guarded_by(&wait_lock); #ifdef CONFIG_DEBUG_RWSEMS void *magic; #endif @@ -131,7 +131,7 @@ do { \ */ static inline bool rwsem_is_contended(struct rw_semaphore *sem) { - return sem->first_waiter != NULL; + return data_race(sem->first_waiter != NULL); } #if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER) diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 0c07de79388c..cee1901d4cff 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -6,6 +6,7 @@ KCOV_INSTRUMENT := n CONTEXT_ANALYSIS_mutex.o := y CONTEXT_ANALYSIS_rtmutex_api.o := y CONTEXT_ANALYSIS_ww_rt_mutex.o := y +CONTEXT_ANALYSIS_rwsem.o := y obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 9f4322c07486..82e078c0665a 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -186,6 +186,7 @@ static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb, static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias, unsigned long flags) + __releases(&rwb->rtmutex.wait_lock) { struct rt_mutex_base *rtm = &rwb->rtmutex; diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e66f37ebc6f6..ba4cb74de064 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -72,7 +72,7 @@ #c, atomic_long_read(&(sem)->count), \ (unsigned long) sem->magic, \ atomic_long_read(&(sem)->owner), (long)current, \ - (sem)->first_waiter ? "" : "not ")) \ + rwsem_is_contended(sem) ? "" : "not ")) \ debug_locks_off(); \ } while (0) #else @@ -320,9 +320,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, sem->magic = sem; #endif atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); - raw_spin_lock_init(&sem->wait_lock); - sem->first_waiter = NULL; atomic_long_set(&sem->owner, 0L); + scoped_guard (raw_spinlock_init, &sem->wait_lock) { + sem->first_waiter = NULL; + } #ifdef CONFIG_RWSEM_SPIN_ON_OWNER osq_lock_init(&sem->osq); #endif @@ -365,6 +366,7 @@ enum rwsem_wake_type { static inline bool __rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) + __must_hold(&sem->wait_lock) { if (list_empty(&waiter->list)) { sem->first_waiter = NULL; @@ -401,6 +403,7 @@ rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) static inline struct rwsem_waiter *next_waiter(const struct rw_semaphore *sem, const struct rwsem_waiter *waiter) + __must_hold(&sem->wait_lock) { struct rwsem_waiter *next = list_first_entry(&waiter->list, struct rwsem_waiter, list); @@ -621,6 +624,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter, */ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, struct rwsem_waiter *waiter) + __must_hold(&sem->wait_lock) { struct rwsem_waiter *first = sem->first_waiter; long count, new; @@ -1558,6 +1562,7 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) * lock for reading */ void __sched down_read(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); @@ -1567,6 +1572,7 @@ void __sched down_read(struct rw_semaphore *sem) EXPORT_SYMBOL(down_read); int __sched down_read_interruptible(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); @@ -1581,6 +1587,7 @@ int __sched down_read_interruptible(struct rw_semaphore *sem) EXPORT_SYMBOL(down_read_interruptible); int __sched down_read_killable(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); @@ -1598,6 +1605,7 @@ EXPORT_SYMBOL(down_read_killable); * trylock for reading -- returns 1 if successful, 0 if contention */ int down_read_trylock(struct rw_semaphore *sem) + __no_context_analysis { int ret = __down_read_trylock(sem); @@ -1611,6 +1619,7 @@ EXPORT_SYMBOL(down_read_trylock); * lock for writing */ void __sched down_write(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); @@ -1622,6 +1631,7 @@ EXPORT_SYMBOL(down_write); * lock for writing */ int __sched down_write_killable(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); @@ -1640,6 +1650,7 @@ EXPORT_SYMBOL(down_write_killable); * trylock for writing -- returns 1 if successful, 0 if contention */ int down_write_trylock(struct rw_semaphore *sem) + __no_context_analysis { int ret = __down_write_trylock(sem); @@ -1654,6 +1665,7 @@ EXPORT_SYMBOL(down_write_trylock); * release a read lock */ void up_read(struct rw_semaphore *sem) + __no_context_analysis { rwsem_release(&sem->dep_map, _RET_IP_); __up_read(sem); @@ -1664,6 +1676,7 @@ EXPORT_SYMBOL(up_read); * release a write lock */ void up_write(struct rw_semaphore *sem) + __no_context_analysis { rwsem_release(&sem->dep_map, _RET_IP_); __up_write(sem); @@ -1674,6 +1687,7 @@ EXPORT_SYMBOL(up_write); * downgrade write lock to read lock */ void downgrade_write(struct rw_semaphore *sem) + __no_context_analysis { lock_downgrade(&sem->dep_map, _RET_IP_); __downgrade_write(sem); @@ -1683,6 +1697,7 @@ EXPORT_SYMBOL(downgrade_write); #ifdef CONFIG_DEBUG_LOCK_ALLOC void down_read_nested(struct rw_semaphore *sem, int subclass) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); @@ -1691,6 +1706,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_read_nested); int down_read_killable_nested(struct rw_semaphore *sem, int subclass) + __no_context_analysis { might_sleep(); rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); @@ -1705,6 +1721,7 @@ int down_read_killable_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_read_killable_nested); void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) + __no_context_analysis { might_sleep(); rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); @@ -1713,6 +1730,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) EXPORT_SYMBOL(_down_write_nest_lock); void down_read_non_owner(struct rw_semaphore *sem) + __no_context_analysis { might_sleep(); __down_read(sem); @@ -1727,6 +1745,7 @@ void down_read_non_owner(struct rw_semaphore *sem) EXPORT_SYMBOL(down_read_non_owner); void down_write_nested(struct rw_semaphore *sem, int subclass) + __no_context_analysis { might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); @@ -1735,6 +1754,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_nested); int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) + __no_context_analysis { might_sleep(); rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); @@ -1750,6 +1770,7 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) + __no_context_analysis { DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); __up_read(sem); -- cgit v1.2.3 From 27ab4f1e4909a674dfd03058fb9802cae2343a36 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Thu, 26 Feb 2026 12:25:54 +0530 Subject: soundwire: amd: refactor bandwidth calculation logic For current platforms(ACP6.3/ACP7.0/ACP7.1/ACP7.2), AMD SoundWire manager doesn't have banked registers for data port programming on Manager's side. Need to use fixed block offsets, hstart & hstop for manager ports. Earlier amd manager driver has support for 12 MHz as a bus clock frequency where frame rate is 48000 and number of bits is 500, frame shape as 50 x 10 with fixed block offset mapping based on port number. Got a new requirement to support 6 MHz as a bus clock frequency. For 6 MHz bus clock frequency amd manager driver needs to support two different frame shapes i.e number of bits as 250 with frame rate as 48000 and frame shape as 125 x 2 and for the second combination number of bits as 500 where frame rate is 24000 and frame shape is 50 x 10. Few SoundWire peripherals doesn't support 125 x 2 as a frame shape for 6 MHz bus clock frequency. They have explicit requirement for the frame shape. In this scenario, amd manager driver needs to use 50 x 10 as a frame shape where frame rate is 24000. Based on the platform and SoundWire topology for 6Mhz support frame shape will be decided which is part of SoundWire manager DisCo tables. For current platforms, amd manager driver supports only two bus clock frequencies(12 MHz & 6 MHz). Refactor bandwidth logic to support different bus clock frequencies. Signed-off-by: Vijendar Mukunda Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20260226065638.1251771-3-Vijendar.Mukunda@amd.com Signed-off-by: Vinod Koul --- drivers/soundwire/amd_manager.c | 57 ++++++++++++++++++++++++++++++++++++--- include/linux/soundwire/sdw_amd.h | 4 +++ 2 files changed, 57 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/soundwire/amd_manager.c b/drivers/soundwire/amd_manager.c index 61df5cecdccc..a3316efdf8ac 100644 --- a/drivers/soundwire/amd_manager.c +++ b/drivers/soundwire/amd_manager.c @@ -467,12 +467,16 @@ static u32 amd_sdw_read_ping_status(struct sdw_bus *bus) static int amd_sdw_compute_params(struct sdw_bus *bus, struct sdw_stream_runtime *stream) { + struct amd_sdw_manager *amd_manager = to_amd_sdw(bus); struct sdw_transport_data t_data = {0}; struct sdw_master_runtime *m_rt; struct sdw_port_runtime *p_rt; struct sdw_bus_params *b_params = &bus->params; int port_bo, hstart, hstop, sample_int; - unsigned int rate, bps; + unsigned int rate, bps, channels; + unsigned int stream_slot_size, max_slots; + static unsigned int next_offset[AMD_SDW_MAX_MANAGER_COUNT] = {1}; + unsigned int inst_id = amd_manager->instance; port_bo = 0; hstart = 1; @@ -483,11 +487,51 @@ static int amd_sdw_compute_params(struct sdw_bus *bus, struct sdw_stream_runtime list_for_each_entry(m_rt, &bus->m_rt_list, bus_node) { rate = m_rt->stream->params.rate; bps = m_rt->stream->params.bps; + channels = m_rt->stream->params.ch_count; sample_int = (bus->params.curr_dr_freq / rate); + + /* Compute slots required for this stream dynamically */ + stream_slot_size = bps * channels; + list_for_each_entry(p_rt, &m_rt->port_list, port_node) { - port_bo = (p_rt->num * 64) + 1; - dev_dbg(bus->dev, "p_rt->num=%d hstart=%d hstop=%d port_bo=%d\n", - p_rt->num, hstart, hstop, port_bo); + if (p_rt->num >= amd_manager->max_ports) { + dev_err(bus->dev, "Port %d exceeds max ports %d\n", + p_rt->num, amd_manager->max_ports); + return -EINVAL; + } + + if (!amd_manager->port_offset_map[p_rt->num]) { + /* + * port block offset calculation for 6MHz bus clock frequency with + * different frame sizes 50 x 10 and 125 x 2 + */ + if (bus->params.curr_dr_freq == 12000000) { + max_slots = bus->params.row * (bus->params.col - 1); + if (next_offset[inst_id] + stream_slot_size <= + (max_slots - 1)) { + amd_manager->port_offset_map[p_rt->num] = + next_offset[inst_id]; + next_offset[inst_id] += stream_slot_size; + } else { + dev_err(bus->dev, + "No space for port %d\n", p_rt->num); + return -ENOMEM; + } + } else { + /* + * port block offset calculation for 12MHz bus clock + * frequency + */ + amd_manager->port_offset_map[p_rt->num] = + (p_rt->num * 64) + 1; + } + } + port_bo = amd_manager->port_offset_map[p_rt->num]; + dev_dbg(bus->dev, + "Port=%d hstart=%d hstop=%d port_bo=%d slots=%d max_ports=%d\n", + p_rt->num, hstart, hstop, port_bo, stream_slot_size, + amd_manager->max_ports); + sdw_fill_xport_params(&p_rt->transport_params, p_rt->num, false, SDW_BLK_GRP_CNT_1, sample_int, port_bo, port_bo >> 8, hstart, hstop, @@ -1079,6 +1123,11 @@ static int amd_sdw_manager_probe(struct platform_device *pdev) default: return -EINVAL; } + amd_manager->max_ports = amd_manager->num_dout_ports + amd_manager->num_din_ports; + amd_manager->port_offset_map = devm_kcalloc(dev, amd_manager->max_ports, + sizeof(int), GFP_KERNEL); + if (!amd_manager->port_offset_map) + return -ENOMEM; prop = &amd_manager->bus.prop; prop->mclk_freq = AMD_SDW_BUS_BASE_FREQ; diff --git a/include/linux/soundwire/sdw_amd.h b/include/linux/soundwire/sdw_amd.h index fe31773d5210..470360a2723c 100644 --- a/include/linux/soundwire/sdw_amd.h +++ b/include/linux/soundwire/sdw_amd.h @@ -66,8 +66,10 @@ struct sdw_amd_dai_runtime { * @status: peripheral devices status array * @num_din_ports: number of input ports * @num_dout_ports: number of output ports + * @max_ports: total number of input ports and output ports * @cols_index: Column index in frame shape * @rows_index: Rows index in frame shape + * @port_offset_map: dynamic array to map port block offset * @instance: SoundWire manager instance * @quirks: SoundWire manager quirks * @wake_en_mask: wake enable mask per SoundWire manager @@ -92,10 +94,12 @@ struct amd_sdw_manager { int num_din_ports; int num_dout_ports; + int max_ports; int cols_index; int rows_index; + int *port_offset_map; u32 instance; u32 quirks; u32 wake_en_mask; -- cgit v1.2.3 From 493740d790cce709d285cd1022d16d05439b7d5b Mon Sep 17 00:00:00 2001 From: Arunpravin Paneer Selvam Date: Fri, 6 Mar 2026 11:31:54 +0530 Subject: drm/buddy: Improve offset-aligned allocation handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Large alignment requests previously forced the buddy allocator to search by alignment order, which often caused higher-order free blocks to be split even when a suitably aligned smaller region already existed within them. This led to excessive fragmentation, especially for workloads requesting small sizes with large alignment constraints. This change prioritizes the requested allocation size during the search and uses an augmented RB-tree field (subtree_max_alignment) to efficiently locate free blocks that satisfy both size and offset-alignment requirements. As a result, the allocator can directly select an aligned sub-region without splitting larger blocks unnecessarily. A practical example is the VKCTS test dEQP-VK.memory.allocation.basic.size_8KiB.reverse.count_4000, which repeatedly allocates 8 KiB buffers with a 256 KiB alignment. Previously, such allocations caused large blocks to be split aggressively, despite smaller aligned regions being sufficient. With this change, those aligned regions are reused directly, significantly reducing fragmentation. This improvement is visible in the amdgpu VRAM buddy allocator state (/sys/kernel/debug/dri/1/amdgpu_vram_mm). After the change, higher-order blocks are preserved and the number of low-order fragments is substantially reduced. Before: order- 5 free: 1936 MiB, blocks: 15490 order- 4 free: 967 MiB, blocks: 15486 order- 3 free: 483 MiB, blocks: 15485 order- 2 free: 241 MiB, blocks: 15486 order- 1 free: 241 MiB, blocks: 30948 After: order- 5 free: 493 MiB, blocks: 3941 order- 4 free: 246 MiB, blocks: 3943 order- 3 free: 123 MiB, blocks: 4101 order- 2 free: 61 MiB, blocks: 4101 order- 1 free: 61 MiB, blocks: 8018 By avoiding unnecessary splits, this change improves allocator efficiency and helps maintain larger contiguous free regions under heavy offset-aligned allocation workloads. v2:(Matthew) - Update augmented information along the path to the inserted node. v3: - Move the patch to gpu/buddy.c file. v4:(Matthew) - Use the helper instead of calling _ffs directly - Remove gpu_buddy_block_order(block) >= order check and drop order - Drop !node check as all callers handle this already - Return larger than any other possible alignment for __ffs64(0) - Replace __ffs with __ffs64 v5:(Matthew) - Drop subtree_max_alignment initialization at gpu_block_alloc() Signed-off-by: Arunpravin Paneer Selvam Suggested-by: Christian König Reviewed-by: Matthew Auld Link: https://patch.msgid.link/20260306060155.2114-1-Arunpravin.PaneerSelvam@amd.com --- drivers/gpu/buddy.c | 272 ++++++++++++++++++++++++++++++++++++++-------- include/linux/gpu_buddy.h | 2 + 2 files changed, 229 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/buddy.c b/drivers/gpu/buddy.c index da5a1222f46b..52686672e99f 100644 --- a/drivers/gpu/buddy.c +++ b/drivers/gpu/buddy.c @@ -53,6 +53,25 @@ gpu_buddy_block_is_split(struct gpu_buddy_block *block) return gpu_buddy_block_state(block) == GPU_BUDDY_SPLIT; } +static unsigned int gpu_buddy_block_offset_alignment(struct gpu_buddy_block *block) +{ + u64 offset = gpu_buddy_block_offset(block); + + if (!offset) + /* + * __ffs64(0) is undefined; offset 0 is maximally aligned, so return + * a value greater than any possible alignment. + */ + return 64 + 1; + + return __ffs64(offset); +} + +RB_DECLARE_CALLBACKS_MAX(static, gpu_buddy_augment_cb, + struct gpu_buddy_block, rb, + unsigned int, subtree_max_alignment, + gpu_buddy_block_offset_alignment); + static struct gpu_buddy_block *gpu_block_alloc(struct gpu_buddy *mm, struct gpu_buddy_block *parent, unsigned int order, @@ -106,26 +125,42 @@ static bool rbtree_is_empty(struct rb_root *root) return RB_EMPTY_ROOT(root); } -static bool gpu_buddy_block_offset_less(const struct gpu_buddy_block *block, - const struct gpu_buddy_block *node) -{ - return gpu_buddy_block_offset(block) < gpu_buddy_block_offset(node); -} - -static bool rbtree_block_offset_less(struct rb_node *block, - const struct rb_node *node) -{ - return gpu_buddy_block_offset_less(rbtree_get_free_block(block), - rbtree_get_free_block(node)); -} - static void rbtree_insert(struct gpu_buddy *mm, struct gpu_buddy_block *block, enum gpu_buddy_free_tree tree) { - rb_add(&block->rb, - &mm->free_trees[tree][gpu_buddy_block_order(block)], - rbtree_block_offset_less); + struct rb_node **link, *parent = NULL; + unsigned int block_alignment, order; + struct gpu_buddy_block *node; + struct rb_root *root; + + order = gpu_buddy_block_order(block); + block_alignment = gpu_buddy_block_offset_alignment(block); + + root = &mm->free_trees[tree][order]; + link = &root->rb_node; + + while (*link) { + parent = *link; + node = rbtree_get_free_block(parent); + /* + * Manual augmentation update during insertion traversal. Required + * because rb_insert_augmented() only calls rotate callback during + * rotations. This ensures all ancestors on the insertion path have + * correct subtree_max_alignment values. + */ + if (node->subtree_max_alignment < block_alignment) + node->subtree_max_alignment = block_alignment; + + if (gpu_buddy_block_offset(block) < gpu_buddy_block_offset(node)) + link = &parent->rb_left; + else + link = &parent->rb_right; + } + + block->subtree_max_alignment = block_alignment; + rb_link_node(&block->rb, parent, link); + rb_insert_augmented(&block->rb, root, &gpu_buddy_augment_cb); } static void rbtree_remove(struct gpu_buddy *mm, @@ -138,7 +173,7 @@ static void rbtree_remove(struct gpu_buddy *mm, tree = get_block_tree(block); root = &mm->free_trees[tree][order]; - rb_erase(&block->rb, root); + rb_erase_augmented(&block->rb, root, &gpu_buddy_augment_cb); RB_CLEAR_NODE(&block->rb); } @@ -811,6 +846,127 @@ err_undo: return ERR_PTR(err); } +static bool +gpu_buddy_can_offset_align(u64 size, u64 min_block_size) +{ + return size < min_block_size && is_power_of_2(size); +} + +static bool gpu_buddy_subtree_can_satisfy(struct rb_node *node, + unsigned int alignment) +{ + struct gpu_buddy_block *block; + + block = rbtree_get_free_block(node); + return block->subtree_max_alignment >= alignment; +} + +static struct gpu_buddy_block * +gpu_buddy_find_block_aligned(struct gpu_buddy *mm, + enum gpu_buddy_free_tree tree, + unsigned int order, + unsigned int alignment, + unsigned long flags) +{ + struct rb_root *root = &mm->free_trees[tree][order]; + struct rb_node *rb = root->rb_node; + + while (rb) { + struct gpu_buddy_block *block = rbtree_get_free_block(rb); + struct rb_node *left_node = rb->rb_left, *right_node = rb->rb_right; + + if (right_node) { + if (gpu_buddy_subtree_can_satisfy(right_node, alignment)) { + rb = right_node; + continue; + } + } + + if (gpu_buddy_block_offset_alignment(block) >= alignment) + return block; + + if (left_node) { + if (gpu_buddy_subtree_can_satisfy(left_node, alignment)) { + rb = left_node; + continue; + } + } + + break; + } + + return NULL; +} + +static struct gpu_buddy_block * +gpu_buddy_offset_aligned_allocation(struct gpu_buddy *mm, + u64 size, + u64 min_block_size, + unsigned long flags) +{ + struct gpu_buddy_block *block = NULL; + unsigned int order, tmp, alignment; + struct gpu_buddy_block *buddy; + enum gpu_buddy_free_tree tree; + unsigned long pages; + int err; + + alignment = ilog2(min_block_size); + pages = size >> ilog2(mm->chunk_size); + order = fls(pages) - 1; + + tree = (flags & GPU_BUDDY_CLEAR_ALLOCATION) ? + GPU_BUDDY_CLEAR_TREE : GPU_BUDDY_DIRTY_TREE; + + for (tmp = order; tmp <= mm->max_order; ++tmp) { + block = gpu_buddy_find_block_aligned(mm, tree, tmp, + alignment, flags); + if (!block) { + tree = (tree == GPU_BUDDY_CLEAR_TREE) ? + GPU_BUDDY_DIRTY_TREE : GPU_BUDDY_CLEAR_TREE; + block = gpu_buddy_find_block_aligned(mm, tree, tmp, + alignment, flags); + } + + if (block) + break; + } + + if (!block) + return ERR_PTR(-ENOSPC); + + while (gpu_buddy_block_order(block) > order) { + struct gpu_buddy_block *left, *right; + + err = split_block(mm, block); + if (unlikely(err)) + goto err_undo; + + left = block->left; + right = block->right; + + if (gpu_buddy_block_offset_alignment(right) >= alignment) + block = right; + else + block = left; + } + + return block; + +err_undo: + /* + * We really don't want to leave around a bunch of split blocks, since + * bigger is better, so make sure we merge everything back before we + * free the allocated blocks. + */ + buddy = __get_buddy(block); + if (buddy && + (gpu_buddy_block_is_free(block) && + gpu_buddy_block_is_free(buddy))) + __gpu_buddy_free(mm, block, false); + return ERR_PTR(err); +} + static int __alloc_range(struct gpu_buddy *mm, struct list_head *dfs, u64 start, u64 size, @@ -1080,6 +1236,7 @@ EXPORT_SYMBOL(gpu_buddy_block_trim); static struct gpu_buddy_block * __gpu_buddy_alloc_blocks(struct gpu_buddy *mm, u64 start, u64 end, + u64 size, u64 min_block_size, unsigned int order, unsigned long flags) { @@ -1087,6 +1244,11 @@ __gpu_buddy_alloc_blocks(struct gpu_buddy *mm, /* Allocate traversing within the range */ return __gpu_buddy_alloc_range_bias(mm, start, end, order, flags); + else if (size < min_block_size) + /* Allocate from an offset-aligned region without size rounding */ + return gpu_buddy_offset_aligned_allocation(mm, size, + min_block_size, + flags); else /* Allocate from freetree */ return alloc_from_freetree(mm, order, flags); @@ -1158,8 +1320,11 @@ int gpu_buddy_alloc_blocks(struct gpu_buddy *mm, if (flags & GPU_BUDDY_CONTIGUOUS_ALLOCATION) { size = roundup_pow_of_two(size); min_block_size = size; - /* Align size value to min_block_size */ - } else if (!IS_ALIGNED(size, min_block_size)) { + /* + * Normalize the requested size to min_block_size for regular allocations. + * Offset-aligned allocations intentionally skip size rounding. + */ + } else if (!gpu_buddy_can_offset_align(size, min_block_size)) { size = round_up(size, min_block_size); } @@ -1179,43 +1344,60 @@ int gpu_buddy_alloc_blocks(struct gpu_buddy *mm, do { order = min(order, (unsigned int)fls(pages) - 1); BUG_ON(order > mm->max_order); - BUG_ON(order < min_order); + /* + * Regular allocations must not allocate blocks smaller than min_block_size. + * Offset-aligned allocations deliberately bypass this constraint. + */ + BUG_ON(size >= min_block_size && order < min_order); do { + unsigned int fallback_order; + block = __gpu_buddy_alloc_blocks(mm, start, end, + size, + min_block_size, order, flags); if (!IS_ERR(block)) break; - if (order-- == min_order) { - /* Try allocation through force merge method */ - if (mm->clear_avail && - !__force_merge(mm, start, end, min_order)) { - block = __gpu_buddy_alloc_blocks(mm, start, - end, - min_order, - flags); - if (!IS_ERR(block)) { - order = min_order; - break; - } - } + if (size < min_block_size) { + fallback_order = order; + } else if (order == min_order) { + fallback_order = min_order; + } else { + order--; + continue; + } - /* - * Try contiguous block allocation through - * try harder method. - */ - if (flags & GPU_BUDDY_CONTIGUOUS_ALLOCATION && - !(flags & GPU_BUDDY_RANGE_ALLOCATION)) - return __alloc_contig_try_harder(mm, - original_size, - original_min_size, - blocks); - err = -ENOSPC; - goto err_free; + /* Try allocation through force merge method */ + if (mm->clear_avail && + !__force_merge(mm, start, end, fallback_order)) { + block = __gpu_buddy_alloc_blocks(mm, start, + end, + size, + min_block_size, + fallback_order, + flags); + if (!IS_ERR(block)) { + order = fallback_order; + break; + } } + + /* + * Try contiguous block allocation through + * try harder method. + */ + if (flags & GPU_BUDDY_CONTIGUOUS_ALLOCATION && + !(flags & GPU_BUDDY_RANGE_ALLOCATION)) + return __alloc_contig_try_harder(mm, + original_size, + original_min_size, + blocks); + err = -ENOSPC; + goto err_free; } while (1); mark_allocated(mm, block); diff --git a/include/linux/gpu_buddy.h b/include/linux/gpu_buddy.h index f1fb6eff604a..5fa917ba5450 100644 --- a/include/linux/gpu_buddy.h +++ b/include/linux/gpu_buddy.h @@ -11,6 +11,7 @@ #include #include #include +#include /** * GPU_BUDDY_RANGE_ALLOCATION - Allocate within a specific address range @@ -128,6 +129,7 @@ struct gpu_buddy_block { }; /* private: */ struct list_head tmp_link; + unsigned int subtree_max_alignment; }; /* Order-zero must be at least SZ_4K */ -- cgit v1.2.3 From 5f88899ec7531e1680b1003f32584d7da5922902 Mon Sep 17 00:00:00 2001 From: Nuno Sá Date: Tue, 3 Mar 2026 10:25:00 +0000 Subject: dmaengine: Document cyclic transfer for dmaengine_prep_peripheral_dma_vec() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document that the DMA_PREP_REPEAT flag can be used with the dmaengine_prep_peripheral_dma_vec() to mark a transfer as cyclic similar to dmaengine_prep_dma_cyclic(). Reviewed-by: Frank Li Signed-off-by: Nuno Sá Link: https://patch.msgid.link/20260303-axi-dac-cyclic-support-v2-1-0db27b4be95a@analog.com Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 99efe2b9b4ea..b3d251c9734e 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -996,7 +996,8 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_slave_single( * @vecs: The array of DMA vectors that should be transferred * @nents: The number of DMA vectors in the array * @dir: Specifies the direction of the data transfer - * @flags: DMA engine flags + * @flags: DMA engine flags - DMA_PREP_REPEAT can be used to mark a cyclic + * DMA transfer */ static inline struct dma_async_tx_descriptor *dmaengine_prep_peripheral_dma_vec( struct dma_chan *chan, const struct dma_vec *vecs, size_t nents, -- cgit v1.2.3 From 70fbea9f1a44d80a4c573c225f119022d6e21360 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 17:12:13 -0800 Subject: dmaengine: ti-cppi5: fix all kernel-doc warnings Add missing struct member, function parameter, and enum value descriptions. Add missing function Returns: sections. Use correct function name in kernel-doc to avoid mismatched prototypes. These repair all kernel-doc warnings in ti-cppi5.h: Warning: include/linux/dma/ti-cppi5.h:27 struct member 'pkt_info1' not described in 'cppi5_desc_hdr_t' Warning: include/linux/dma/ti-cppi5.h:27 struct member 'pkt_info2' not described in 'cppi5_desc_hdr_t' Warning: include/linux/dma/ti-cppi5.h:50 struct member 'epib' not described in 'cppi5_host_desc_t' Warning: include/linux/dma/ti-cppi5.h:142 struct member 'epib' not described in 'cppi5_monolithic_desc_t' Warning: include/linux/dma/ti-cppi5.h:413 function parameter 'pkt_len' not described in 'cppi5_hdesc_set_pktlen' Warning: include/linux/dma/ti-cppi5.h:436 function parameter 'ps_flags' not described in 'cppi5_hdesc_set_psflags' Warning: include/linux/dma/ti-cppi5.h:509 function parameter 'hbuf_desc' not described in 'cppi5_hdesc_link_hbdesc' Warning: include/linux/dma/ti-cppi5.h:839 struct member 'dicnt3' not described in 'cppi5_tr_type15_t' Warning: include/linux/dma/ti-cppi5.h:970 function parameter 'desc_hdr' not described in 'cppi5_trdesc_init' Warning: include/linux/dma/ti-cppi5.h:184 No description found for return value of 'cppi5_desc_is_tdcm' Warning: include/linux/dma/ti-cppi5.h:198 No description found for return value of 'cppi5_desc_get_type' Warning: include/linux/dma/ti-cppi5.h:210 No description found for return value of 'cppi5_desc_get_errflags' Warning: include/linux/dma/ti-cppi5.h:448 expecting prototype for cppi5_hdesc_get_errflags(). Prototype was for cppi5_hdesc_get_pkttype() instead Warning: include/linux/dma/ti-cppi5.h:460 expecting prototype for cppi5_hdesc_get_errflags(). Prototype was for cppi5_hdesc_set_pkttype() instead Warning: include/linux/dma/ti-cppi5.h:1053 expecting prototype for cppi5_tr_cflag_set(). Prototype was for cppi5_tr_csf_set() instead Warning: include/linux/dma/ti-cppi5.h:651 Enum value 'CPPI5_TR_TYPE_MAX' not described in enum 'cppi5_tr_types' Warning: include/linux/dma/ti-cppi5.h:676 Enum value 'CPPI5_TR_EVENT_SIZE_MAX' not described in enum 'cppi5_tr_event_size' Warning: include/linux/dma/ti-cppi5.h:693 Enum value 'CPPI5_TR_TRIGGER_MAX' not described in enum 'cppi5_tr_trigger' Warning: include/linux/dma/ti-cppi5.h:714 Enum value 'CPPI5_TR_TRIGGER_TYPE_MAX' not described in enum 'cppi5_tr_trigger_type' Warning: include/linux/dma/ti-cppi5.h:890 Enum value 'CPPI5_TR_RESPONSE_STATUS_MAX' not described in enum 'cppi5_tr_resp_status_type' Warning: include/linux/dma/ti-cppi5.h:906 Enum value 'CPPI5_TR_RESPONSE_STATUS_SUBMISSION_MAX' not described in enum 'cppi5_tr_resp_status_submission' Warning: include/linux/dma/ti-cppi5.h:934 Enum value 'CPPI5_TR_RESPONSE_STATUS_UNSUPPORTED_MAX' not described in enum 'cppi5_tr_resp_status_unsupported' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260301011213.3063688-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/dma/ti-cppi5.h | 53 +++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma/ti-cppi5.h b/include/linux/dma/ti-cppi5.h index c53c0f6e3b1a..3fe19b75ddf7 100644 --- a/include/linux/dma/ti-cppi5.h +++ b/include/linux/dma/ti-cppi5.h @@ -16,8 +16,8 @@ * struct cppi5_desc_hdr_t - Descriptor header, present in all types of * descriptors * @pkt_info0: Packet info word 0 (n/a in Buffer desc) - * @pkt_info0: Packet info word 1 (n/a in Buffer desc) - * @pkt_info0: Packet info word 2 (n/a in Buffer desc) + * @pkt_info1: Packet info word 1 (n/a in Buffer desc) + * @pkt_info2: Packet info word 2 (n/a in Buffer desc) * @src_dst_tag: Packet info word 3 (n/a in Buffer desc) */ struct cppi5_desc_hdr_t { @@ -35,7 +35,7 @@ struct cppi5_desc_hdr_t { * @buf_info1: word 8: Buffer valid data length * @org_buf_len: word 9: Original buffer length * @org_buf_ptr: word 10/11: Original buffer pointer - * @epib[0]: Extended Packet Info Data (optional, 4 words), and/or + * @epib: Extended Packet Info Data (optional, 4 words), and/or * Protocol Specific Data (optional, 0-128 bytes in * multiples of 4), and/or * Other Software Data (0-N bytes, optional) @@ -132,7 +132,7 @@ struct cppi5_desc_epib_t { /** * struct cppi5_monolithic_desc_t - Monolithic-mode packet descriptor * @hdr: Descriptor header - * @epib[0]: Extended Packet Info Data (optional, 4 words), and/or + * @epib: Extended Packet Info Data (optional, 4 words), and/or * Protocol Specific Data (optional, 0-128 bytes in * multiples of 4), and/or * Other Software Data (0-N bytes, optional) @@ -179,7 +179,7 @@ static inline void cppi5_desc_dump(void *desc, u32 size) * cppi5_desc_is_tdcm - check if the paddr indicates Teardown Complete Message * @paddr: Physical address of the packet popped from the ring * - * Returns true if the address indicates TDCM + * Returns: true if the address indicates TDCM */ static inline bool cppi5_desc_is_tdcm(dma_addr_t paddr) { @@ -190,7 +190,7 @@ static inline bool cppi5_desc_is_tdcm(dma_addr_t paddr) * cppi5_desc_get_type - get descriptor type * @desc_hdr: packet descriptor/TR header * - * Returns descriptor type: + * Returns: descriptor type: * CPPI5_INFO0_DESC_TYPE_VAL_HOST * CPPI5_INFO0_DESC_TYPE_VAL_MONO * CPPI5_INFO0_DESC_TYPE_VAL_TR @@ -205,7 +205,7 @@ static inline u32 cppi5_desc_get_type(struct cppi5_desc_hdr_t *desc_hdr) * cppi5_desc_get_errflags - get Error Flags from Desc * @desc_hdr: packet/TR descriptor header * - * Returns Error Flags from Packet/TR Descriptor + * Returns: Error Flags from Packet/TR Descriptor */ static inline u32 cppi5_desc_get_errflags(struct cppi5_desc_hdr_t *desc_hdr) { @@ -307,7 +307,7 @@ static inline void cppi5_desc_set_tags_ids(struct cppi5_desc_hdr_t *desc_hdr, * @psdata_size: PSDATA size * @sw_data_size: SWDATA size * - * Returns required Host Packet Descriptor size + * Returns: required Host Packet Descriptor size * 0 - if PSDATA > CPPI5_INFO0_HDESC_PSDATA_MAX_SIZE */ static inline u32 cppi5_hdesc_calc_size(bool epib, u32 psdata_size, @@ -381,6 +381,8 @@ cppi5_hdesc_update_psdata_size(struct cppi5_host_desc_t *desc, u32 psdata_size) /** * cppi5_hdesc_get_psdata_size - get PSdata size in bytes * @desc: Host packet descriptor + * + * Returns: PSdata size in bytes */ static inline u32 cppi5_hdesc_get_psdata_size(struct cppi5_host_desc_t *desc) { @@ -398,7 +400,7 @@ static inline u32 cppi5_hdesc_get_psdata_size(struct cppi5_host_desc_t *desc) * cppi5_hdesc_get_pktlen - get Packet Length from HDesc * @desc: Host packet descriptor * - * Returns Packet Length from Host Packet Descriptor + * Returns: Packet Length from Host Packet Descriptor */ static inline u32 cppi5_hdesc_get_pktlen(struct cppi5_host_desc_t *desc) { @@ -408,6 +410,7 @@ static inline u32 cppi5_hdesc_get_pktlen(struct cppi5_host_desc_t *desc) /** * cppi5_hdesc_set_pktlen - set Packet Length in HDesc * @desc: Host packet descriptor + * @pkt_len: Packet length to set */ static inline void cppi5_hdesc_set_pktlen(struct cppi5_host_desc_t *desc, u32 pkt_len) @@ -420,7 +423,7 @@ static inline void cppi5_hdesc_set_pktlen(struct cppi5_host_desc_t *desc, * cppi5_hdesc_get_psflags - get Protocol Specific Flags from HDesc * @desc: Host packet descriptor * - * Returns Protocol Specific Flags from Host Packet Descriptor + * Returns: Protocol Specific Flags from Host Packet Descriptor */ static inline u32 cppi5_hdesc_get_psflags(struct cppi5_host_desc_t *desc) { @@ -431,6 +434,7 @@ static inline u32 cppi5_hdesc_get_psflags(struct cppi5_host_desc_t *desc) /** * cppi5_hdesc_set_psflags - set Protocol Specific Flags in HDesc * @desc: Host packet descriptor + * @ps_flags: Protocol Specific flags to set */ static inline void cppi5_hdesc_set_psflags(struct cppi5_host_desc_t *desc, u32 ps_flags) @@ -442,8 +446,10 @@ static inline void cppi5_hdesc_set_psflags(struct cppi5_host_desc_t *desc, } /** - * cppi5_hdesc_get_errflags - get Packet Type from HDesc + * cppi5_hdesc_get_pkttype - get Packet Type from HDesc * @desc: Host packet descriptor + * + * Returns: Packet type */ static inline u32 cppi5_hdesc_get_pkttype(struct cppi5_host_desc_t *desc) { @@ -452,7 +458,7 @@ static inline u32 cppi5_hdesc_get_pkttype(struct cppi5_host_desc_t *desc) } /** - * cppi5_hdesc_get_errflags - set Packet Type in HDesc + * cppi5_hdesc_set_pkttype - set Packet Type in HDesc * @desc: Host packet descriptor * @pkt_type: Packet Type */ @@ -501,7 +507,7 @@ static inline void cppi5_hdesc_reset_to_original(struct cppi5_host_desc_t *desc) /** * cppi5_hdesc_link_hbdesc - link Host Buffer Descriptor to HDesc * @desc: Host Packet Descriptor - * @buf_desc: Host Buffer Descriptor physical address + * @hbuf_desc: Host Buffer Descriptor physical address * * add and link Host Buffer Descriptor to HDesc */ @@ -527,7 +533,7 @@ static inline void cppi5_hdesc_reset_hbdesc(struct cppi5_host_desc_t *desc) * cppi5_hdesc_epib_present - check if EPIB present * @desc_hdr: packet descriptor/TR header * - * Returns true if EPIB present in the packet + * Returns: true if EPIB present in the packet */ static inline bool cppi5_hdesc_epib_present(struct cppi5_desc_hdr_t *desc_hdr) { @@ -538,7 +544,7 @@ static inline bool cppi5_hdesc_epib_present(struct cppi5_desc_hdr_t *desc_hdr) * cppi5_hdesc_get_psdata - Get pointer on PSDATA * @desc: Host packet descriptor * - * Returns pointer on PSDATA in HDesc. + * Returns: pointer on PSDATA in HDesc. * NULL - if ps_data placed at the start of data buffer. */ static inline void *cppi5_hdesc_get_psdata(struct cppi5_host_desc_t *desc) @@ -568,7 +574,7 @@ static inline void *cppi5_hdesc_get_psdata(struct cppi5_host_desc_t *desc) * cppi5_hdesc_get_swdata - Get pointer on swdata * @desc: Host packet descriptor * - * Returns pointer on SWDATA in HDesc. + * Returns: pointer on SWDATA in HDesc. * NOTE. It's caller responsibility to be sure hdesc actually has swdata. */ static inline void *cppi5_hdesc_get_swdata(struct cppi5_host_desc_t *desc) @@ -648,6 +654,7 @@ enum cppi5_tr_types { CPPI5_TR_TYPE11, /* type12-14: Reserved */ CPPI5_TR_TYPE15 = 15, + /* private: */ CPPI5_TR_TYPE_MAX }; @@ -673,6 +680,7 @@ enum cppi5_tr_event_size { CPPI5_TR_EVENT_SIZE_ICNT1_DEC, CPPI5_TR_EVENT_SIZE_ICNT2_DEC, CPPI5_TR_EVENT_SIZE_ICNT3_DEC, + /* private: */ CPPI5_TR_EVENT_SIZE_MAX }; @@ -690,6 +698,7 @@ enum cppi5_tr_trigger { CPPI5_TR_TRIGGER_GLOBAL0, CPPI5_TR_TRIGGER_GLOBAL1, CPPI5_TR_TRIGGER_LOCAL_EVENT, + /* private: */ CPPI5_TR_TRIGGER_MAX }; @@ -711,6 +720,7 @@ enum cppi5_tr_trigger_type { CPPI5_TR_TRIGGER_TYPE_ICNT2_DEC, CPPI5_TR_TRIGGER_TYPE_ICNT3_DEC, CPPI5_TR_TRIGGER_TYPE_ALL, + /* private: */ CPPI5_TR_TRIGGER_TYPE_MAX }; @@ -815,7 +825,7 @@ struct cppi5_tr_type3_t { * destination * @dicnt1: Total loop iteration count for level 1 for destination * @dicnt2: Total loop iteration count for level 2 for destination - * @sicnt3: Total loop iteration count for level 3 (outermost) for + * @dicnt3: Total loop iteration count for level 3 (outermost) for * destination */ struct cppi5_tr_type15_t { @@ -887,6 +897,7 @@ enum cppi5_tr_resp_status_type { CPPI5_TR_RESPONSE_STATUS_UNSUPPORTED_ERR, CPPI5_TR_RESPONSE_STATUS_TRANSFER_EXCEPTION, CPPI5_TR_RESPONSE_STATUS__TEARDOWN_FLUSH, + /* private: */ CPPI5_TR_RESPONSE_STATUS_MAX }; @@ -903,6 +914,7 @@ enum cppi5_tr_resp_status_submission { CPPI5_TR_RESPONSE_STATUS_SUBMISSION_ICNT0, CPPI5_TR_RESPONSE_STATUS_SUBMISSION_FIFO_FULL, CPPI5_TR_RESPONSE_STATUS_SUBMISSION_OWN, + /* private: */ CPPI5_TR_RESPONSE_STATUS_SUBMISSION_MAX }; @@ -931,6 +943,7 @@ enum cppi5_tr_resp_status_unsupported { CPPI5_TR_RESPONSE_STATUS_UNSUPPORTED_DFMT, CPPI5_TR_RESPONSE_STATUS_UNSUPPORTED_SECTR, CPPI5_TR_RESPONSE_STATUS_UNSUPPORTED_AMODE_SPECIFIC, + /* private: */ CPPI5_TR_RESPONSE_STATUS_UNSUPPORTED_MAX }; @@ -939,7 +952,7 @@ enum cppi5_tr_resp_status_unsupported { * @tr_count: number of TR records * @tr_size: Nominal size of TR record (max) [16, 32, 64, 128] * - * Returns required TR Descriptor size + * Returns: required TR Descriptor size */ static inline size_t cppi5_trdesc_calc_size(u32 tr_count, u32 tr_size) { @@ -955,7 +968,7 @@ static inline size_t cppi5_trdesc_calc_size(u32 tr_count, u32 tr_size) /** * cppi5_trdesc_init - Init TR Descriptor - * @desc: TR Descriptor + * @desc_hdr: TR Descriptor * @tr_count: number of TR records * @tr_size: Nominal size of TR record (max) [16, 32, 64, 128] * @reload_idx: Absolute index to jump to on the 2nd and following passes @@ -1044,7 +1057,7 @@ static inline void cppi5_tr_set_trigger(cppi5_tr_flags_t *flags, } /** - * cppi5_tr_cflag_set - Update the Configuration specific flags + * cppi5_tr_csf_set - Update the Configuration specific flags * @flags: Pointer to the TR's flags * @csf: Configuration specific flags * -- cgit v1.2.3 From 7b84a00dd3528d980f1f35fd2c5015f72dc3f62a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 28 Feb 2026 17:12:03 -0800 Subject: dmaengine: qcom: qcom-gpi-dma.h: fix all kernel-doc warnings Add missing enum descriptions and spell one struct member correctly to avoid kernel-doc warnings: Warning: include/linux/dma/qcom-gpi-dma.h:15 Enum value 'SPI_TX' not described in enum 'spi_transfer_cmd' Warning: include/linux/dma/qcom-gpi-dma.h:15 Enum value 'SPI_RX' not described in enum 'spi_transfer_cmd' Warning: include/linux/dma/qcom-gpi-dma.h:15 Enum value 'SPI_DUPLEX' not described in enum 'spi_transfer_cmd' Warning: include/linux/dma/qcom-gpi-dma.h:80 struct member 'multi_msg' not described in 'gpi_i2c_config' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260301011203.3062658-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/dma/qcom-gpi-dma.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dma/qcom-gpi-dma.h b/include/linux/dma/qcom-gpi-dma.h index 6680dd1a43c6..332be28427e4 100644 --- a/include/linux/dma/qcom-gpi-dma.h +++ b/include/linux/dma/qcom-gpi-dma.h @@ -8,6 +8,9 @@ /** * enum spi_transfer_cmd - spi transfer commands + * @SPI_TX: SPI peripheral TX command + * @SPI_RX: SPI peripheral RX command + * @SPI_DUPLEX: SPI peripheral Duplex command */ enum spi_transfer_cmd { SPI_TX = 1, @@ -64,7 +67,7 @@ enum i2c_op { * @set_config: set peripheral config * @rx_len: receive length for buffer * @op: i2c cmd - * @muli-msg: is part of multi i2c r-w msgs + * @multi_msg: is part of multi i2c r-w msgs */ struct gpi_i2c_config { u8 set_config; -- cgit v1.2.3 From 4d94ce88c77e74830a5b9d02ecb8286039ffa494 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 25 Feb 2026 09:17:00 +1100 Subject: VFS: unexport lock_rename(), lock_rename_child(), unlock_rename() These three function are now only used in namei.c, so they don't need to be exported. Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20260224222542.3458677-16-neilb@ownmail.net Signed-off-by: Christian Brauner --- Documentation/filesystems/porting.rst | 7 +++++++ fs/namei.c | 9 +++------ include/linux/namei.h | 3 --- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 1dd31ab417a2..d02aa57e4477 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1368,3 +1368,10 @@ lifetime, consider using inode_set_cached_link() instead. lookup_one_qstr_excl() is no longer exported - use start_creating() or similar. +--- + +** mandatory** + +lock_rename(), lock_rename_child(), unlock_rename() are no +longer available. Use start_renaming() or similar. + diff --git a/fs/namei.c b/fs/namei.c index a5daa62399d7..77189335bbcc 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3775,7 +3775,7 @@ static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) /* * p1 and p2 should be directories on the same fs. */ -struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) +static struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) { if (p1 == p2) { inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); @@ -3785,12 +3785,11 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) mutex_lock(&p1->d_sb->s_vfs_rename_mutex); return lock_two_directories(p1, p2); } -EXPORT_SYMBOL(lock_rename); /* * c1 and p2 should be on the same fs. */ -struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2) +static struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2) { if (READ_ONCE(c1->d_parent) == p2) { /* @@ -3827,9 +3826,8 @@ struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2) mutex_unlock(&c1->d_sb->s_vfs_rename_mutex); return NULL; } -EXPORT_SYMBOL(lock_rename_child); -void unlock_rename(struct dentry *p1, struct dentry *p2) +static void unlock_rename(struct dentry *p1, struct dentry *p2) { inode_unlock(p1->d_inode); if (p1 != p2) { @@ -3837,7 +3835,6 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); } } -EXPORT_SYMBOL(unlock_rename); /** * __start_renaming - lookup and lock names for rename diff --git a/include/linux/namei.h b/include/linux/namei.h index c7a7288cdd25..2ad6dd9987b9 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -165,9 +165,6 @@ extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); extern int follow_up(struct path *); -extern struct dentry *lock_rename(struct dentry *, struct dentry *); -extern struct dentry *lock_rename_child(struct dentry *, struct dentry *); -extern void unlock_rename(struct dentry *, struct dentry *); int start_renaming(struct renamedata *rd, int lookup_flags, struct qstr *old_last, struct qstr *new_last); int start_renaming_dentry(struct renamedata *rd, int lookup_flags, -- cgit v1.2.3 From 5645f805927c9bd4443e6143e487ef3ffea34aaf Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 6 Mar 2026 14:22:00 +0100 Subject: gpio: Document line value semantics It is not clearly documented that the GPIO driver API expect the driver to get/set the physical level of the GPIO line and the consumer API will get/set the logic level. Document this in relevant places. Reported-by: David Jander Signed-off-by: Linus Walleij Link: https://patch.msgid.link/20260306-gpio-doc-levels-v1-1-19928739e400@kernel.org Signed-off-by: Bartosz Golaszewski --- Documentation/driver-api/gpio/driver.rst | 27 +++++++++++++++++++++++++++ include/linux/gpio/driver.h | 10 ++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/gpio/driver.rst b/Documentation/driver-api/gpio/driver.rst index 85d86f92c41b..a4f160b95089 100644 --- a/Documentation/driver-api/gpio/driver.rst +++ b/Documentation/driver-api/gpio/driver.rst @@ -87,6 +87,33 @@ atomic context on realtime kernels (inside hard IRQ handlers and similar contexts). Normally this should not be required. +GPIO level semantics +-------------------- + +The gpip_chip .get/set[_multiple]() line values are clamped to the boolean +space [0, 1], low level or high level. + +Low and high values are defined as physical low on the line in/out to the +connector such as a physical pad, pin or rail. + +The GPIO library has internal logic to handle lines that are active low, such +as indicated by overstrike or #name in a schematic, and the driver should not +try to second-guess the logic value of a line. + +The way GPIO values are handled by the consumers is that the library present +the *logical* value to the consumer. A line is *asserted* if its *logical* +value is 1, and *de-asserted* if its logical value is 0. If inversion is +required, this is handled by gpiolib and configured using hardware descriptions +such as device tree or ACPI that can clearly indicate if a line is active +high or low. + +Since electronics commonly insert inverters as driving stages or protection +buffers in front of a GPIO line it is necessary that this semantic is part +of the hardware description, so that consumers such as kernel drivers need +not worry about this, and can for example assert a RESET line tied to a GPIO +pin by setting it to logic 1 even if it is physically active low. + + GPIO electrical configuration ----------------------------- diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 5f5ddcbfa445..17511434ed07 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -343,11 +343,17 @@ struct gpio_irq_chip { * @direction_output: configures signal "offset" as output, returns 0 on * success or a negative error number. This can be omitted on input-only * or output-only gpio chips. - * @get: returns value for signal "offset", 0=low, 1=high, or negative error + * @get: returns value for signal "offset", 0=low, 1=high, or negative error. + * the low and high values are defined as physical low on the line + * in/out to the connector such as a physical pad, pin or rail. The GPIO + * library has internal logic to handle lines that are active low, such + * as indicated by overstrike or #name in a schematic, and the driver + * should not try to second-guess the logic value of a line. * @get_multiple: reads values for multiple signals defined by "mask" and * stores them in "bits", returns 0 on success or negative error * @set: assigns output value for signal "offset", returns 0 on success or - * negative error value + * negative error value. The output value follows the same semantic + * rules as for @get. * @set_multiple: assigns output values for multiple signals defined by * "mask", returns 0 on success or negative error value * @set_config: optional hook for all kinds of settings. Uses the same -- cgit v1.2.3 From ad9d28e68f4f9d15b9bde15e1ab79a4f85eff60e Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 6 Mar 2026 18:22:47 +0100 Subject: reset: gpio: simplify fallback device matching The of_args field of struct reset_controller_dev was introduced to allow the reset-gpio driver to pass the phandle arguments back to reset core. The thing is: it doesn't even have to do it. The core sets the platform data of the auxiliary device *AND* has access to it later on during the lookup. This means the field is unneeded and all can happen entirely in reset core. Remove the field from the public header and don't set it in reset-gpio.c. Retrieve the platform data in reset core when needed instead. Reviewed-by: Philipp Zabel Signed-off-by: Bartosz Golaszewski Signed-off-by: Philipp Zabel --- drivers/reset/core.c | 15 ++++++--------- drivers/reset/reset-gpio.c | 5 ----- include/linux/reset-controller.h | 4 ---- 3 files changed, 6 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/reset/core.c b/drivers/reset/core.c index 3845e77a8d32..954df36a242e 100644 --- a/drivers/reset/core.c +++ b/drivers/reset/core.c @@ -94,9 +94,6 @@ static const char *rcdev_name(struct reset_controller_dev *rcdev) if (rcdev->of_node) return rcdev->of_node->full_name; - if (rcdev->of_args) - return rcdev->of_args->np->full_name; - return NULL; } @@ -125,9 +122,6 @@ static int of_reset_simple_xlate(struct reset_controller_dev *rcdev, */ int reset_controller_register(struct reset_controller_dev *rcdev) { - if (rcdev->of_node && rcdev->of_args) - return -EINVAL; - if (!rcdev->of_xlate) { rcdev->of_reset_n_cells = 1; rcdev->of_xlate = of_reset_simple_xlate; @@ -1006,13 +1000,16 @@ static struct reset_controller_dev *__reset_find_rcdev(const struct of_phandle_a bool gpio_fallback) { struct reset_controller_dev *rcdev; + struct of_phandle_args *rc_args; lockdep_assert_held(&reset_list_mutex); list_for_each_entry(rcdev, &reset_controller_list, list) { - if (gpio_fallback) { - if (rcdev->of_args && of_phandle_args_equal(args, - rcdev->of_args)) + if (gpio_fallback && rcdev->dev && + device_is_compatible(rcdev->dev, "reset-gpio")) { + rc_args = dev_get_platdata(rcdev->dev); + + if (of_phandle_args_equal(args, rc_args)) return rcdev; } else { if (args->np == rcdev->of_node) diff --git a/drivers/reset/reset-gpio.c b/drivers/reset/reset-gpio.c index ad5bfe27aaef..6e1c4f990bc0 100644 --- a/drivers/reset/reset-gpio.c +++ b/drivers/reset/reset-gpio.c @@ -56,12 +56,8 @@ static int reset_gpio_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { struct device *dev = &adev->dev; - struct of_phandle_args *platdata = dev_get_platdata(dev); struct reset_gpio_priv *priv; - if (!platdata) - return -EINVAL; - priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; @@ -76,7 +72,6 @@ static int reset_gpio_probe(struct auxiliary_device *adev, priv->rc.ops = &reset_gpio_ops; priv->rc.owner = THIS_MODULE; priv->rc.dev = dev; - priv->rc.of_args = platdata; /* Cells to match GPIO specifier, but it's not really used */ priv->rc.of_reset_n_cells = 2; diff --git a/include/linux/reset-controller.h b/include/linux/reset-controller.h index 46514cb1b9e0..aa95b460fdf8 100644 --- a/include/linux/reset-controller.h +++ b/include/linux/reset-controller.h @@ -35,9 +35,6 @@ struct of_phandle_args; * @reset_control_head: head of internal list of requested reset controls * @dev: corresponding driver model device struct * @of_node: corresponding device tree node as phandle target - * @of_args: for reset-gpios controllers: corresponding phandle args with - * of_node and GPIO number complementing of_node; either this or - * of_node should be present * @of_reset_n_cells: number of cells in reset line specifiers * @of_xlate: translation function to translate from specifier as found in the * device tree to id as given to the reset control ops, defaults @@ -51,7 +48,6 @@ struct reset_controller_dev { struct list_head reset_control_head; struct device *dev; struct device_node *of_node; - const struct of_phandle_args *of_args; int of_reset_n_cells; int (*of_xlate)(struct reset_controller_dev *rcdev, const struct of_phandle_args *reset_spec); -- cgit v1.2.3 From 44a0acb2caca3bfd0ca459fbf0b19be75f1819c0 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 6 Mar 2026 18:22:53 +0100 Subject: reset: protect struct reset_controller_dev with its own mutex Currently we use a single, global mutex - misleadingly names reset_list_mutex - to protect the global list of reset devices, per-controller list of reset control handles and also internal fields of struct reset_control. Locking can be made a lot more fine-grained if we use a separate mutex for serializing operations on the list AND accessing the reset controller device. Signed-off-by: Bartosz Golaszewski Reviewed-by: Philipp Zabel Signed-off-by: Philipp Zabel --- drivers/reset/core.c | 44 ++++++++++++++++++++++++---------------- include/linux/reset-controller.h | 3 +++ 2 files changed, 30 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/reset/core.c b/drivers/reset/core.c index e6c12fbebca9..acd9d10b1ceb 100644 --- a/drivers/reset/core.c +++ b/drivers/reset/core.c @@ -131,6 +131,7 @@ int reset_controller_register(struct reset_controller_dev *rcdev) } INIT_LIST_HEAD(&rcdev->reset_control_head); + mutex_init(&rcdev->lock); guard(mutex)(&reset_list_mutex); @@ -143,6 +144,8 @@ EXPORT_SYMBOL_GPL(reset_controller_register); static void reset_controller_remove(struct reset_controller_dev *rcdev, struct reset_control *rstc) { + lockdep_assert_held(&rcdev->lock); + list_del(&rstc->list); module_put(rcdev->owner); put_device(rcdev->dev); @@ -156,19 +159,22 @@ void reset_controller_unregister(struct reset_controller_dev *rcdev) { struct reset_control *rstc, *pos; - guard(mutex)(&reset_list_mutex); - - list_del(&rcdev->list); + scoped_guard(mutex, &reset_list_mutex) + list_del(&rcdev->list); - /* - * Numb but don't free the remaining reset control handles that are - * still held by consumers. - */ - list_for_each_entry_safe(rstc, pos, &rcdev->reset_control_head, list) { - rcu_assign_pointer(rstc->rcdev, NULL); - synchronize_srcu(&rstc->srcu); - reset_controller_remove(rcdev, rstc); + scoped_guard(mutex, &rcdev->lock) { + /* + * Numb but don't free the remaining reset control handles that are + * still held by consumers. + */ + list_for_each_entry_safe(rstc, pos, &rcdev->reset_control_head, list) { + rcu_assign_pointer(rstc->rcdev, NULL); + synchronize_srcu(&rstc->srcu); + reset_controller_remove(rcdev, rstc); + } } + + mutex_destroy(&rcdev->lock); } EXPORT_SYMBOL_GPL(reset_controller_unregister); @@ -712,10 +718,12 @@ int reset_control_acquire(struct reset_control *rstc) if (!rcdev) return -ENODEV; - list_for_each_entry(rc, &rcdev->reset_control_head, list) { - if (rstc != rc && rstc->id == rc->id) { - if (rc->acquired) - return -EBUSY; + scoped_guard(mutex, &rcdev->lock) { + list_for_each_entry(rc, &rcdev->reset_control_head, list) { + if (rstc != rc && rstc->id == rc->id) { + if (rc->acquired) + return -EBUSY; + } } } @@ -806,7 +814,7 @@ __reset_control_get_internal(struct reset_controller_dev *rcdev, struct reset_control *rstc; int ret; - lockdep_assert_held(&reset_list_mutex); + lockdep_assert_held(&rcdev->lock); /* Expect callers to filter out OPTIONAL and DEASSERTED bits */ if (WARN_ON(flags & ~(RESET_CONTROL_FLAGS_BIT_SHARED | @@ -868,8 +876,10 @@ static void __reset_control_release(struct kref *kref) scoped_guard(srcu, &rstc->srcu) { rcdev = rcu_replace_pointer(rstc->rcdev, NULL, true); - if (rcdev) + if (rcdev) { + guard(mutex)(&rcdev->lock); reset_controller_remove(rcdev, rstc); + } } synchronize_srcu(&rstc->srcu); diff --git a/include/linux/reset-controller.h b/include/linux/reset-controller.h index aa95b460fdf8..185d2a9bd7cd 100644 --- a/include/linux/reset-controller.h +++ b/include/linux/reset-controller.h @@ -3,6 +3,7 @@ #define _LINUX_RESET_CONTROLLER_H_ #include +#include struct reset_controller_dev; @@ -40,6 +41,7 @@ struct of_phandle_args; * device tree to id as given to the reset control ops, defaults * to :c:func:`of_reset_simple_xlate`. * @nr_resets: number of reset controls in this reset controller device + * @lock: protects the reset control list from concurrent access */ struct reset_controller_dev { const struct reset_control_ops *ops; @@ -52,6 +54,7 @@ struct reset_controller_dev { int (*of_xlate)(struct reset_controller_dev *rcdev, const struct of_phandle_args *reset_spec); unsigned int nr_resets; + struct mutex lock; }; #if IS_ENABLED(CONFIG_RESET_CONTROLLER) -- cgit v1.2.3 From ba8dbbb14b7e6734afbb5ba37d0679831aa3d590 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 6 Mar 2026 18:22:56 +0100 Subject: reset: convert the core API to using firmware nodes In order to simplify the commit converting the internals of reset core to using firmware nodes, first convert the user-facing API. Modify the signature of the core consumer functions but leave the specialized wrappers as is to avoid modifying users for now. No functional change intended. Reviewed-by: Philipp Zabel Signed-off-by: Bartosz Golaszewski Signed-off-by: Philipp Zabel --- Documentation/driver-api/reset.rst | 1 - drivers/reset/core.c | 33 ++++++++++++++++------------- include/linux/reset.h | 43 +++++++++++++++++++++++++------------- 3 files changed, 46 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/reset.rst b/Documentation/driver-api/reset.rst index f773100daaa4..7a6571849664 100644 --- a/Documentation/driver-api/reset.rst +++ b/Documentation/driver-api/reset.rst @@ -198,7 +198,6 @@ query the reset line status using reset_control_status(). reset_control_rearm reset_control_put of_reset_control_get_count - of_reset_control_array_get devm_reset_control_array_get reset_control_get_count diff --git a/drivers/reset/core.c b/drivers/reset/core.c index f1b644a86ad0..0da5079ea9db 100644 --- a/drivers/reset/core.c +++ b/drivers/reset/core.c @@ -1061,7 +1061,7 @@ static int __reset_add_reset_gpio_device(struct device_node *np, rgpio_dev->of_args = *args; /* * We keep the device_node reference, but of_args.np is put at the end - * of __of_reset_control_get(), so get it one more time. + * of __fwnode_reset_control_get(), so get it one more time. * Hold reference as long as rgpio_dev memory is valid. */ of_node_get(rgpio_dev->of_args.np); @@ -1115,18 +1115,19 @@ static struct reset_controller_dev *__reset_find_rcdev(const struct of_phandle_a } struct reset_control * -__of_reset_control_get(struct device_node *node, const char *id, int index, - enum reset_control_flags flags) +__fwnode_reset_control_get(struct fwnode_handle *fwnode, const char *id, int index, + enum reset_control_flags flags) { bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; bool gpio_fallback = false; + struct device_node *node = to_of_node(fwnode); struct reset_control *rstc = ERR_PTR(-EINVAL); struct reset_controller_dev *rcdev; struct of_phandle_args args; int rstc_id; int ret; - if (!node) + if (!fwnode) return ERR_PTR(-EINVAL); if (id) { @@ -1193,7 +1194,7 @@ out_put: return rstc; } -EXPORT_SYMBOL_GPL(__of_reset_control_get); +EXPORT_SYMBOL_GPL(__fwnode_reset_control_get); struct reset_control *__reset_control_get(struct device *dev, const char *id, int index, enum reset_control_flags flags) @@ -1201,12 +1202,13 @@ struct reset_control *__reset_control_get(struct device *dev, const char *id, bool shared = flags & RESET_CONTROL_FLAGS_BIT_SHARED; bool acquired = flags & RESET_CONTROL_FLAGS_BIT_ACQUIRED; bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; + struct fwnode_handle *fwnode = dev_fwnode(dev); if (WARN_ON(shared && acquired)) return ERR_PTR(-EINVAL); - if (dev->of_node) - return __of_reset_control_get(dev->of_node, id, index, flags); + if (fwnode) + return __fwnode_reset_control_get(fwnode, id, index, flags); return optional ? NULL : ERR_PTR(-ENOENT); } @@ -1468,23 +1470,24 @@ static int fwnode_reset_control_get_count(struct fwnode_handle *fwnode) } /** - * of_reset_control_array_get - Get a list of reset controls using - * device node. + * fwnode_reset_control_array_get - Get a list of reset controls using + * a firmware node. * - * @np: device node for the device that requests the reset controls array + * @fwnode: firmware node for the device that requests the reset controls array * @flags: whether reset controls are shared, optional, acquired * * Returns pointer to allocated reset_control on success or error on failure */ struct reset_control * -of_reset_control_array_get(struct device_node *np, enum reset_control_flags flags) +fwnode_reset_control_array_get(struct fwnode_handle *fwnode, + enum reset_control_flags flags) { bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; struct reset_control_array *resets; struct reset_control *rstc; int num, i; - num = fwnode_reset_control_get_count(of_fwnode_handle(np)); + num = fwnode_reset_control_get_count(fwnode); if (num < 0) return optional ? NULL : ERR_PTR(num); @@ -1494,7 +1497,7 @@ of_reset_control_array_get(struct device_node *np, enum reset_control_flags flag resets->num_rstcs = num; for (i = 0; i < num; i++) { - rstc = __of_reset_control_get(np, NULL, i, flags); + rstc = __fwnode_reset_control_get(fwnode, NULL, i, flags); if (IS_ERR(rstc)) goto err_rst; resets->rstc[i] = rstc; @@ -1511,7 +1514,7 @@ err_rst: return rstc; } -EXPORT_SYMBOL_GPL(of_reset_control_array_get); +EXPORT_SYMBOL_GPL(fwnode_reset_control_array_get); /** * devm_reset_control_array_get - Resource managed reset control array get @@ -1535,7 +1538,7 @@ devm_reset_control_array_get(struct device *dev, enum reset_control_flags flags) if (!ptr) return ERR_PTR(-ENOMEM); - rstc = of_reset_control_array_get(dev->of_node, flags); + rstc = fwnode_reset_control_array_get(dev_fwnode(dev), flags); if (IS_ERR_OR_NULL(rstc)) { devres_free(ptr); return rstc; diff --git a/include/linux/reset.h b/include/linux/reset.h index 44f9e3415f92..9c391cf0c822 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -5,10 +5,12 @@ #include #include #include +#include #include struct device; struct device_node; +struct fwnode_handle; struct reset_control; /** @@ -84,7 +86,7 @@ int reset_control_bulk_deassert(int num_rstcs, struct reset_control_bulk_data *r int reset_control_bulk_acquire(int num_rstcs, struct reset_control_bulk_data *rstcs); void reset_control_bulk_release(int num_rstcs, struct reset_control_bulk_data *rstcs); -struct reset_control *__of_reset_control_get(struct device_node *node, +struct reset_control *__fwnode_reset_control_get(struct fwnode_handle *fwnode, const char *id, int index, enum reset_control_flags flags); struct reset_control *__reset_control_get(struct device *dev, const char *id, int index, enum reset_control_flags flags); @@ -103,7 +105,8 @@ int __devm_reset_control_bulk_get(struct device *dev, int num_rstcs, struct reset_control *devm_reset_control_array_get(struct device *dev, enum reset_control_flags flags); -struct reset_control *of_reset_control_array_get(struct device_node *np, enum reset_control_flags); +struct reset_control *fwnode_reset_control_array_get(struct fwnode_handle *fwnode, + enum reset_control_flags); int reset_control_get_count(struct device *dev); @@ -152,8 +155,8 @@ static inline int __device_reset(struct device *dev, bool optional) return optional ? 0 : -ENOTSUPP; } -static inline struct reset_control *__of_reset_control_get( - struct device_node *node, +static inline struct reset_control *__fwnode_reset_control_get( + struct fwnode_handle *fwnode, const char *id, int index, enum reset_control_flags flags) { bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; @@ -242,7 +245,7 @@ devm_reset_control_array_get(struct device *dev, enum reset_control_flags flags) } static inline struct reset_control * -of_reset_control_array_get(struct device_node *np, enum reset_control_flags flags) +fwnode_reset_control_array_get(struct fwnode_handle *fwnode, enum reset_control_flags flags) { bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; @@ -500,7 +503,8 @@ reset_control_bulk_get_optional_shared(struct device *dev, int num_rstcs, static inline struct reset_control *of_reset_control_get_exclusive( struct device_node *node, const char *id) { - return __of_reset_control_get(node, id, 0, RESET_CONTROL_EXCLUSIVE); + return __fwnode_reset_control_get(of_fwnode_handle(node), id, 0, + RESET_CONTROL_EXCLUSIVE); } /** @@ -520,7 +524,8 @@ static inline struct reset_control *of_reset_control_get_exclusive( static inline struct reset_control *of_reset_control_get_optional_exclusive( struct device_node *node, const char *id) { - return __of_reset_control_get(node, id, 0, RESET_CONTROL_OPTIONAL_EXCLUSIVE); + return __fwnode_reset_control_get(of_fwnode_handle(node), id, 0, + RESET_CONTROL_OPTIONAL_EXCLUSIVE); } /** @@ -545,7 +550,8 @@ static inline struct reset_control *of_reset_control_get_optional_exclusive( static inline struct reset_control *of_reset_control_get_shared( struct device_node *node, const char *id) { - return __of_reset_control_get(node, id, 0, RESET_CONTROL_SHARED); + return __fwnode_reset_control_get(of_fwnode_handle(node), id, 0, + RESET_CONTROL_SHARED); } /** @@ -562,7 +568,8 @@ static inline struct reset_control *of_reset_control_get_shared( static inline struct reset_control *of_reset_control_get_exclusive_by_index( struct device_node *node, int index) { - return __of_reset_control_get(node, NULL, index, RESET_CONTROL_EXCLUSIVE); + return __fwnode_reset_control_get(of_fwnode_handle(node), NULL, index, + RESET_CONTROL_EXCLUSIVE); } /** @@ -590,7 +597,8 @@ static inline struct reset_control *of_reset_control_get_exclusive_by_index( static inline struct reset_control *of_reset_control_get_shared_by_index( struct device_node *node, int index) { - return __of_reset_control_get(node, NULL, index, RESET_CONTROL_SHARED); + return __fwnode_reset_control_get(of_fwnode_handle(node), NULL, index, + RESET_CONTROL_SHARED); } /** @@ -1032,30 +1040,35 @@ devm_reset_control_array_get_optional_shared(struct device *dev) static inline struct reset_control * of_reset_control_array_get_exclusive(struct device_node *node) { - return of_reset_control_array_get(node, RESET_CONTROL_EXCLUSIVE); + return fwnode_reset_control_array_get(of_fwnode_handle(node), + RESET_CONTROL_EXCLUSIVE); } static inline struct reset_control * of_reset_control_array_get_exclusive_released(struct device_node *node) { - return of_reset_control_array_get(node, RESET_CONTROL_EXCLUSIVE_RELEASED); + return fwnode_reset_control_array_get(of_fwnode_handle(node), + RESET_CONTROL_EXCLUSIVE_RELEASED); } static inline struct reset_control * of_reset_control_array_get_shared(struct device_node *node) { - return of_reset_control_array_get(node, RESET_CONTROL_SHARED); + return fwnode_reset_control_array_get(of_fwnode_handle(node), + RESET_CONTROL_SHARED); } static inline struct reset_control * of_reset_control_array_get_optional_exclusive(struct device_node *node) { - return of_reset_control_array_get(node, RESET_CONTROL_OPTIONAL_EXCLUSIVE); + return fwnode_reset_control_array_get(of_fwnode_handle(node), + RESET_CONTROL_OPTIONAL_EXCLUSIVE); } static inline struct reset_control * of_reset_control_array_get_optional_shared(struct device_node *node) { - return of_reset_control_array_get(node, RESET_CONTROL_OPTIONAL_SHARED); + return fwnode_reset_control_array_get(of_fwnode_handle(node), + RESET_CONTROL_OPTIONAL_SHARED); } #endif -- cgit v1.2.3 From 9035073d0ef1de813c6335239250248bfe0a64aa Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 6 Mar 2026 18:22:57 +0100 Subject: reset: convert reset core to using firmware nodes With everything else now in place, we can convert the remaining parts of the reset subsystem to becoming fwnode-agnostic - meaning it will work with all kinds of firmware nodes, not only devicetree. To that end: extend struct reset_controller_dev with fields taking information relevant for using firmware nodes (which mirrors what we already do for OF-nodes) and limit using of_ APIs only to where it's absolutely necessary (mostly around the of_xlate callback). For backward compatibility of existing drivers we still support OF-nodes but firmware nodes become the preferred method. Signed-off-by: Bartosz Golaszewski Reviewed-by: Philipp Zabel Signed-off-by: Philipp Zabel --- drivers/reset/core.c | 166 +++++++++++++++++++++++---------------- include/linux/reset-controller.h | 14 +++- 2 files changed, 112 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/drivers/reset/core.c b/drivers/reset/core.c index 0da5079ea9db..e625cf59cfb0 100644 --- a/drivers/reset/core.c +++ b/drivers/reset/core.c @@ -81,13 +81,13 @@ struct reset_control_array { /** * struct reset_gpio_lookup - lookup key for ad-hoc created reset-gpio devices - * @of_args: phandle to the reset controller with all the args like GPIO number + * @ref_args: Reference to the reset controller with all the args like GPIO number * @swnode: Software node containing the reference to the GPIO provider * @list: list entry for the reset_gpio_lookup_list * @adev: Auxiliary device representing the reset controller */ struct reset_gpio_lookup { - struct of_phandle_args of_args; + struct fwnode_reference_args ref_args; struct fwnode_handle *swnode; struct list_head list; struct auxiliary_device adev; @@ -98,24 +98,24 @@ static const char *rcdev_name(struct reset_controller_dev *rcdev) if (rcdev->dev) return dev_name(rcdev->dev); - if (rcdev->of_node) - return rcdev->of_node->full_name; + if (rcdev->fwnode) + return fwnode_get_name(rcdev->fwnode); return NULL; } /** - * of_reset_simple_xlate - translate reset_spec to the reset line number + * fwnode_reset_simple_xlate - translate reset_spec to the reset line number * @rcdev: a pointer to the reset controller device - * @reset_spec: reset line specifier as found in the device tree + * @reset_spec: reset line specifier as found in firmware * - * This static translation function is used by default if of_xlate in - * :c:type:`reset_controller_dev` is not set. It is useful for all reset - * controllers with 1:1 mapping, where reset lines can be indexed by number - * without gaps. + * This static translation function is used by default if neither fwnode_xlate + * not of_xlate in :c:type:`reset_controller_dev` is not set. It is useful for + * all reset controllers with 1:1 mapping, where reset lines can be indexed by + * number without gaps. */ -static int of_reset_simple_xlate(struct reset_controller_dev *rcdev, - const struct of_phandle_args *reset_spec) +static int fwnode_reset_simple_xlate(struct reset_controller_dev *rcdev, + const struct fwnode_reference_args *reset_spec) { if (reset_spec->args[0] >= rcdev->nr_resets) return -EINVAL; @@ -129,9 +129,23 @@ static int of_reset_simple_xlate(struct reset_controller_dev *rcdev, */ int reset_controller_register(struct reset_controller_dev *rcdev) { - if (!rcdev->of_xlate) { - rcdev->of_reset_n_cells = 1; - rcdev->of_xlate = of_reset_simple_xlate; + if ((rcdev->of_node && rcdev->fwnode) || (rcdev->of_xlate && rcdev->fwnode_xlate)) + return -EINVAL; + + if (!rcdev->of_node && !rcdev->fwnode) { + rcdev->fwnode = dev_fwnode(rcdev->dev); + if (!rcdev->fwnode) + return -EINVAL; + } + + if (rcdev->of_node) { + rcdev->fwnode = of_fwnode_handle(rcdev->of_node); + rcdev->fwnode_reset_n_cells = rcdev->of_reset_n_cells; + } + + if (rcdev->fwnode && !rcdev->fwnode_xlate) { + rcdev->fwnode_reset_n_cells = 1; + rcdev->fwnode_xlate = fwnode_reset_simple_xlate; } INIT_LIST_HEAD(&rcdev->reset_control_head); @@ -931,7 +945,7 @@ static int reset_create_gpio_aux_device(struct reset_gpio_lookup *rgpio_dev, adev->id = id; adev->name = "gpio"; adev->dev.parent = parent; - adev->dev.platform_data = &rgpio_dev->of_args; + adev->dev.platform_data = &rgpio_dev->ref_args; adev->dev.release = reset_gpio_aux_device_release; device_set_node(&adev->dev, rgpio_dev->swnode); @@ -951,18 +965,18 @@ static int reset_create_gpio_aux_device(struct reset_gpio_lookup *rgpio_dev, return 0; } -static void reset_gpio_add_devlink(struct device_node *np, +static void reset_gpio_add_devlink(struct fwnode_handle *fwnode, struct reset_gpio_lookup *rgpio_dev) { struct device *consumer; /* - * We must use get_dev_from_fwnode() and not of_find_device_by_node() + * We must use get_dev_from_fwnode() and not ref_find_device_by_node() * because the latter only considers the platform bus while we want to * get consumers of any kind that can be associated with firmware * nodes: auxiliary, soundwire, etc. */ - consumer = get_dev_from_fwnode(of_fwnode_handle(np)); + consumer = get_dev_from_fwnode(fwnode); if (consumer) { if (!device_link_add(consumer, &rgpio_dev->adev.dev, DL_FLAG_AUTOREMOVE_CONSUMER)) @@ -982,15 +996,23 @@ static void reset_gpio_add_devlink(struct device_node *np, */ } +/* TODO: move it out into drivers/base/ */ +static bool fwnode_reference_args_equal(const struct fwnode_reference_args *left, + const struct fwnode_reference_args *right) +{ + return left->fwnode == right->fwnode && left->nargs == right->nargs && + !memcmp(left->args, right->args, sizeof(left->args[0]) * left->nargs); +} + /* * @np: OF-node associated with the consumer - * @args: phandle to the GPIO provider with all the args like GPIO number + * @args: Reference to the GPIO provider with all the args like GPIO number */ -static int __reset_add_reset_gpio_device(struct device_node *np, - const struct of_phandle_args *args) +static int __reset_add_reset_gpio_device(struct fwnode_handle *fwnode, + const struct fwnode_reference_args *args) { struct property_entry properties[3] = { }; - unsigned int offset, of_flags, lflags; + unsigned int offset, flags, lflags; struct reset_gpio_lookup *rgpio_dev; struct device *parent; int ret, prop = 0; @@ -1001,7 +1023,7 @@ static int __reset_add_reset_gpio_device(struct device_node *np, * args[1]: GPIO flags * TODO: Handle other cases. */ - if (args->args_count != 2) + if (args->nargs != 2) return -ENOENT; /* @@ -1012,7 +1034,7 @@ static int __reset_add_reset_gpio_device(struct device_node *np, lockdep_assert_not_held(&reset_list_mutex); offset = args->args[0]; - of_flags = args->args[1]; + flags = args->args[1]; /* * Later we map GPIO flags between OF and Linux, however not all @@ -1022,33 +1044,31 @@ static int __reset_add_reset_gpio_device(struct device_node *np, * FIXME: Find a better way of translating OF flags to GPIO lookup * flags. */ - if (of_flags > GPIO_ACTIVE_LOW) { + if (flags > GPIO_ACTIVE_LOW) { pr_err("reset-gpio code does not support GPIO flags %u for GPIO %u\n", - of_flags, offset); + flags, offset); return -EINVAL; } struct gpio_device *gdev __free(gpio_device_put) = - gpio_device_find_by_fwnode(of_fwnode_handle(args->np)); + gpio_device_find_by_fwnode(args->fwnode); if (!gdev) return -EPROBE_DEFER; guard(mutex)(&reset_gpio_lookup_mutex); list_for_each_entry(rgpio_dev, &reset_gpio_lookup_list, list) { - if (args->np == rgpio_dev->of_args.np) { - if (of_phandle_args_equal(args, &rgpio_dev->of_args)) { - /* - * Already on the list, create the device link - * and stop here. - */ - reset_gpio_add_devlink(np, rgpio_dev); - return 0; - } + if (fwnode_reference_args_equal(args, &rgpio_dev->ref_args)) { + /* + * Already on the list, create the device link + * and stop here. + */ + reset_gpio_add_devlink(fwnode, rgpio_dev); + return 0; } } - lflags = GPIO_PERSISTENT | (of_flags & GPIO_ACTIVE_LOW); + lflags = GPIO_PERSISTENT | (flags & GPIO_ACTIVE_LOW); parent = gpio_device_to_device(gdev); properties[prop++] = PROPERTY_ENTRY_STRING("compatible", "reset-gpio"); properties[prop++] = PROPERTY_ENTRY_GPIO("reset-gpios", parent->fwnode, offset, lflags); @@ -1058,43 +1078,43 @@ static int __reset_add_reset_gpio_device(struct device_node *np, if (!rgpio_dev) return -ENOMEM; - rgpio_dev->of_args = *args; + rgpio_dev->ref_args = *args; /* - * We keep the device_node reference, but of_args.np is put at the end - * of __fwnode_reset_control_get(), so get it one more time. + * We keep the fwnode_handle reference, but ref_args.fwnode is put at + * the end of __fwnode_reset_control_get(), so get it one more time. * Hold reference as long as rgpio_dev memory is valid. */ - of_node_get(rgpio_dev->of_args.np); + fwnode_handle_get(rgpio_dev->ref_args.fwnode); rgpio_dev->swnode = fwnode_create_software_node(properties, NULL); if (IS_ERR(rgpio_dev->swnode)) { ret = PTR_ERR(rgpio_dev->swnode); - goto err_put_of_node; + goto err_put_fwnode; } ret = reset_create_gpio_aux_device(rgpio_dev, parent); if (ret) goto err_del_swnode; - reset_gpio_add_devlink(np, rgpio_dev); + reset_gpio_add_devlink(fwnode, rgpio_dev); list_add(&rgpio_dev->list, &reset_gpio_lookup_list); return 0; err_del_swnode: fwnode_remove_software_node(rgpio_dev->swnode); -err_put_of_node: - of_node_put(rgpio_dev->of_args.np); +err_put_fwnode: + fwnode_handle_put(rgpio_dev->ref_args.fwnode); kfree(rgpio_dev); return ret; } -static struct reset_controller_dev *__reset_find_rcdev(const struct of_phandle_args *args, - bool gpio_fallback) +static struct reset_controller_dev * +__reset_find_rcdev(const struct fwnode_reference_args *args, bool gpio_fallback) { + struct fwnode_reference_args *rc_args; struct reset_controller_dev *rcdev; - struct of_phandle_args *rc_args; lockdep_assert_held(&reset_list_mutex); @@ -1103,10 +1123,10 @@ static struct reset_controller_dev *__reset_find_rcdev(const struct of_phandle_a device_is_compatible(rcdev->dev, "reset-gpio")) { rc_args = dev_get_platdata(rcdev->dev); - if (of_phandle_args_equal(args, rc_args)) + if (fwnode_reference_args_equal(args, rc_args)) return rcdev; } else { - if (args->np == rcdev->of_node) + if (args->fwnode == rcdev->fwnode) return rcdev; } } @@ -1120,27 +1140,26 @@ __fwnode_reset_control_get(struct fwnode_handle *fwnode, const char *id, int ind { bool optional = flags & RESET_CONTROL_FLAGS_BIT_OPTIONAL; bool gpio_fallback = false; - struct device_node *node = to_of_node(fwnode); struct reset_control *rstc = ERR_PTR(-EINVAL); struct reset_controller_dev *rcdev; - struct of_phandle_args args; - int rstc_id; + struct fwnode_reference_args args; + struct of_phandle_args of_args; + int rstc_id = -EINVAL; int ret; if (!fwnode) return ERR_PTR(-EINVAL); if (id) { - index = of_property_match_string(node, - "reset-names", id); + index = fwnode_property_match_string(fwnode, "reset-names", id); if (index == -EILSEQ) return ERR_PTR(index); if (index < 0) return optional ? NULL : ERR_PTR(-ENOENT); } - ret = of_parse_phandle_with_args(node, "resets", "#reset-cells", - index, &args); + ret = fwnode_property_get_reference_args(fwnode, "resets", "#reset-cells", + 0, index, &args); if (ret == -EINVAL) return ERR_PTR(ret); if (ret) { @@ -1151,16 +1170,16 @@ __fwnode_reset_control_get(struct fwnode_handle *fwnode, const char *id, int ind * There can be only one reset-gpio for regular devices, so * don't bother with the "reset-gpios" phandle index. */ - ret = of_parse_phandle_with_args(node, "reset-gpios", "#gpio-cells", - 0, &args); + ret = fwnode_property_get_reference_args(fwnode, "reset-gpios", + "#gpio-cells", 0, 0, &args); if (ret) return optional ? NULL : ERR_PTR(ret); gpio_fallback = true; - ret = __reset_add_reset_gpio_device(node, &args); + ret = __reset_add_reset_gpio_device(fwnode, &args); if (ret) { - of_node_put(args.np); + fwnode_handle_put(args.fwnode); return ERR_PTR(ret); } } @@ -1173,15 +1192,30 @@ __fwnode_reset_control_get(struct fwnode_handle *fwnode, const char *id, int ind goto out_put; } - if (WARN_ON(args.args_count != rcdev->of_reset_n_cells)) { + if (WARN_ON(args.nargs != rcdev->fwnode_reset_n_cells)) { rstc = ERR_PTR(-EINVAL); goto out_put; } - rstc_id = rcdev->of_xlate(rcdev, &args); + if (rcdev->of_xlate && is_of_node(fwnode)) { + ret = of_parse_phandle_with_args(to_of_node(fwnode), + gpio_fallback ? "reset-gpios" : "resets", + gpio_fallback ? "#gpio-cells" : "#reset-cells", + gpio_fallback ? 0 : index, + &of_args); + if (ret) { + rstc = ERR_PTR(ret); + goto out_put; + } + + rstc_id = rcdev->of_xlate(rcdev, &of_args); + of_node_put(of_args.np); + } else if (rcdev->fwnode_xlate) { + rstc_id = rcdev->fwnode_xlate(rcdev, &args); + } if (rstc_id < 0) { rstc = ERR_PTR(rstc_id); - goto out_put; + goto out_put; } flags &= ~RESET_CONTROL_FLAGS_BIT_OPTIONAL; @@ -1190,7 +1224,7 @@ __fwnode_reset_control_get(struct fwnode_handle *fwnode, const char *id, int ind rstc = __reset_control_get_internal(rcdev, rstc_id, flags); out_put: - of_node_put(args.np); + fwnode_handle_put(args.fwnode); return rstc; } diff --git a/include/linux/reset-controller.h b/include/linux/reset-controller.h index 185d2a9bd7cd..52a5a4e81f18 100644 --- a/include/linux/reset-controller.h +++ b/include/linux/reset-controller.h @@ -5,6 +5,8 @@ #include #include +struct fwnode_handle; +struct fwnode_reference_args; struct reset_controller_dev; /** @@ -38,8 +40,12 @@ struct of_phandle_args; * @of_node: corresponding device tree node as phandle target * @of_reset_n_cells: number of cells in reset line specifiers * @of_xlate: translation function to translate from specifier as found in the - * device tree to id as given to the reset control ops, defaults - * to :c:func:`of_reset_simple_xlate`. + * device tree to id as given to the reset control ops + * @fwnode: firmware node associated with this device + * @fwnode_reset_n_cells: number of cells in reset line specifiers + * @fwnode_xlate: translation function to translate from firmware specifier to + * id as given to the reset control ops, defaults to + * :c:func:`fwnode_reset_simple_xlate` * @nr_resets: number of reset controls in this reset controller device * @lock: protects the reset control list from concurrent access */ @@ -53,6 +59,10 @@ struct reset_controller_dev { int of_reset_n_cells; int (*of_xlate)(struct reset_controller_dev *rcdev, const struct of_phandle_args *reset_spec); + struct fwnode_handle *fwnode; + int fwnode_reset_n_cells; + int (*fwnode_xlate)(struct reset_controller_dev *rcdev, + const struct fwnode_reference_args *reset_spec); unsigned int nr_resets; struct mutex lock; }; -- cgit v1.2.3 From b6420bd5aa0c374331bad6c0fa2eb5f0f87cf5a0 Mon Sep 17 00:00:00 2001 From: Jialu Xu Date: Sat, 7 Mar 2026 11:06:26 +0800 Subject: gpio: remove of_get_named_gpio() and All in-tree consumers have been converted to the descriptor-based API. Remove the deprecated of_get_named_gpio() helper, delete the header, and drop the corresponding entry from MAINTAINERS. Also remove the completed TODO item for this cleanup. Signed-off-by: Jialu Xu Reviewed-by: Linus Walleij Link: https://patch.msgid.link/02ABDA1F9E3FAF1F+20260307030623.3495092-6-xujialu@vimux.org Signed-off-by: Bartosz Golaszewski --- MAINTAINERS | 1 - drivers/gpio/TODO | 28 ---------------------------- drivers/gpio/gpiolib-of.c | 27 --------------------------- include/linux/of_gpio.h | 38 -------------------------------------- 4 files changed, 94 deletions(-) delete mode 100644 include/linux/of_gpio.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 55af015174a5..24b3f8d2a64c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10959,7 +10959,6 @@ F: drivers/gpio/ F: include/dt-bindings/gpio/ F: include/linux/gpio.h F: include/linux/gpio/ -F: include/linux/of_gpio.h K: (devm_)?gpio_(request|free|direction|get|set) K: GPIOD_FLAGS_BIT_NONEXCLUSIVE K: devm_gpiod_unhinge diff --git a/drivers/gpio/TODO b/drivers/gpio/TODO index 5acaeab029ec..7ce80fde1f17 100644 --- a/drivers/gpio/TODO +++ b/drivers/gpio/TODO @@ -58,34 +58,6 @@ Work items: ------------------------------------------------------------------------------- -Get rid of - -This header and helpers appeared at one point when there was no proper -driver infrastructure for doing simpler MMIO GPIO devices and there was -no core support for parsing device tree GPIOs from the core library with -the [devm_]gpiod_get() calls we have today that will implicitly go into -the device tree back-end. It is legacy and should not be used in new code. - -Work items: - -- Change all consumer drivers that #include to - #include and stop doing custom parsing of the - GPIO lines from the device tree. This can be tricky and often involves - changing board files, etc. - -- Pull semantics for legacy device tree (OF) GPIO lookups into - gpiolib-of.c: in some cases subsystems are doing custom flags and - lookups for polarity inversion, open drain and what not. As we now - handle this with generic OF bindings, pull all legacy handling into - gpiolib so the library API becomes narrow and deep and handle all - legacy bindings internally. (See e.g. commits 6953c57ab172, - 6a537d48461d etc) - -- Delete when all the above is complete and everything - uses or instead. - -------------------------------------------------------------------------------- - Collect drivers Collect GPIO drivers from arch/* and other places that should be placed diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 3bdd9af67447..c512d735e85f 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -446,32 +445,6 @@ out: return desc; } -/** - * of_get_named_gpio() - Get a GPIO number to use with GPIO API - * @np: device node to get GPIO from - * @propname: Name of property containing gpio specifier(s) - * @index: index of the GPIO - * - * **DEPRECATED** This function is deprecated and must not be used in new code. - * - * Returns: - * GPIO number to use with Linux generic GPIO API, or one of the errno - * value on the error condition. - */ -int of_get_named_gpio(const struct device_node *np, const char *propname, - int index) -{ - struct gpio_desc *desc; - - desc = of_get_named_gpiod_flags(np, propname, index, NULL); - - if (IS_ERR(desc)) - return PTR_ERR(desc); - else - return desc_to_gpio(desc); -} -EXPORT_SYMBOL_GPL(of_get_named_gpio); - /* Converts gpio_lookup_flags into bitmask of GPIO_* values */ static unsigned long of_convert_gpio_flags(enum of_gpio_flags flags) { diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h deleted file mode 100644 index d0f66a5e1b2a..000000000000 --- a/include/linux/of_gpio.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * OF helpers for the GPIO API - * - * Copyright (c) 2007-2008 MontaVista Software, Inc. - * - * Author: Anton Vorontsov - */ - -#ifndef __LINUX_OF_GPIO_H -#define __LINUX_OF_GPIO_H - -#include -#include -#include /* FIXME: Shouldn't be here */ -#include - -struct device_node; - -#ifdef CONFIG_OF_GPIO - -extern int of_get_named_gpio(const struct device_node *np, - const char *list_name, int index); - -#else /* CONFIG_OF_GPIO */ - -#include - -/* Drivers may not strictly depend on the GPIO support, so let them link. */ -static inline int of_get_named_gpio(const struct device_node *np, - const char *propname, int index) -{ - return -ENOSYS; -} - -#endif /* CONFIG_OF_GPIO */ - -#endif /* __LINUX_OF_GPIO_H */ -- cgit v1.2.3 From 9840bb66e7e5dffd72b03201318f154a10b06b4a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 7 Mar 2026 14:54:31 -0500 Subject: vfs: remove externs from fs.h on functions modified by i_ino widening Christoph says, in response to one of the patches in the i_ino widening series, which changes the prototype of several functions in fs.h: "Can you please drop all these pointless externs while you're at it?" Remove extern keyword from functions touched by that patch (and a few that happened to be nearby). Also add missing argument names to declarations that lacked them. Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20260307-iino-u64-v2-1-a1a1696e0653@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 62 +++++++++++++++++++++++++++--------------------------- 1 file changed, 31 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e820c14f9c5a..23f36a2613a3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2912,27 +2912,27 @@ static inline bool name_contains_dotdot(const char *name) #include /* needed for stackable file system support */ -extern loff_t default_llseek(struct file *file, loff_t offset, int whence); +loff_t default_llseek(struct file *file, loff_t offset, int whence); -extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence); +loff_t vfs_llseek(struct file *file, loff_t offset, int whence); -extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t); +int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp); static inline int inode_init_always(struct super_block *sb, struct inode *inode) { return inode_init_always_gfp(sb, inode, GFP_NOFS); } -extern void inode_init_once(struct inode *); -extern void address_space_init_once(struct address_space *mapping); -extern struct inode * igrab(struct inode *); -extern ino_t iunique(struct super_block *, ino_t); -extern int inode_needs_sync(struct inode *inode); -extern int inode_just_drop(struct inode *inode); +void inode_init_once(struct inode *inode); +void address_space_init_once(struct address_space *mapping); +struct inode *igrab(struct inode *inode); +ino_t iunique(struct super_block *sb, ino_t max_reserved); +int inode_needs_sync(struct inode *inode); +int inode_just_drop(struct inode *inode); static inline int inode_generic_drop(struct inode *inode) { return !inode->i_nlink || inode_unhashed(inode); } -extern void d_mark_dontcache(struct inode *inode); +void d_mark_dontcache(struct inode *inode); struct inode *ilookup5_nowait(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), void *data, @@ -2944,31 +2944,31 @@ struct inode *ilookup(struct super_block *sb, u64 ino); struct inode *inode_insert5(struct inode *inode, u64 hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data); -struct inode *iget5_locked(struct super_block *, u64, +struct inode *iget5_locked(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), - int (*set)(struct inode *, void *), void *); -struct inode *iget5_locked_rcu(struct super_block *, u64, + int (*set)(struct inode *, void *), void *data); +struct inode *iget5_locked_rcu(struct super_block *sb, u64 hashval, int (*test)(struct inode *, void *), - int (*set)(struct inode *, void *), void *); -struct inode *iget_locked(struct super_block *, u64); -struct inode *find_inode_nowait(struct super_block *, u64, + int (*set)(struct inode *, void *), void *data); +struct inode *iget_locked(struct super_block *sb, u64 ino); +struct inode *find_inode_nowait(struct super_block *sb, u64 hashval, int (*match)(struct inode *, u64, void *), void *data); -struct inode *find_inode_rcu(struct super_block *, u64, - int (*)(struct inode *, void *), void *); -struct inode *find_inode_by_ino_rcu(struct super_block *, u64); -int insert_inode_locked4(struct inode *, u64, - int (*test)(struct inode *, void *), void *); -extern int insert_inode_locked(struct inode *); +struct inode *find_inode_rcu(struct super_block *sb, u64 hashval, + int (*test)(struct inode *, void *), void *data); +struct inode *find_inode_by_ino_rcu(struct super_block *sb, u64 ino); +int insert_inode_locked4(struct inode *inode, u64 hashval, + int (*test)(struct inode *, void *), void *data); +int insert_inode_locked(struct inode *inode); #ifdef CONFIG_DEBUG_LOCK_ALLOC -extern void lockdep_annotate_inode_mutex_key(struct inode *inode); +void lockdep_annotate_inode_mutex_key(struct inode *inode); #else static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; #endif -extern void unlock_new_inode(struct inode *); -extern void discard_new_inode(struct inode *); -extern unsigned int get_next_ino(void); -extern void evict_inodes(struct super_block *sb); +void unlock_new_inode(struct inode *inode); +void discard_new_inode(struct inode *inode); +unsigned int get_next_ino(void); +void evict_inodes(struct super_block *sb); void dump_mapping(const struct address_space *); /* @@ -3013,21 +3013,21 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap, */ #define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp) -void __insert_inode_hash(struct inode *, u64 hashval); +void __insert_inode_hash(struct inode *inode, u64 hashval); static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } -extern void __remove_inode_hash(struct inode *); +void __remove_inode_hash(struct inode *inode); static inline void remove_inode_hash(struct inode *inode) { if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) __remove_inode_hash(inode); } -extern void inode_sb_list_add(struct inode *inode); -extern void inode_lru_list_add(struct inode *inode); +void inode_sb_list_add(struct inode *inode); +void inode_lru_list_add(struct inode *inode); int generic_file_mmap(struct file *, struct vm_area_struct *); int generic_file_mmap_prepare(struct vm_area_desc *desc); -- cgit v1.2.3 From a6fe20d67dc7d512f9b5dc11c5777fb1e1ff70ce Mon Sep 17 00:00:00 2001 From: Maciej Strozek Date: Fri, 6 Mar 2026 15:28:10 +0000 Subject: mfd: cs42l43: Add support for the B variant Introducing CS42L43B codec, a variant of CS42L43 which can be driven by the same driver. Changes in CS42L43 driver specific for CS42L43B: - Decimator 1 and 2 are dedicated to ADC, can't be selected for PDM - Decimators 3 and 4 are connected to PDM1 - Added Decimator 5 and 6 for PDM2 - Supports SoundWire Clock Gearing - Updated ROM requiring no patching - Reduced RAM space - Each ISRC has 4 decimators now Signed-off-by: Maciej Strozek Acked-by: Lee Jones Reviewed-by: Charles Keepax Link: https://patch.msgid.link/20260306152829.3130530-4-mstrozek@opensource.cirrus.com Signed-off-by: Mark Brown --- drivers/mfd/cs42l43-i2c.c | 7 ++- drivers/mfd/cs42l43-sdw.c | 4 +- drivers/mfd/cs42l43.c | 93 ++++++++++++++++++++++++++++++++++------ drivers/mfd/cs42l43.h | 2 +- include/linux/mfd/cs42l43-regs.h | 76 ++++++++++++++++++++++++++++++++ include/linux/mfd/cs42l43.h | 1 + 6 files changed, 166 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/cs42l43-i2c.c b/drivers/mfd/cs42l43-i2c.c index a2ab001a600a..0a0ab5e549a5 100644 --- a/drivers/mfd/cs42l43-i2c.c +++ b/drivers/mfd/cs42l43-i2c.c @@ -47,6 +47,7 @@ static int cs42l43_i2c_probe(struct i2c_client *i2c) cs42l43->irq = i2c->irq; /* A device on an I2C is always attached by definition. */ cs42l43->attached = true; + cs42l43->variant_id = (long)device_get_match_data(cs42l43->dev); cs42l43->regmap = devm_regmap_init_i2c(i2c, &cs42l43_i2c_regmap); if (IS_ERR(cs42l43->regmap)) @@ -58,7 +59,8 @@ static int cs42l43_i2c_probe(struct i2c_client *i2c) #if IS_ENABLED(CONFIG_OF) static const struct of_device_id cs42l43_of_match[] = { - { .compatible = "cirrus,cs42l43", }, + { .compatible = "cirrus,cs42l43", .data = (void *)CS42L43_DEVID_VAL }, + { .compatible = "cirrus,cs42l43b", .data = (void *)CS42L43B_DEVID_VAL }, {} }; MODULE_DEVICE_TABLE(of, cs42l43_of_match); @@ -66,7 +68,8 @@ MODULE_DEVICE_TABLE(of, cs42l43_of_match); #if IS_ENABLED(CONFIG_ACPI) static const struct acpi_device_id cs42l43_acpi_match[] = { - { "CSC4243", 0 }, + { "CSC4243", CS42L43_DEVID_VAL }, + { "CSC2A3B", CS42L43B_DEVID_VAL }, {} }; MODULE_DEVICE_TABLE(acpi, cs42l43_acpi_match); diff --git a/drivers/mfd/cs42l43-sdw.c b/drivers/mfd/cs42l43-sdw.c index 023f7e1a30f8..794c98378175 100644 --- a/drivers/mfd/cs42l43-sdw.c +++ b/drivers/mfd/cs42l43-sdw.c @@ -178,6 +178,7 @@ static int cs42l43_sdw_probe(struct sdw_slave *sdw, const struct sdw_device_id * cs42l43->dev = dev; cs42l43->sdw = sdw; + cs42l43->variant_id = (long)id->driver_data; cs42l43->regmap = devm_regmap_init_sdw(sdw, &cs42l43_sdw_regmap); if (IS_ERR(cs42l43->regmap)) @@ -188,7 +189,8 @@ static int cs42l43_sdw_probe(struct sdw_slave *sdw, const struct sdw_device_id * } static const struct sdw_device_id cs42l43_sdw_id[] = { - SDW_SLAVE_ENTRY(0x01FA, 0x4243, 0), + SDW_SLAVE_ENTRY(0x01FA, 0x4243, (void *) CS42L43_DEVID_VAL), + SDW_SLAVE_ENTRY(0x01FA, 0x2A3B, (void *) CS42L43B_DEVID_VAL), {} }; MODULE_DEVICE_TABLE(sdw, cs42l43_sdw_id); diff --git a/drivers/mfd/cs42l43.c b/drivers/mfd/cs42l43.c index 107cfb983fec..166881751e69 100644 --- a/drivers/mfd/cs42l43.c +++ b/drivers/mfd/cs42l43.c @@ -115,9 +115,14 @@ const struct reg_default cs42l43_reg_default[CS42L43_N_DEFAULTS] = { { CS42L43_DECIM_HPF_WNF_CTRL2, 0x00000001 }, { CS42L43_DECIM_HPF_WNF_CTRL3, 0x00000001 }, { CS42L43_DECIM_HPF_WNF_CTRL4, 0x00000001 }, + { CS42L43B_DECIM_HPF_WNF_CTRL5, 0x00000001 }, + { CS42L43B_DECIM_HPF_WNF_CTRL6, 0x00000001 }, { CS42L43_DMIC_PDM_CTRL, 0x00000000 }, { CS42L43_DECIM_VOL_CTRL_CH1_CH2, 0x20122012 }, { CS42L43_DECIM_VOL_CTRL_CH3_CH4, 0x20122012 }, + { CS42L43B_DECIM_VOL_CTRL_CH1_CH2, 0x20122012 }, + { CS42L43B_DECIM_VOL_CTRL_CH3_CH4, 0x20122012 }, + { CS42L43B_DECIM_VOL_CTRL_CH5_CH6, 0x20122012 }, { CS42L43_INTP_VOLUME_CTRL1, 0x00000180 }, { CS42L43_INTP_VOLUME_CTRL2, 0x00000180 }, { CS42L43_AMP1_2_VOL_RAMP, 0x00000022 }, @@ -155,8 +160,12 @@ const struct reg_default cs42l43_reg_default[CS42L43_N_DEFAULTS] = { { CS42L43_SWIRE_DP2_CH2_INPUT, 0x00000000 }, { CS42L43_SWIRE_DP3_CH1_INPUT, 0x00000000 }, { CS42L43_SWIRE_DP3_CH2_INPUT, 0x00000000 }, + { CS42L43B_SWIRE_DP3_CH3_INPUT, 0x00000000 }, + { CS42L43B_SWIRE_DP3_CH4_INPUT, 0x00000000 }, { CS42L43_SWIRE_DP4_CH1_INPUT, 0x00000000 }, { CS42L43_SWIRE_DP4_CH2_INPUT, 0x00000000 }, + { CS42L43B_SWIRE_DP4_CH3_INPUT, 0x00000000 }, + { CS42L43B_SWIRE_DP4_CH4_INPUT, 0x00000000 }, { CS42L43_ASRC_INT1_INPUT1, 0x00000000 }, { CS42L43_ASRC_INT2_INPUT1, 0x00000000 }, { CS42L43_ASRC_INT3_INPUT1, 0x00000000 }, @@ -169,10 +178,14 @@ const struct reg_default cs42l43_reg_default[CS42L43_N_DEFAULTS] = { { CS42L43_ISRC1INT2_INPUT1, 0x00000000 }, { CS42L43_ISRC1DEC1_INPUT1, 0x00000000 }, { CS42L43_ISRC1DEC2_INPUT1, 0x00000000 }, + { CS42L43B_ISRC1DEC3_INPUT1, 0x00000000 }, + { CS42L43B_ISRC1DEC4_INPUT1, 0x00000000 }, { CS42L43_ISRC2INT1_INPUT1, 0x00000000 }, { CS42L43_ISRC2INT2_INPUT1, 0x00000000 }, { CS42L43_ISRC2DEC1_INPUT1, 0x00000000 }, { CS42L43_ISRC2DEC2_INPUT1, 0x00000000 }, + { CS42L43B_ISRC2DEC3_INPUT1, 0x00000000 }, + { CS42L43B_ISRC2DEC4_INPUT1, 0x00000000 }, { CS42L43_EQ1MIX_INPUT1, 0x00800000 }, { CS42L43_EQ1MIX_INPUT2, 0x00800000 }, { CS42L43_EQ1MIX_INPUT3, 0x00800000 }, @@ -269,6 +282,8 @@ EXPORT_SYMBOL_NS_GPL(cs42l43_reg_default, "MFD_CS42L43"); bool cs42l43_readable_register(struct device *dev, unsigned int reg) { + struct cs42l43 *cs42l43 = dev_get_drvdata(dev); + switch (reg) { case CS42L43_DEVID: case CS42L43_REVID: @@ -292,7 +307,6 @@ bool cs42l43_readable_register(struct device *dev, unsigned int reg) case CS42L43_ADC_B_CTRL1 ... CS42L43_ADC_B_CTRL2: case CS42L43_DECIM_HPF_WNF_CTRL1 ... CS42L43_DECIM_HPF_WNF_CTRL4: case CS42L43_DMIC_PDM_CTRL: - case CS42L43_DECIM_VOL_CTRL_CH1_CH2 ... CS42L43_DECIM_VOL_CTRL_CH3_CH4: case CS42L43_INTP_VOLUME_CTRL1 ... CS42L43_INTP_VOLUME_CTRL2: case CS42L43_AMP1_2_VOL_RAMP: case CS42L43_ASP_CTRL: @@ -387,8 +401,16 @@ bool cs42l43_readable_register(struct device *dev, unsigned int reg) case CS42L43_BOOT_CONTROL: case CS42L43_BLOCK_EN: case CS42L43_SHUTTER_CONTROL: - case CS42L43_MCU_SW_REV ... CS42L43_MCU_RAM_MAX: - return true; + case CS42L43B_MCU_SW_REV ... CS42L43B_MCU_RAM_MAX: + return true; // registers present on all variants + case CS42L43_MCU_SW_REV ... CS42L43B_MCU_SW_REV - 1: + case CS42L43B_MCU_RAM_MAX + 1 ... CS42L43_MCU_RAM_MAX: + case CS42L43_DECIM_VOL_CTRL_CH1_CH2 ... CS42L43_DECIM_VOL_CTRL_CH3_CH4: + return cs42l43->variant_id == CS42L43_DEVID_VAL; // regs only in CS42L43 variant + case CS42L43B_DECIM_VOL_CTRL_CH1_CH2 ... CS42L43B_DECIM_HPF_WNF_CTRL6: + case CS42L43B_SWIRE_DP3_CH3_INPUT ... CS42L43B_SWIRE_DP4_CH4_INPUT: + case CS42L43B_ISRC1DEC3_INPUT1 ... CS42L43B_ISRC2DEC4_INPUT1: + return cs42l43->variant_id == CS42L43B_DEVID_VAL; // regs only in CS42L43B variant default: return false; } @@ -597,15 +619,27 @@ static int cs42l43_wait_for_attach(struct cs42l43 *cs42l43) static int cs42l43_mcu_stage_2_3(struct cs42l43 *cs42l43, bool shadow) { unsigned int need_reg = CS42L43_NEED_CONFIGS; + unsigned int boot_reg; unsigned int val; int ret; - if (shadow) - need_reg = CS42L43_FW_SH_BOOT_CFG_NEED_CONFIGS; + switch (cs42l43->variant_id) { + case CS42L43_DEVID_VAL: + if (shadow) + need_reg = CS42L43_FW_SH_BOOT_CFG_NEED_CONFIGS; + boot_reg = CS42L43_BOOT_STATUS; + break; + case CS42L43B_DEVID_VAL: + need_reg = CS42L43B_NEED_CONFIGS; + boot_reg = CS42L43B_BOOT_STATUS; + break; + default: + return -EINVAL; + } regmap_write(cs42l43->regmap, need_reg, 0); - ret = regmap_read_poll_timeout(cs42l43->regmap, CS42L43_BOOT_STATUS, + ret = regmap_read_poll_timeout(cs42l43->regmap, boot_reg, val, (val == CS42L43_MCU_BOOT_STAGE3), CS42L43_MCU_POLL_US, CS42L43_MCU_CMD_TIMEOUT_US); if (ret) { @@ -644,13 +678,25 @@ static int cs42l43_mcu_stage_3_2(struct cs42l43 *cs42l43) */ static int cs42l43_mcu_disable(struct cs42l43 *cs42l43) { - unsigned int val; + unsigned int val, cfg_reg, ctrl_reg; int ret; - regmap_write(cs42l43->regmap, CS42L43_FW_MISSION_CTRL_MM_MCU_CFG_REG, - CS42L43_FW_MISSION_CTRL_MM_MCU_CFG_DISABLE_VAL); - regmap_write(cs42l43->regmap, CS42L43_FW_MISSION_CTRL_MM_CTRL_SELECTION, - CS42L43_FW_MM_CTRL_MCU_SEL_MASK); + switch (cs42l43->variant_id) { + case CS42L43_DEVID_VAL: + cfg_reg = CS42L43_FW_MISSION_CTRL_MM_MCU_CFG_REG; + ctrl_reg = CS42L43_FW_MISSION_CTRL_MM_CTRL_SELECTION; + break; + case CS42L43B_DEVID_VAL: + cfg_reg = CS42L43B_FW_MISSION_CTRL_MM_MCU_CFG_REG; + ctrl_reg = CS42L43B_FW_MISSION_CTRL_MM_CTRL_SELECTION; + break; + default: + return -EINVAL; + } + + regmap_write(cs42l43->regmap, cfg_reg, CS42L43_FW_MISSION_CTRL_MM_MCU_CFG_DISABLE_VAL); + regmap_write(cs42l43->regmap, ctrl_reg, CS42L43_FW_MM_CTRL_MCU_SEL_MASK); + regmap_write(cs42l43->regmap, CS42L43_MCU_SW_INTERRUPT, CS42L43_CONTROL_IND_MASK); regmap_write(cs42l43->regmap, CS42L43_MCU_SW_INTERRUPT, 0); @@ -740,18 +786,32 @@ static int cs42l43_mcu_update_step(struct cs42l43 *cs42l43) { unsigned int mcu_rev, bios_rev, boot_status, secure_cfg; bool patched, shadow; + int boot_status_reg, mcu_sw_rev_reg; int ret; + switch (cs42l43->variant_id) { + case CS42L43_DEVID_VAL: + boot_status_reg = CS42L43_BOOT_STATUS; + mcu_sw_rev_reg = CS42L43_MCU_SW_REV; + break; + case CS42L43B_DEVID_VAL: + boot_status_reg = CS42L43B_BOOT_STATUS; + mcu_sw_rev_reg = CS42L43B_MCU_SW_REV; + break; + default: + return -EINVAL; + } + /* Clear any stale software interrupt bits. */ regmap_read(cs42l43->regmap, CS42L43_SOFT_INT, &mcu_rev); - ret = regmap_read(cs42l43->regmap, CS42L43_BOOT_STATUS, &boot_status); + ret = regmap_read(cs42l43->regmap, boot_status_reg, &boot_status); if (ret) { dev_err(cs42l43->dev, "Failed to read boot status: %d\n", ret); return ret; } - ret = regmap_read(cs42l43->regmap, CS42L43_MCU_SW_REV, &mcu_rev); + ret = regmap_read(cs42l43->regmap, mcu_sw_rev_reg, &mcu_rev); if (ret) { dev_err(cs42l43->dev, "Failed to read firmware revision: %d\n", ret); return ret; @@ -918,6 +978,13 @@ static void cs42l43_boot_work(struct work_struct *work) switch (devid) { case CS42L43_DEVID_VAL: + case CS42L43B_DEVID_VAL: + if (devid != cs42l43->variant_id) { + dev_err(cs42l43->dev, + "Device ID (0x%06x) does not match variant ID (0x%06lx)\n", + devid, cs42l43->variant_id); + goto err; + } break; default: dev_err(cs42l43->dev, "Unrecognised devid: 0x%06x\n", devid); diff --git a/drivers/mfd/cs42l43.h b/drivers/mfd/cs42l43.h index f3da783930f5..a0068f6572e2 100644 --- a/drivers/mfd/cs42l43.h +++ b/drivers/mfd/cs42l43.h @@ -9,7 +9,7 @@ #ifndef CS42L43_CORE_INT_H #define CS42L43_CORE_INT_H -#define CS42L43_N_DEFAULTS 176 +#define CS42L43_N_DEFAULTS 189 struct dev_pm_ops; struct device; diff --git a/include/linux/mfd/cs42l43-regs.h b/include/linux/mfd/cs42l43-regs.h index c39a49269cb7..68831f113589 100644 --- a/include/linux/mfd/cs42l43-regs.h +++ b/include/linux/mfd/cs42l43-regs.h @@ -1181,4 +1181,80 @@ /* CS42L43_FW_MISSION_CTRL_MM_MCU_CFG_REG */ #define CS42L43_FW_MISSION_CTRL_MM_MCU_CFG_DISABLE_VAL 0xF05AA50F +/* CS42L43B VARIANT REGISTERS */ +#define CS42L43B_DEVID_VAL 0x0042A43B + +#define CS42L43B_DECIM_VOL_CTRL_CH1_CH2 0x00008280 +#define CS42L43B_DECIM_VOL_CTRL_CH3_CH4 0x00008284 + +#define CS42L43B_DECIM_VOL_CTRL_CH5_CH6 0x00008290 +#define CS42L43B_DECIM_VOL_CTRL_UPDATE 0x0000829C + +#define CS42L43B_DECIM_HPF_WNF_CTRL5 0x000082A0 +#define CS42L43B_DECIM_HPF_WNF_CTRL6 0x000082A4 + +#define CS42L43B_SWIRE_DP3_CH3_INPUT 0x0000C320 +#define CS42L43B_SWIRE_DP3_CH4_INPUT 0x0000C330 +#define CS42L43B_SWIRE_DP4_CH3_INPUT 0x0000C340 +#define CS42L43B_SWIRE_DP4_CH4_INPUT 0x0000C350 + +#define CS42L43B_ISRC1DEC3_INPUT1 0x0000C780 +#define CS42L43B_ISRC1DEC4_INPUT1 0x0000C790 +#define CS42L43B_ISRC2DEC3_INPUT1 0x0000C7A0 +#define CS42L43B_ISRC2DEC4_INPUT1 0x0000C7B0 + +#define CS42L43B_FW_MISSION_CTRL_NEED_CONFIGS 0x00117E00 +#define CS42L43B_FW_MISSION_CTRL_HAVE_CONFIGS 0x00117E04 +#define CS42L43B_FW_MISSION_CTRL_PATCH_START_ADDR_REG 0x00117E08 +#define CS42L43B_FW_MISSION_CTRL_MM_CTRL_SELECTION 0x00117E0C +#define CS42L43B_FW_MISSION_CTRL_MM_MCU_CFG_REG 0x00117E10 + +#define CS42L43B_MCU_SW_REV 0x00117314 +#define CS42L43B_PATCH_START_ADDR 0x00117318 +#define CS42L43B_CONFIG_SELECTION 0x0011731C +#define CS42L43B_NEED_CONFIGS 0x00117320 +#define CS42L43B_BOOT_STATUS 0x00117330 + +#define CS42L43B_FW_MISSION_CTRL_NEED_CONFIGS 0x00117E00 +#define CS42L43B_FW_MISSION_CTRL_HAVE_CONFIGS 0x00117E04 +#define CS42L43B_FW_MISSION_CTRL_PATCH_START_ADDR_REG 0x00117E08 +#define CS42L43B_FW_MISSION_CTRL_MM_CTRL_SELECTION 0x00117E0C +#define CS42L43B_FW_MISSION_CTRL_MM_MCU_CFG_REG 0x00117E10 + +#define CS42L43B_MCU_RAM_MAX 0x00117FFF + +/* CS42L43B_DECIM_DECIM_VOL_CTRL_CH5_CH6 */ +#define CS42L43B_DECIM6_MUTE_MASK 0x80000000 +#define CS42L43B_DECIM6_MUTE_SHIFT 31 +#define CS42L43B_DECIM6_VOL_MASK 0x3FC00000 +#define CS42L43B_DECIM6_VOL_SHIFT 22 +#define CS42L43B_DECIM6_PATH1_VOL_FALL_RATE_MASK 0x00380000 +#define CS42L43B_DECIM6_PATH1_VOL_FALL_RATE_SHIFT 19 +#define CS42L43B_DECIM6_PATH1_VOL_RISE_RATE_MASK 0x00070000 +#define CS42L43B_DECIM6_PATH1_VOL_RISE_RATE_SHIFT 16 +#define CS42L43B_DECIM5_MUTE_MASK 0x00008000 +#define CS42L43B_DECIM5_MUTE_SHIFT 15 +#define CS42L43B_DECIM5_VOL_MASK 0x00003FC0 +#define CS42L43B_DECIM5_VOL_SHIFT 6 +#define CS42L43B_DECIM5_PATH1_VOL_FALL_RATE_MASK 0x00000038 +#define CS42L43B_DECIM5_PATH1_VOL_FALL_RATE_SHIFT 3 +#define CS42L43B_DECIM5_PATH1_VOL_RISE_RATE_MASK 0x00000007 +#define CS42L43B_DECIM5_PATH1_VOL_RISE_RATE_SHIFT 0 + +/* CS42L43B_DECIM_VOL_CTRL_UPDATE */ +#define CS42L43B_DECIM6_PATH1_VOL_TRIG_MASK 0x00000800 +#define CS42L43B_DECIM6_PATH1_VOL_TRIG_SHIFT 11 +#define CS42L43B_DECIM5_PATH1_VOL_TRIG_MASK 0x00000100 +#define CS42L43B_DECIM5_PATH1_VOL_TRIG_SHIFT 8 +#define CS42L43B_DECIM4_VOL_UPDATE_MASK 0x00000020 +#define CS42L43B_DECIM4_VOL_UPDATE_SHIFT 5 + +/* CS42L43_ISRC1_CTRL..CS42L43_ISRC2_CTRL */ +#define CS42L43B_ISRC_DEC4_EN_MASK 0x00000008 +#define CS42L43B_ISRC_DEC4_EN_SHIFT 3 +#define CS42L43B_ISRC_DEC4_EN_WIDTH 1 +#define CS42L43B_ISRC_DEC3_EN_MASK 0x00000004 +#define CS42L43B_ISRC_DEC3_EN_SHIFT 2 +#define CS42L43B_ISRC_DEC3_EN_WIDTH 1 + #endif /* CS42L43_CORE_REGS_H */ diff --git a/include/linux/mfd/cs42l43.h b/include/linux/mfd/cs42l43.h index 2239d8585e78..ff0f7e365a19 100644 --- a/include/linux/mfd/cs42l43.h +++ b/include/linux/mfd/cs42l43.h @@ -98,6 +98,7 @@ struct cs42l43 { bool sdw_pll_active; bool attached; bool hw_lock; + long variant_id; }; #endif /* CS42L43_CORE_EXT_H */ -- cgit v1.2.3 From 3a005126c9d7f30093627a6f329656c358e16b3a Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 25 Feb 2026 16:41:37 -0500 Subject: dmaengine: of_dma: Add devm_of_dma_controller_register() Add a managed API, devm_of_dma_controller_register(), to simplify DMA engine controller registration by automatically handling resource cleanup. Signed-off-by: Frank Li Link: https://patch.msgid.link/20260225-mxsdma-module-v3-1-8f798b13baa6@nxp.com Signed-off-by: Vinod Koul --- include/linux/of_dma.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_dma.h b/include/linux/of_dma.h index fd706cdf255c..16b08234d03b 100644 --- a/include/linux/of_dma.h +++ b/include/linux/of_dma.h @@ -38,6 +38,26 @@ extern int of_dma_controller_register(struct device_node *np, void *data); extern void of_dma_controller_free(struct device_node *np); +static void __of_dma_controller_free(void *np) +{ + of_dma_controller_free(np); +} + +static inline int +devm_of_dma_controller_register(struct device *dev, struct device_node *np, + struct dma_chan *(*of_dma_xlate) + (struct of_phandle_args *, struct of_dma *), + void *data) +{ + int ret; + + ret = of_dma_controller_register(np, of_dma_xlate, data); + if (ret) + return ret; + + return devm_add_action_or_reset(dev, __of_dma_controller_free, np); +} + extern int of_dma_router_register(struct device_node *np, void *(*of_dma_route_allocate) (struct of_phandle_args *, struct of_dma *), @@ -64,6 +84,15 @@ static inline void of_dma_controller_free(struct device_node *np) { } +static inline int +devm_of_dma_controller_register(struct device *dev, struct device_node *np, + struct dma_chan *(*of_dma_xlate) + (struct of_phandle_args *, struct of_dma *), + void *data) +{ + return -ENODEV; +} + static inline int of_dma_router_register(struct device_node *np, void *(*of_dma_route_allocate) (struct of_phandle_args *, struct of_dma *), -- cgit v1.2.3 From 9ded47ad003f09a94b6a710b5c47f4aa5ceb7429 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 24 Feb 2026 09:25:54 +0100 Subject: fbdev: defio: Disconnect deferred I/O from the lifetime of struct fb_info Hold state of deferred I/O in struct fb_deferred_io_state. Allocate an instance as part of initializing deferred I/O and remove it only after the final mapping has been closed. If the fb_info and the contained deferred I/O meanwhile goes away, clear struct fb_deferred_io_state.info to invalidate the mapping. Any access will then result in a SIGBUS signal. Fixes a long-standing problem, where a device hot-unplug happens while user space still has an active mapping of the graphics memory. The hot- unplug frees the instance of struct fb_info. Accessing the memory will operate on undefined state. Signed-off-by: Thomas Zimmermann Fixes: 60b59beafba8 ("fbdev: mm: Deferred IO support") Cc: Helge Deller Cc: linux-fbdev@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: stable@vger.kernel.org # v2.6.22+ Signed-off-by: Helge Deller --- drivers/video/fbdev/core/fb_defio.c | 178 ++++++++++++++++++++++++++++-------- include/linux/fb.h | 4 +- 2 files changed, 145 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index ca48b89a323d..93bd2f696fa4 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -24,6 +24,75 @@ #include #include +/* + * struct fb_deferred_io_state + */ + +struct fb_deferred_io_state { + struct kref ref; + + struct mutex lock; /* mutex that protects the pageref list */ + /* fields protected by lock */ + struct fb_info *info; +}; + +static struct fb_deferred_io_state *fb_deferred_io_state_alloc(void) +{ + struct fb_deferred_io_state *fbdefio_state; + + fbdefio_state = kzalloc_obj(*fbdefio_state); + if (!fbdefio_state) + return NULL; + + kref_init(&fbdefio_state->ref); + mutex_init(&fbdefio_state->lock); + + return fbdefio_state; +} + +static void fb_deferred_io_state_release(struct fb_deferred_io_state *fbdefio_state) +{ + mutex_destroy(&fbdefio_state->lock); + + kfree(fbdefio_state); +} + +static void fb_deferred_io_state_get(struct fb_deferred_io_state *fbdefio_state) +{ + kref_get(&fbdefio_state->ref); +} + +static void __fb_deferred_io_state_release(struct kref *ref) +{ + struct fb_deferred_io_state *fbdefio_state = + container_of(ref, struct fb_deferred_io_state, ref); + + fb_deferred_io_state_release(fbdefio_state); +} + +static void fb_deferred_io_state_put(struct fb_deferred_io_state *fbdefio_state) +{ + kref_put(&fbdefio_state->ref, __fb_deferred_io_state_release); +} + +/* + * struct vm_operations_struct + */ + +static void fb_deferred_io_vm_open(struct vm_area_struct *vma) +{ + struct fb_deferred_io_state *fbdefio_state = vma->vm_private_data; + + fb_deferred_io_state_get(fbdefio_state); +} + +static void fb_deferred_io_vm_close(struct vm_area_struct *vma) +{ + struct fb_deferred_io_state *fbdefio_state = vma->vm_private_data; + + fb_deferred_io_state_put(fbdefio_state); +} + static struct page *fb_deferred_io_get_page(struct fb_info *info, unsigned long offs) { struct fb_deferred_io *fbdefio = info->fbdefio; @@ -121,25 +190,46 @@ static void fb_deferred_io_pageref_put(struct fb_deferred_io_pageref *pageref, /* this is to find and return the vmalloc-ed fb pages */ static vm_fault_t fb_deferred_io_fault(struct vm_fault *vmf) { + struct fb_info *info; unsigned long offset; struct page *page; - struct fb_info *info = vmf->vma->vm_private_data; + vm_fault_t ret; + struct fb_deferred_io_state *fbdefio_state = vmf->vma->vm_private_data; + + mutex_lock(&fbdefio_state->lock); + + info = fbdefio_state->info; + if (!info) { + ret = VM_FAULT_SIGBUS; /* our device is gone */ + goto err_mutex_unlock; + } offset = vmf->pgoff << PAGE_SHIFT; - if (offset >= info->fix.smem_len) - return VM_FAULT_SIGBUS; + if (offset >= info->fix.smem_len) { + ret = VM_FAULT_SIGBUS; + goto err_mutex_unlock; + } page = fb_deferred_io_get_page(info, offset); - if (!page) - return VM_FAULT_SIGBUS; + if (!page) { + ret = VM_FAULT_SIGBUS; + goto err_mutex_unlock; + } if (!vmf->vma->vm_file) fb_err(info, "no mapping available\n"); BUG_ON(!info->fbdefio->mapping); + mutex_unlock(&fbdefio_state->lock); + vmf->page = page; + return 0; + +err_mutex_unlock: + mutex_unlock(&fbdefio_state->lock); + return ret; } int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasync) @@ -166,15 +256,24 @@ EXPORT_SYMBOL_GPL(fb_deferred_io_fsync); * Adds a page to the dirty list. Call this from struct * vm_operations_struct.page_mkwrite. */ -static vm_fault_t fb_deferred_io_track_page(struct fb_info *info, unsigned long offset, - struct page *page) +static vm_fault_t fb_deferred_io_track_page(struct fb_deferred_io_state *fbdefio_state, + unsigned long offset, struct page *page) { - struct fb_deferred_io *fbdefio = info->fbdefio; + struct fb_info *info; + struct fb_deferred_io *fbdefio; struct fb_deferred_io_pageref *pageref; vm_fault_t ret; /* protect against the workqueue changing the page list */ - mutex_lock(&fbdefio->lock); + mutex_lock(&fbdefio_state->lock); + + info = fbdefio_state->info; + if (!info) { + ret = VM_FAULT_SIGBUS; /* our device is gone */ + goto err_mutex_unlock; + } + + fbdefio = info->fbdefio; pageref = fb_deferred_io_pageref_get(info, offset, page); if (WARN_ON_ONCE(!pageref)) { @@ -192,50 +291,38 @@ static vm_fault_t fb_deferred_io_track_page(struct fb_info *info, unsigned long */ lock_page(pageref->page); - mutex_unlock(&fbdefio->lock); + mutex_unlock(&fbdefio_state->lock); /* come back after delay to process the deferred IO */ schedule_delayed_work(&info->deferred_work, fbdefio->delay); return VM_FAULT_LOCKED; err_mutex_unlock: - mutex_unlock(&fbdefio->lock); + mutex_unlock(&fbdefio_state->lock); return ret; } -/* - * fb_deferred_io_page_mkwrite - Mark a page as written for deferred I/O - * @fb_info: The fbdev info structure - * @vmf: The VM fault - * - * This is a callback we get when userspace first tries to - * write to the page. We schedule a workqueue. That workqueue - * will eventually mkclean the touched pages and execute the - * deferred framebuffer IO. Then if userspace touches a page - * again, we repeat the same scheme. - * - * Returns: - * VM_FAULT_LOCKED on success, or a VM_FAULT error otherwise. - */ -static vm_fault_t fb_deferred_io_page_mkwrite(struct fb_info *info, struct vm_fault *vmf) +static vm_fault_t fb_deferred_io_page_mkwrite(struct fb_deferred_io_state *fbdefio_state, + struct vm_fault *vmf) { unsigned long offset = vmf->pgoff << PAGE_SHIFT; struct page *page = vmf->page; file_update_time(vmf->vma->vm_file); - return fb_deferred_io_track_page(info, offset, page); + return fb_deferred_io_track_page(fbdefio_state, offset, page); } -/* vm_ops->page_mkwrite handler */ static vm_fault_t fb_deferred_io_mkwrite(struct vm_fault *vmf) { - struct fb_info *info = vmf->vma->vm_private_data; + struct fb_deferred_io_state *fbdefio_state = vmf->vma->vm_private_data; - return fb_deferred_io_page_mkwrite(info, vmf); + return fb_deferred_io_page_mkwrite(fbdefio_state, vmf); } static const struct vm_operations_struct fb_deferred_io_vm_ops = { + .open = fb_deferred_io_vm_open, + .close = fb_deferred_io_vm_close, .fault = fb_deferred_io_fault, .page_mkwrite = fb_deferred_io_mkwrite, }; @@ -252,7 +339,10 @@ int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma) vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); if (!(info->flags & FBINFO_VIRTFB)) vm_flags_set(vma, VM_IO); - vma->vm_private_data = info; + vma->vm_private_data = info->fbdefio_state; + + fb_deferred_io_state_get(info->fbdefio_state); /* released in vma->vm_ops->close() */ + return 0; } EXPORT_SYMBOL_GPL(fb_deferred_io_mmap); @@ -263,9 +353,10 @@ static void fb_deferred_io_work(struct work_struct *work) struct fb_info *info = container_of(work, struct fb_info, deferred_work.work); struct fb_deferred_io_pageref *pageref, *next; struct fb_deferred_io *fbdefio = info->fbdefio; + struct fb_deferred_io_state *fbdefio_state = info->fbdefio_state; /* here we wrprotect the page's mappings, then do all deferred IO. */ - mutex_lock(&fbdefio->lock); + mutex_lock(&fbdefio_state->lock); #ifdef CONFIG_MMU list_for_each_entry(pageref, &fbdefio->pagereflist, list) { struct page *page = pageref->page; @@ -283,12 +374,13 @@ static void fb_deferred_io_work(struct work_struct *work) list_for_each_entry_safe(pageref, next, &fbdefio->pagereflist, list) fb_deferred_io_pageref_put(pageref, info); - mutex_unlock(&fbdefio->lock); + mutex_unlock(&fbdefio_state->lock); } int fb_deferred_io_init(struct fb_info *info) { struct fb_deferred_io *fbdefio = info->fbdefio; + struct fb_deferred_io_state *fbdefio_state; struct fb_deferred_io_pageref *pagerefs; unsigned long npagerefs; int ret; @@ -298,7 +390,11 @@ int fb_deferred_io_init(struct fb_info *info) if (WARN_ON(!info->fix.smem_len)) return -EINVAL; - mutex_init(&fbdefio->lock); + fbdefio_state = fb_deferred_io_state_alloc(); + if (!fbdefio_state) + return -ENOMEM; + fbdefio_state->info = info; + INIT_DELAYED_WORK(&info->deferred_work, fb_deferred_io_work); INIT_LIST_HEAD(&fbdefio->pagereflist); if (fbdefio->delay == 0) /* set a default of 1 s */ @@ -315,10 +411,12 @@ int fb_deferred_io_init(struct fb_info *info) info->npagerefs = npagerefs; info->pagerefs = pagerefs; + info->fbdefio_state = fbdefio_state; + return 0; err: - mutex_destroy(&fbdefio->lock); + fb_deferred_io_state_release(fbdefio_state); return ret; } EXPORT_SYMBOL_GPL(fb_deferred_io_init); @@ -352,11 +450,19 @@ EXPORT_SYMBOL_GPL(fb_deferred_io_release); void fb_deferred_io_cleanup(struct fb_info *info) { struct fb_deferred_io *fbdefio = info->fbdefio; + struct fb_deferred_io_state *fbdefio_state = info->fbdefio_state; fb_deferred_io_lastclose(info); + info->fbdefio_state = NULL; + + mutex_lock(&fbdefio_state->lock); + fbdefio_state->info = NULL; + mutex_unlock(&fbdefio_state->lock); + + fb_deferred_io_state_put(fbdefio_state); + kvfree(info->pagerefs); - mutex_destroy(&fbdefio->lock); fbdefio->mapping = NULL; } EXPORT_SYMBOL_GPL(fb_deferred_io_cleanup); diff --git a/include/linux/fb.h b/include/linux/fb.h index 6d4a58084fd5..aed17567fe50 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -218,13 +218,14 @@ struct fb_deferred_io { unsigned long delay; bool sort_pagereflist; /* sort pagelist by offset */ int open_count; /* number of opened files; protected by fb_info lock */ - struct mutex lock; /* mutex that protects the pageref list */ struct list_head pagereflist; /* list of pagerefs for touched pages */ struct address_space *mapping; /* page cache object for fb device */ /* callback */ struct page *(*get_page)(struct fb_info *info, unsigned long offset); void (*deferred_io)(struct fb_info *info, struct list_head *pagelist); }; + +struct fb_deferred_io_state; #endif /* @@ -487,6 +488,7 @@ struct fb_info { unsigned long npagerefs; struct fb_deferred_io_pageref *pagerefs; struct fb_deferred_io *fbdefio; + struct fb_deferred_io_state *fbdefio_state; #endif const struct fb_ops *fbops; -- cgit v1.2.3 From 648bfb62da4e7a970f6b153bb8cdab1703877fcd Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 24 Feb 2026 09:25:56 +0100 Subject: fbdev: defio: Move variable state into struct fb_deferred_io_state Move variable fields from struct fb_deferred_io into struct fb_deferred_io_state. These fields are internal to the defio code and should not be exposed to drivers. At some later point, struct fb_defered_io might become const in all defio code. Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/fbdev/core/fb_defio.c | 37 ++++++++++++++++++++++--------------- include/linux/fb.h | 3 --- 2 files changed, 22 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index 56030eb42f71..35ac13727da1 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -25,6 +25,8 @@ #include #include +struct address_space; + /* * struct fb_deferred_io_state */ @@ -32,9 +34,13 @@ struct fb_deferred_io_state { struct kref ref; + int open_count; /* number of opened files; protected by fb_info lock */ + struct address_space *mapping; /* page cache object for fb device */ + struct mutex lock; /* mutex that protects the pageref list */ /* fields protected by lock */ struct fb_info *info; + struct list_head pagereflist; /* list of pagerefs for touched pages */ }; static struct fb_deferred_io_state *fb_deferred_io_state_alloc(void) @@ -48,11 +54,14 @@ static struct fb_deferred_io_state *fb_deferred_io_state_alloc(void) kref_init(&fbdefio_state->ref); mutex_init(&fbdefio_state->lock); + INIT_LIST_HEAD(&fbdefio_state->pagereflist); + return fbdefio_state; } static void fb_deferred_io_state_release(struct fb_deferred_io_state *fbdefio_state) { + WARN_ON(!list_empty(&fbdefio_state->pagereflist)); mutex_destroy(&fbdefio_state->lock); kfree(fbdefio_state); @@ -147,7 +156,8 @@ static struct fb_deferred_io_pageref *fb_deferred_io_pageref_get(struct fb_info struct page *page) { struct fb_deferred_io *fbdefio = info->fbdefio; - struct list_head *pos = &fbdefio->pagereflist; + struct fb_deferred_io_state *fbdefio_state = info->fbdefio_state; + struct list_head *pos = &fbdefio_state->pagereflist; struct fb_deferred_io_pageref *pageref, *cur; pageref = fb_deferred_io_pageref_lookup(info, offset, page); @@ -171,7 +181,7 @@ static struct fb_deferred_io_pageref *fb_deferred_io_pageref_get(struct fb_info * pages. If possible, drivers should try to work with * unsorted page lists instead. */ - list_for_each_entry(cur, &fbdefio->pagereflist, list) { + list_for_each_entry(cur, &fbdefio_state->pagereflist, list) { if (cur->offset > pageref->offset) break; } @@ -222,7 +232,7 @@ static vm_fault_t fb_deferred_io_fault(struct vm_fault *vmf) if (!vmf->vma->vm_file) fb_err(info, "no mapping available\n"); - BUG_ON(!info->fbdefio->mapping); + fb_WARN_ON_ONCE(info, !fbdefio_state->mapping); mutex_unlock(&fbdefio_state->lock); @@ -364,20 +374,20 @@ static void fb_deferred_io_work(struct work_struct *work) /* here we wrprotect the page's mappings, then do all deferred IO. */ mutex_lock(&fbdefio_state->lock); #ifdef CONFIG_MMU - list_for_each_entry(pageref, &fbdefio->pagereflist, list) { + list_for_each_entry(pageref, &fbdefio_state->pagereflist, list) { struct page *page = pageref->page; pgoff_t pgoff = pageref->offset >> PAGE_SHIFT; - mapping_wrprotect_range(fbdefio->mapping, pgoff, + mapping_wrprotect_range(fbdefio_state->mapping, pgoff, page_to_pfn(page), 1); } #endif /* driver's callback with pagereflist */ - fbdefio->deferred_io(info, &fbdefio->pagereflist); + fbdefio->deferred_io(info, &fbdefio_state->pagereflist); /* clear the list */ - list_for_each_entry_safe(pageref, next, &fbdefio->pagereflist, list) + list_for_each_entry_safe(pageref, next, &fbdefio_state->pagereflist, list) fb_deferred_io_pageref_put(pageref, info); mutex_unlock(&fbdefio_state->lock); @@ -402,7 +412,6 @@ int fb_deferred_io_init(struct fb_info *info) fbdefio_state->info = info; INIT_DELAYED_WORK(&info->deferred_work, fb_deferred_io_work); - INIT_LIST_HEAD(&fbdefio->pagereflist); if (fbdefio->delay == 0) /* set a default of 1 s */ fbdefio->delay = HZ; @@ -431,11 +440,11 @@ void fb_deferred_io_open(struct fb_info *info, struct inode *inode, struct file *file) { - struct fb_deferred_io *fbdefio = info->fbdefio; + struct fb_deferred_io_state *fbdefio_state = info->fbdefio_state; - fbdefio->mapping = file->f_mapping; + fbdefio_state->mapping = file->f_mapping; file->f_mapping->a_ops = &fb_deferred_io_aops; - fbdefio->open_count++; + fbdefio_state->open_count++; } EXPORT_SYMBOL_GPL(fb_deferred_io_open); @@ -446,16 +455,15 @@ static void fb_deferred_io_lastclose(struct fb_info *info) void fb_deferred_io_release(struct fb_info *info) { - struct fb_deferred_io *fbdefio = info->fbdefio; + struct fb_deferred_io_state *fbdefio_state = info->fbdefio_state; - if (!--fbdefio->open_count) + if (!--fbdefio_state->open_count) fb_deferred_io_lastclose(info); } EXPORT_SYMBOL_GPL(fb_deferred_io_release); void fb_deferred_io_cleanup(struct fb_info *info) { - struct fb_deferred_io *fbdefio = info->fbdefio; struct fb_deferred_io_state *fbdefio_state = info->fbdefio_state; fb_deferred_io_lastclose(info); @@ -469,6 +477,5 @@ void fb_deferred_io_cleanup(struct fb_info *info) fb_deferred_io_state_put(fbdefio_state); kvfree(info->pagerefs); - fbdefio->mapping = NULL; } EXPORT_SYMBOL_GPL(fb_deferred_io_cleanup); diff --git a/include/linux/fb.h b/include/linux/fb.h index aed17567fe50..2791777f3a50 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -217,9 +217,6 @@ struct fb_deferred_io { /* delay between mkwrite and deferred handler */ unsigned long delay; bool sort_pagereflist; /* sort pagelist by offset */ - int open_count; /* number of opened files; protected by fb_info lock */ - struct list_head pagereflist; /* list of pagerefs for touched pages */ - struct address_space *mapping; /* page cache object for fb device */ /* callback */ struct page *(*get_page)(struct fb_info *info, unsigned long offset); void (*deferred_io)(struct fb_info *info, struct list_head *pagelist); -- cgit v1.2.3 From 02fe86e5fc22faee9b29e761d9fdd17fdfe583e6 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 24 Feb 2026 09:25:57 +0100 Subject: fbdev: defio: Move pageref array to struct fb_deferred_io_state The pageref array stores all pageref structures for a device's defio helpers. Move it into struct fb_deferred_io_state to not expose it to drivers. Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/fbdev/core/fb_defio.c | 55 ++++++++++++++++++------------------- include/linux/fb.h | 2 -- 2 files changed, 27 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index 35ac13727da1..a12dd25ab697 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -41,28 +41,46 @@ struct fb_deferred_io_state { /* fields protected by lock */ struct fb_info *info; struct list_head pagereflist; /* list of pagerefs for touched pages */ + unsigned long npagerefs; + struct fb_deferred_io_pageref *pagerefs; }; -static struct fb_deferred_io_state *fb_deferred_io_state_alloc(void) +static struct fb_deferred_io_state *fb_deferred_io_state_alloc(unsigned long len) { struct fb_deferred_io_state *fbdefio_state; + struct fb_deferred_io_pageref *pagerefs; + unsigned long npagerefs; fbdefio_state = kzalloc_obj(*fbdefio_state); if (!fbdefio_state) return NULL; + npagerefs = DIV_ROUND_UP(len, PAGE_SIZE); + + /* alloc a page ref for each page of the display memory */ + pagerefs = kvzalloc_objs(*pagerefs, npagerefs); + if (!pagerefs) + goto err_kfree; + fbdefio_state->npagerefs = npagerefs; + fbdefio_state->pagerefs = pagerefs; + kref_init(&fbdefio_state->ref); mutex_init(&fbdefio_state->lock); INIT_LIST_HEAD(&fbdefio_state->pagereflist); return fbdefio_state; + +err_kfree: + kfree(fbdefio_state); + return NULL; } static void fb_deferred_io_state_release(struct fb_deferred_io_state *fbdefio_state) { WARN_ON(!list_empty(&fbdefio_state->pagereflist)); mutex_destroy(&fbdefio_state->lock); + kvfree(fbdefio_state->pagerefs); kfree(fbdefio_state); } @@ -125,18 +143,19 @@ static struct page *fb_deferred_io_get_page(struct fb_info *info, unsigned long return page; } -static struct fb_deferred_io_pageref *fb_deferred_io_pageref_lookup(struct fb_info *info, - unsigned long offset, - struct page *page) +static struct fb_deferred_io_pageref * +fb_deferred_io_pageref_lookup(struct fb_deferred_io_state *fbdefio_state, unsigned long offset, + struct page *page) { + struct fb_info *info = fbdefio_state->info; unsigned long pgoff = offset >> PAGE_SHIFT; struct fb_deferred_io_pageref *pageref; - if (fb_WARN_ON_ONCE(info, pgoff >= info->npagerefs)) + if (fb_WARN_ON_ONCE(info, pgoff >= fbdefio_state->npagerefs)) return NULL; /* incorrect allocation size */ /* 1:1 mapping between pageref and page offset */ - pageref = &info->pagerefs[pgoff]; + pageref = &fbdefio_state->pagerefs[pgoff]; if (pageref->page) goto out; @@ -160,7 +179,7 @@ static struct fb_deferred_io_pageref *fb_deferred_io_pageref_get(struct fb_info struct list_head *pos = &fbdefio_state->pagereflist; struct fb_deferred_io_pageref *pageref, *cur; - pageref = fb_deferred_io_pageref_lookup(info, offset, page); + pageref = fb_deferred_io_pageref_lookup(fbdefio_state, offset, page); if (!pageref) return NULL; @@ -397,16 +416,13 @@ int fb_deferred_io_init(struct fb_info *info) { struct fb_deferred_io *fbdefio = info->fbdefio; struct fb_deferred_io_state *fbdefio_state; - struct fb_deferred_io_pageref *pagerefs; - unsigned long npagerefs; - int ret; BUG_ON(!fbdefio); if (WARN_ON(!info->fix.smem_len)) return -EINVAL; - fbdefio_state = fb_deferred_io_state_alloc(); + fbdefio_state = fb_deferred_io_state_alloc(info->fix.smem_len); if (!fbdefio_state) return -ENOMEM; fbdefio_state->info = info; @@ -415,24 +431,9 @@ int fb_deferred_io_init(struct fb_info *info) if (fbdefio->delay == 0) /* set a default of 1 s */ fbdefio->delay = HZ; - npagerefs = DIV_ROUND_UP(info->fix.smem_len, PAGE_SIZE); - - /* alloc a page ref for each page of the display memory */ - pagerefs = kvzalloc_objs(*pagerefs, npagerefs); - if (!pagerefs) { - ret = -ENOMEM; - goto err; - } - info->npagerefs = npagerefs; - info->pagerefs = pagerefs; - info->fbdefio_state = fbdefio_state; return 0; - -err: - fb_deferred_io_state_release(fbdefio_state); - return ret; } EXPORT_SYMBOL_GPL(fb_deferred_io_init); @@ -475,7 +476,5 @@ void fb_deferred_io_cleanup(struct fb_info *info) mutex_unlock(&fbdefio_state->lock); fb_deferred_io_state_put(fbdefio_state); - - kvfree(info->pagerefs); } EXPORT_SYMBOL_GPL(fb_deferred_io_cleanup); diff --git a/include/linux/fb.h b/include/linux/fb.h index 2791777f3a50..b27943719fab 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -482,8 +482,6 @@ struct fb_info { #ifdef CONFIG_FB_DEFERRED_IO struct delayed_work deferred_work; - unsigned long npagerefs; - struct fb_deferred_io_pageref *pagerefs; struct fb_deferred_io *fbdefio; struct fb_deferred_io_state *fbdefio_state; #endif -- cgit v1.2.3 From 993bcaf32c494014f56357f6cdd87fdfaa4e4d11 Mon Sep 17 00:00:00 2001 From: Josua Mayer Date: Thu, 26 Feb 2026 15:21:11 +0200 Subject: mux: Add helper functions for getting optional and selected mux-state In-tree phy-can-transceiver and phy_rcar_gen3_usb2 have already implemented local versions of devm_mux_state_get_optional. The omap-i2c driver gets and selects an optional mux in its probe function without using any helper. Add new helper functions covering both aforementioned use-cases: - mux_control_get_optional: Get a mux-control if specified in dt, return NULL otherwise. - devm_mux_state_get_optional: Get a mux-state if specified in dt, return NULL otherwise. - devm_mux_state_get_selected: Get and select a mux-state specified in dt, return error otherwise. - devm_mux_state_get_optional_selected: Get and select a mux-state if specified in dt, return error or NULL. Existing mux_get helper function is changed to take an extra argument indicating whether the mux is optional. In this case no error is printed, and NULL returned in case of ENOENT. Calling code is adapted to handle NULL return case, and to pass optional argument as required. To support automatic deselect for _selected helper, a new structure is created storing an exit pointer similar to clock core which is called on release. To facilitate code sharing between optional/mandatory/selected helpers, a new internal helper function is added to handle quiet (optional) and verbose (mandatory) errors, as well as storing the correct callback for devm release: __devm_mux_state_get Due to this structure devm_mux_state_get_*_selected can no longer print a useful error message when select fails. Instead callers should print errors where needed. Commit e153fdea9db04 ("phy: can-transceiver: Re-instate "mux-states" property presence check") noted that "mux_get() always prints an error message in case of an error, including when the property is not present, confusing the user." The first error message covers the case that a mux name is not matched in dt. The second error message is based on of_parse_phandle_with_args return value. In optional case no error is printed and NULL is returned. This ensures that the new helper functions will not confuse the user either. With the addition of optional helper functions it became clear that drivers should compile and link even if CONFIG_MULTIPLEXER was not enabled. Add stubs for all symbols exported by mux core. Acked-by: Wolfram Sang Signed-off-by: Josua Mayer Signed-off-by: Ulf Hansson --- drivers/mux/core.c | 194 +++++++++++++++++++++++++++++++++++++------ include/linux/mux/consumer.h | 108 +++++++++++++++++++++++- 2 files changed, 271 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/mux/core.c b/drivers/mux/core.c index f09ee8782e3d..23538de2c91b 100644 --- a/drivers/mux/core.c +++ b/drivers/mux/core.c @@ -46,6 +46,16 @@ static const struct class mux_class = { .name = "mux", }; +/** + * struct devm_mux_state_state - Tracks managed resources for mux-state objects. + * @mstate: Pointer to a mux state. + * @exit: An optional callback to execute before free. + */ +struct devm_mux_state_state { + struct mux_state *mstate; + int (*exit)(struct mux_state *mstate); +}; + static DEFINE_IDA(mux_ida); static int __init mux_init(void) @@ -516,17 +526,19 @@ static struct mux_chip *of_find_mux_chip_by_node(struct device_node *np) return dev ? to_mux_chip(dev) : NULL; } -/* +/** * mux_get() - Get the mux-control for a device. * @dev: The device that needs a mux-control. * @mux_name: The name identifying the mux-control. * @state: Pointer to where the requested state is returned, or NULL when * the required multiplexer states are handled by other means. + * @optional: Whether to return NULL and silence errors when mux doesn't exist. * - * Return: A pointer to the mux-control, or an ERR_PTR with a negative errno. + * Return: Pointer to the mux-control on success, an ERR_PTR with a negative + * errno on error, or NULL if optional is true and mux doesn't exist. */ static struct mux_control *mux_get(struct device *dev, const char *mux_name, - unsigned int *state) + unsigned int *state, bool optional) { struct device_node *np = dev->of_node; struct of_phandle_args args; @@ -542,7 +554,9 @@ static struct mux_control *mux_get(struct device *dev, const char *mux_name, else index = of_property_match_string(np, "mux-control-names", mux_name); - if (index < 0) { + if (index < 0 && optional) { + return NULL; + } else if (index < 0) { dev_err(dev, "mux controller '%s' not found\n", mux_name); return ERR_PTR(index); @@ -558,8 +572,12 @@ static struct mux_control *mux_get(struct device *dev, const char *mux_name, "mux-controls", "#mux-control-cells", index, &args); if (ret) { + if (optional && ret == -ENOENT) + return NULL; + dev_err(dev, "%pOF: failed to get mux-%s %s(%i)\n", - np, state ? "state" : "control", mux_name ?: "", index); + np, state ? "state" : "control", + mux_name ?: "", index); return ERR_PTR(ret); } @@ -617,10 +635,29 @@ static struct mux_control *mux_get(struct device *dev, const char *mux_name, */ struct mux_control *mux_control_get(struct device *dev, const char *mux_name) { - return mux_get(dev, mux_name, NULL); + struct mux_control *mux = mux_get(dev, mux_name, NULL, false); + + if (!mux) + return ERR_PTR(-ENOENT); + + return mux; } EXPORT_SYMBOL_GPL(mux_control_get); +/** + * mux_control_get_optional() - Get the optional mux-control for a device. + * @dev: The device that needs a mux-control. + * @mux_name: The name identifying the mux-control. + * + * Return: Pointer to the mux-control on success, an ERR_PTR with a negative + * errno on error, or NULL if mux doesn't exist. + */ +struct mux_control *mux_control_get_optional(struct device *dev, const char *mux_name) +{ + return mux_get(dev, mux_name, NULL, true); +} +EXPORT_SYMBOL_GPL(mux_control_get_optional); + /** * mux_control_put() - Put away the mux-control for good. * @mux: The mux-control to put away. @@ -670,14 +707,16 @@ struct mux_control *devm_mux_control_get(struct device *dev, } EXPORT_SYMBOL_GPL(devm_mux_control_get); -/* +/** * mux_state_get() - Get the mux-state for a device. * @dev: The device that needs a mux-state. * @mux_name: The name identifying the mux-state. + * @optional: Whether to return NULL and silence errors when mux doesn't exist. * - * Return: A pointer to the mux-state, or an ERR_PTR with a negative errno. + * Return: Pointer to the mux-state on success, an ERR_PTR with a negative + * errno on error, or NULL if optional is true and mux doesn't exist. */ -static struct mux_state *mux_state_get(struct device *dev, const char *mux_name) +static struct mux_state *mux_state_get(struct device *dev, const char *mux_name, bool optional) { struct mux_state *mstate; @@ -685,12 +724,15 @@ static struct mux_state *mux_state_get(struct device *dev, const char *mux_name) if (!mstate) return ERR_PTR(-ENOMEM); - mstate->mux = mux_get(dev, mux_name, &mstate->state); + mstate->mux = mux_get(dev, mux_name, &mstate->state, optional); if (IS_ERR(mstate->mux)) { int err = PTR_ERR(mstate->mux); kfree(mstate); return ERR_PTR(err); + } else if (!mstate->mux) { + kfree(mstate); + return optional ? NULL : ERR_PTR(-ENOENT); } return mstate; @@ -710,9 +752,66 @@ static void mux_state_put(struct mux_state *mstate) static void devm_mux_state_release(struct device *dev, void *res) { - struct mux_state *mstate = *(struct mux_state **)res; + struct devm_mux_state_state *devm_state = res; + + if (devm_state->exit) + devm_state->exit(devm_state->mstate); + + mux_state_put(devm_state->mstate); +} + +/** + * __devm_mux_state_get() - Get the optional mux-state for a device, + * with resource management. + * @dev: The device that needs a mux-state. + * @mux_name: The name identifying the mux-state. + * @optional: Whether to return NULL and silence errors when mux doesn't exist. + * @init: Optional function pointer for mux-state object initialisation. + * @exit: Optional function pointer for mux-state object cleanup on release. + * + * Return: Pointer to the mux-state on success, an ERR_PTR with a negative + * errno on error, or NULL if optional is true and mux doesn't exist. + */ +static struct mux_state *__devm_mux_state_get(struct device *dev, const char *mux_name, + bool optional, + int (*init)(struct mux_state *mstate), + int (*exit)(struct mux_state *mstate)) +{ + struct devm_mux_state_state *devm_state; + struct mux_state *mstate; + int ret; + + mstate = mux_state_get(dev, mux_name, optional); + if (IS_ERR(mstate)) + return ERR_CAST(mstate); + else if (optional && !mstate) + return NULL; + else if (!mstate) + return ERR_PTR(-ENOENT); + + devm_state = devres_alloc(devm_mux_state_release, sizeof(*devm_state), GFP_KERNEL); + if (!devm_state) { + ret = -ENOMEM; + goto err_devres_alloc; + } + + if (init) { + ret = init(mstate); + if (ret) + goto err_mux_state_init; + } + + devm_state->mstate = mstate; + devm_state->exit = exit; + devres_add(dev, devm_state); + return mstate; + +err_mux_state_init: + devres_free(devm_state); +err_devres_alloc: mux_state_put(mstate); + return ERR_PTR(ret); } /** @@ -722,28 +821,69 @@ static void devm_mux_state_release(struct device *dev, void *res) * @mux_name: The name identifying the mux-control. * * Return: Pointer to the mux-state, or an ERR_PTR with a negative errno. + * + * The mux-state will automatically be freed on release. */ -struct mux_state *devm_mux_state_get(struct device *dev, - const char *mux_name) +struct mux_state *devm_mux_state_get(struct device *dev, const char *mux_name) { - struct mux_state **ptr, *mstate; - - ptr = devres_alloc(devm_mux_state_release, sizeof(*ptr), GFP_KERNEL); - if (!ptr) - return ERR_PTR(-ENOMEM); + return __devm_mux_state_get(dev, mux_name, false, NULL, NULL); +} +EXPORT_SYMBOL_GPL(devm_mux_state_get); - mstate = mux_state_get(dev, mux_name); - if (IS_ERR(mstate)) { - devres_free(ptr); - return mstate; - } +/** + * devm_mux_state_get_optional() - Get the optional mux-state for a device, + * with resource management. + * @dev: The device that needs a mux-state. + * @mux_name: The name identifying the mux-state. + * + * Return: Pointer to the mux-state on success, an ERR_PTR with a negative + * errno on error, or NULL if mux doesn't exist. + * + * The mux-state will automatically be freed on release. + */ +struct mux_state *devm_mux_state_get_optional(struct device *dev, const char *mux_name) +{ + return __devm_mux_state_get(dev, mux_name, true, NULL, NULL); +} +EXPORT_SYMBOL_GPL(devm_mux_state_get_optional); - *ptr = mstate; - devres_add(dev, ptr); +/** + * devm_mux_state_get_selected() - Get the mux-state for a device, with + * resource management. + * @dev: The device that needs a mux-state. + * @mux_name: The name identifying the mux-state. + * + * Return: Pointer to the mux-state, or an ERR_PTR with a negative errno. + * + * The returned mux-state (if valid) is already selected. + * + * The mux-state will automatically be deselected and freed on release. + */ +struct mux_state *devm_mux_state_get_selected(struct device *dev, const char *mux_name) +{ + return __devm_mux_state_get(dev, mux_name, false, mux_state_select, mux_state_deselect); +} +EXPORT_SYMBOL_GPL(devm_mux_state_get_selected); - return mstate; +/** + * devm_mux_state_get_optional_selected() - Get the optional mux-state for + * a device, with resource management. + * @dev: The device that needs a mux-state. + * @mux_name: The name identifying the mux-state. + * + * Return: Pointer to the mux-state on success, an ERR_PTR with a negative + * errno on error, or NULL if mux doesn't exist. + * + * The returned mux-state (if valid) is already selected. + * + * The mux-state will automatically be deselected and freed on release. + */ +struct mux_state *devm_mux_state_get_optional_selected(struct device *dev, + const char *mux_name) +{ + return __devm_mux_state_get(dev, mux_name, true, mux_state_select, mux_state_deselect); } -EXPORT_SYMBOL_GPL(devm_mux_state_get); +EXPORT_SYMBOL_GPL(devm_mux_state_get_optional_selected); /* * Using subsys_initcall instead of module_init here to try to ensure - for diff --git a/include/linux/mux/consumer.h b/include/linux/mux/consumer.h index 2e25c838f831..a961861a503b 100644 --- a/include/linux/mux/consumer.h +++ b/include/linux/mux/consumer.h @@ -16,6 +16,8 @@ struct device; struct mux_control; struct mux_state; +#if IS_ENABLED(CONFIG_MULTIPLEXER) + unsigned int mux_control_states(struct mux_control *mux); int __must_check mux_control_select_delay(struct mux_control *mux, unsigned int state, @@ -54,11 +56,109 @@ int mux_control_deselect(struct mux_control *mux); int mux_state_deselect(struct mux_state *mstate); struct mux_control *mux_control_get(struct device *dev, const char *mux_name); +struct mux_control *mux_control_get_optional(struct device *dev, const char *mux_name); void mux_control_put(struct mux_control *mux); -struct mux_control *devm_mux_control_get(struct device *dev, - const char *mux_name); -struct mux_state *devm_mux_state_get(struct device *dev, - const char *mux_name); +struct mux_control *devm_mux_control_get(struct device *dev, const char *mux_name); +struct mux_state *devm_mux_state_get(struct device *dev, const char *mux_name); +struct mux_state *devm_mux_state_get_optional(struct device *dev, const char *mux_name); +struct mux_state *devm_mux_state_get_selected(struct device *dev, const char *mux_name); +struct mux_state *devm_mux_state_get_optional_selected(struct device *dev, const char *mux_name); + +#else + +static inline unsigned int mux_control_states(struct mux_control *mux) +{ + return 0; +} +static inline int __must_check mux_control_select_delay(struct mux_control *mux, + unsigned int state, unsigned int delay_us) +{ + return -EOPNOTSUPP; +} +static inline int __must_check mux_state_select_delay(struct mux_state *mstate, + unsigned int delay_us) +{ + return -EOPNOTSUPP; +} +static inline int __must_check mux_control_try_select_delay(struct mux_control *mux, + unsigned int state, + unsigned int delay_us) +{ + return -EOPNOTSUPP; +} +static inline int __must_check mux_state_try_select_delay(struct mux_state *mstate, + unsigned int delay_us) +{ + return -EOPNOTSUPP; +} + +static inline int __must_check mux_control_select(struct mux_control *mux, + unsigned int state) +{ + return -EOPNOTSUPP; +} + +static inline int __must_check mux_state_select(struct mux_state *mstate) +{ + return -EOPNOTSUPP; +} + +static inline int __must_check mux_control_try_select(struct mux_control *mux, + unsigned int state) +{ + return -EOPNOTSUPP; +} + +static inline int __must_check mux_state_try_select(struct mux_state *mstate) +{ + return -EOPNOTSUPP; +} + +static inline int mux_control_deselect(struct mux_control *mux) +{ + return -EOPNOTSUPP; +} +static inline int mux_state_deselect(struct mux_state *mstate) +{ + return -EOPNOTSUPP; +} + +static inline struct mux_control *mux_control_get(struct device *dev, const char *mux_name) +{ + return ERR_PTR(-EOPNOTSUPP); +} +static inline struct mux_control *mux_control_get_optional(struct device *dev, + const char *mux_name) +{ + return NULL; +} +static inline void mux_control_put(struct mux_control *mux) {} + +static inline struct mux_control *devm_mux_control_get(struct device *dev, const char *mux_name) +{ + return ERR_PTR(-EOPNOTSUPP); +} +static inline struct mux_state *devm_mux_state_get(struct device *dev, const char *mux_name) +{ + return ERR_PTR(-EOPNOTSUPP); +} +static inline struct mux_state *devm_mux_state_get_optional(struct device *dev, + const char *mux_name) +{ + return NULL; +} +static inline struct mux_state *devm_mux_state_get_selected(struct device *dev, + const char *mux_name) +{ + return ERR_PTR(-EOPNOTSUPP); +} +static inline struct mux_state *devm_mux_state_get_optional_selected(struct device *dev, + const char *mux_name) +{ + return NULL; +} + +#endif /* CONFIG_MULTIPLEXER */ #endif /* _LINUX_MUX_CONSUMER_H */ -- cgit v1.2.3 From 7ea25eaad5ae3a6c837a3df9bdb822194f002565 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:01 -0800 Subject: block: factor out a bio_integrity_action helper Split the logic to see if a bio needs integrity metadata from bio_integrity_prep into a reusable helper than can be called from file system code. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- block/bio-integrity-auto.c | 64 ++++++------------------------------------- block/bio-integrity.c | 48 ++++++++++++++++++++++++++++++++ block/blk-mq.c | 6 ++-- drivers/nvdimm/btt.c | 6 ++-- include/linux/bio-integrity.h | 5 ++-- include/linux/blk-integrity.h | 23 ++++++++++++++++ 6 files changed, 89 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c index 44dcdf7520c5..e16f669dbf1e 100644 --- a/block/bio-integrity-auto.c +++ b/block/bio-integrity-auto.c @@ -50,11 +50,6 @@ static bool bip_should_check(struct bio_integrity_payload *bip) return bip->bip_flags & BIP_CHECK_FLAGS; } -static bool bi_offload_capable(struct blk_integrity *bi) -{ - return bi->metadata_size == bi->pi_tuple_size; -} - /** * __bio_integrity_endio - Integrity I/O completion function * @bio: Protected bio @@ -84,69 +79,27 @@ bool __bio_integrity_endio(struct bio *bio) /** * bio_integrity_prep - Prepare bio for integrity I/O * @bio: bio to prepare + * @action: preparation action needed (BI_ACT_*) + * + * Allocate the integrity payload. For writes, generate the integrity metadata + * and for reads, setup the completion handler to verify the metadata. * - * Checks if the bio already has an integrity payload attached. If it does, the - * payload has been generated by another kernel subsystem, and we just pass it - * through. - * Otherwise allocates integrity payload and for writes the integrity metadata - * will be generated. For reads, the completion handler will verify the - * metadata. + * This is used for bios that do not have user integrity payloads attached. */ -bool bio_integrity_prep(struct bio *bio) +void bio_integrity_prep(struct bio *bio, unsigned int action) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_data *bid; - bool set_flags = true; - gfp_t gfp = GFP_NOIO; - - if (!bi) - return true; - - if (!bio_sectors(bio)) - return true; - - /* Already protected? */ - if (bio_integrity(bio)) - return true; - - switch (bio_op(bio)) { - case REQ_OP_READ: - if (bi->flags & BLK_INTEGRITY_NOVERIFY) { - if (bi_offload_capable(bi)) - return true; - set_flags = false; - } - break; - case REQ_OP_WRITE: - /* - * Zero the memory allocated to not leak uninitialized kernel - * memory to disk for non-integrity metadata where nothing else - * initializes the memory. - */ - if (bi->flags & BLK_INTEGRITY_NOGENERATE) { - if (bi_offload_capable(bi)) - return true; - set_flags = false; - gfp |= __GFP_ZERO; - } else if (bi->metadata_size > bi->pi_tuple_size) - gfp |= __GFP_ZERO; - break; - default: - return true; - } - - if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) - return true; bid = mempool_alloc(&bid_pool, GFP_NOIO); bio_integrity_init(bio, &bid->bip, &bid->bvec, 1); bid->bio = bio; bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY; - bio_integrity_alloc_buf(bio, gfp & __GFP_ZERO); + bio_integrity_alloc_buf(bio, action & BI_ACT_ZERO); bip_set_seed(&bid->bip, bio->bi_iter.bi_sector); - if (set_flags) { + if (action & BI_ACT_CHECK) { if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) bid->bip.bip_flags |= BIP_IP_CHECKSUM; if (bi->csum_type) @@ -160,7 +113,6 @@ bool bio_integrity_prep(struct bio *bio) blk_integrity_generate(bio); else bid->saved_bio_iter = bio->bi_iter; - return true; } EXPORT_SYMBOL(bio_integrity_prep); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 20f5d301d32d..0955be90038b 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -7,6 +7,7 @@ */ #include +#include #include "blk.h" struct bio_integrity_alloc { @@ -16,6 +17,53 @@ struct bio_integrity_alloc { static mempool_t integrity_buf_pool; +static bool bi_offload_capable(struct blk_integrity *bi) +{ + return bi->metadata_size == bi->pi_tuple_size; +} + +unsigned int __bio_integrity_action(struct bio *bio) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + + if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) + return 0; + + switch (bio_op(bio)) { + case REQ_OP_READ: + if (bi->flags & BLK_INTEGRITY_NOVERIFY) { + if (bi_offload_capable(bi)) + return 0; + return BI_ACT_BUFFER; + } + return BI_ACT_BUFFER | BI_ACT_CHECK; + case REQ_OP_WRITE: + /* + * Flush masquerading as write? + */ + if (!bio_sectors(bio)) + return 0; + + /* + * Zero the memory allocated to not leak uninitialized kernel + * memory to disk for non-integrity metadata where nothing else + * initializes the memory. + */ + if (bi->flags & BLK_INTEGRITY_NOGENERATE) { + if (bi_offload_capable(bi)) + return 0; + return BI_ACT_BUFFER | BI_ACT_ZERO; + } + + if (bi->metadata_size > bi->pi_tuple_size) + return BI_ACT_BUFFER | BI_ACT_CHECK | BI_ACT_ZERO; + return BI_ACT_BUFFER | BI_ACT_CHECK; + default: + return 0; + } +} +EXPORT_SYMBOL_GPL(__bio_integrity_action); + void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); diff --git a/block/blk-mq.c b/block/blk-mq.c index 9af8c3dec3f6..0b311a797178 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3143,6 +3143,7 @@ void blk_mq_submit_bio(struct bio *bio) struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blk_plug *plug = current->plug; const int is_sync = op_is_sync(bio->bi_opf); + unsigned int integrity_action; struct blk_mq_hw_ctx *hctx; unsigned int nr_segs; struct request *rq; @@ -3195,8 +3196,9 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio) goto queue_exit; - if (!bio_integrity_prep(bio)) - goto queue_exit; + integrity_action = bio_integrity_action(bio); + if (integrity_action) + bio_integrity_prep(bio, integrity_action); blk_mq_bio_issue_init(q, bio); if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index b6bef092f8b8..fdcb080a4314 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1435,14 +1435,16 @@ static void btt_submit_bio(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct btt *btt = bio->bi_bdev->bd_disk->private_data; + unsigned int integrity_action; struct bvec_iter iter; unsigned long start; struct bio_vec bvec; int err = 0; bool do_acct; - if (!bio_integrity_prep(bio)) - return; + integrity_action = bio_integrity_action(bio); + if (integrity_action) + bio_integrity_prep(bio, integrity_action); do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue); if (do_acct) diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 21e4652dcfd2..276cbbdd2c9d 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -78,7 +78,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta); void bio_integrity_unmap_user(struct bio *bio); -bool bio_integrity_prep(struct bio *bio); +void bio_integrity_prep(struct bio *bio, unsigned int action); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); void bio_integrity_trim(struct bio *bio); int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask); @@ -104,9 +104,8 @@ static inline void bio_integrity_unmap_user(struct bio *bio) { } -static inline bool bio_integrity_prep(struct bio *bio) +static inline void bio_integrity_prep(struct bio *bio, unsigned int action) { - return true; } static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src, diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index c15b1ac62765..fd3f3c8c0fcd 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -180,4 +180,27 @@ static inline struct bio_vec rq_integrity_vec(struct request *rq) } #endif /* CONFIG_BLK_DEV_INTEGRITY */ +enum bio_integrity_action { + BI_ACT_BUFFER = (1u << 0), /* allocate buffer */ + BI_ACT_CHECK = (1u << 1), /* generate / verify PI */ + BI_ACT_ZERO = (1u << 2), /* zero buffer */ +}; + +/** + * bio_integrity_action - return the integrity action needed for a bio + * @bio: bio to operate on + * + * Returns the mask of integrity actions (BI_ACT_*) that need to be performed + * for @bio. + */ +unsigned int __bio_integrity_action(struct bio *bio); +static inline unsigned int bio_integrity_action(struct bio *bio) +{ + if (!blk_get_integrity(bio->bi_bdev->bd_disk)) + return 0; + if (bio_integrity(bio)) + return 0; + return __bio_integrity_action(bio); +} + #endif /* _LINUX_BLK_INTEGRITY_H */ -- cgit v1.2.3 From a936655697cd8d1bab2fd5189e2c33dd6356a266 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:02 -0800 Subject: block: factor out a bio_integrity_setup_default helper Add a helper to set the seed and check flag based on useful defaults from the profile. Note that this includes a small behavior change, as we now only set the seed if any action is set, which is fine as nothing will look at it otherwise. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- block/bio-integrity-auto.c | 14 ++------------ block/bio-integrity.c | 16 ++++++++++++++++ include/linux/bio-integrity.h | 1 + 3 files changed, 19 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/block/bio-integrity-auto.c b/block/bio-integrity-auto.c index e16f669dbf1e..b64c71a7fc82 100644 --- a/block/bio-integrity-auto.c +++ b/block/bio-integrity-auto.c @@ -88,7 +88,6 @@ bool __bio_integrity_endio(struct bio *bio) */ void bio_integrity_prep(struct bio *bio, unsigned int action) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_data *bid; bid = mempool_alloc(&bid_pool, GFP_NOIO); @@ -96,17 +95,8 @@ void bio_integrity_prep(struct bio *bio, unsigned int action) bid->bio = bio; bid->bip.bip_flags |= BIP_BLOCK_INTEGRITY; bio_integrity_alloc_buf(bio, action & BI_ACT_ZERO); - - bip_set_seed(&bid->bip, bio->bi_iter.bi_sector); - - if (action & BI_ACT_CHECK) { - if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) - bid->bip.bip_flags |= BIP_IP_CHECKSUM; - if (bi->csum_type) - bid->bip.bip_flags |= BIP_CHECK_GUARD; - if (bi->flags & BLK_INTEGRITY_REF_TAG) - bid->bip.bip_flags |= BIP_CHECK_REFTAG; - } + if (action & BI_ACT_CHECK) + bio_integrity_setup_default(bio); /* Auto-generate integrity metadata if this is a write */ if (bio_data_dir(bio) == WRITE && bip_should_check(&bid->bip)) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 0955be90038b..e79eaf047794 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -101,6 +101,22 @@ void bio_integrity_free_buf(struct bio_integrity_payload *bip) kfree(bvec_virt(bv)); } +void bio_integrity_setup_default(struct bio *bio) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + + bip_set_seed(bip, bio->bi_iter.bi_sector); + + if (bi->csum_type) { + bip->bip_flags |= BIP_CHECK_GUARD; + if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) + bip->bip_flags |= BIP_IP_CHECKSUM; + } + if (bi->flags & BLK_INTEGRITY_REF_TAG) + bip->bip_flags |= BIP_CHECK_REFTAG; +} + /** * bio_integrity_free - Free bio integrity payload * @bio: bio containing bip to be freed diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 276cbbdd2c9d..232b86b9bbcb 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -143,5 +143,6 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page, void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer); void bio_integrity_free_buf(struct bio_integrity_payload *bip); +void bio_integrity_setup_default(struct bio *bio); #endif /* _LINUX_BIO_INTEGRITY_H */ -- cgit v1.2.3 From 7afe93946dff63aa57c6db81f5eb43ac8233364e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:03 -0800 Subject: block: add a bdev_has_integrity_csum helper Factor out a helper to see if the block device has an integrity checksum from bdev_stable_writes so that it can be reused for other checks. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d463b9b5a0a5..dec0acaed6e6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1477,14 +1477,18 @@ static inline bool bdev_synchronous(struct block_device *bdev) return bdev->bd_disk->queue->limits.features & BLK_FEAT_SYNCHRONOUS; } -static inline bool bdev_stable_writes(struct block_device *bdev) +static inline bool bdev_has_integrity_csum(struct block_device *bdev) { - struct request_queue *q = bdev_get_queue(bdev); + struct queue_limits *lim = bdev_limits(bdev); - if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && - q->limits.integrity.csum_type != BLK_INTEGRITY_CSUM_NONE) - return true; - return q->limits.features & BLK_FEAT_STABLE_WRITES; + return IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && + lim->integrity.csum_type != BLK_INTEGRITY_CSUM_NONE; +} + +static inline bool bdev_stable_writes(struct block_device *bdev) +{ + return bdev_has_integrity_csum(bdev) || + (bdev_limits(bdev)->features & BLK_FEAT_STABLE_WRITES); } static inline bool blk_queue_write_cache(struct request_queue *q) -- cgit v1.2.3 From 8c56ef10150ed7650cf4105539242c94c156148c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:05 -0800 Subject: block: make max_integrity_io_size public File systems that generate integrity will need this, so move it out of the block private or blk-mq specific headers. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- block/blk-settings.c | 13 ------------- include/linux/blk-integrity.h | 5 ----- include/linux/blkdev.h | 18 ++++++++++++++++++ 3 files changed, 18 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/block/blk-settings.c b/block/blk-settings.c index a9e65dc090da..dabfab97fbab 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -123,19 +123,6 @@ static int blk_validate_zoned_limits(struct queue_limits *lim) return 0; } -/* - * Maximum size of I/O that needs a block layer integrity buffer. Limited - * by the number of intervals for which we can fit the integrity buffer into - * the buffer size. Because the buffer is a single segment it is also limited - * by the maximum segment size. - */ -static inline unsigned int max_integrity_io_size(struct queue_limits *lim) -{ - return min_t(unsigned int, lim->max_segment_size, - (BLK_INTEGRITY_MAX_SIZE / lim->integrity.metadata_size) << - lim->integrity.interval_exp); -} - static int blk_validate_integrity_limits(struct queue_limits *lim) { struct blk_integrity *bi = &lim->integrity; diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index fd3f3c8c0fcd..ea6d7d322ae3 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -8,11 +8,6 @@ struct request; -/* - * Maximum contiguous integrity buffer allocation. - */ -#define BLK_INTEGRITY_MAX_SIZE SZ_2M - enum blk_integrity_flags { BLK_INTEGRITY_NOVERIFY = 1 << 0, BLK_INTEGRITY_NOGENERATE = 1 << 1, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dec0acaed6e6..11857ae13d10 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1881,6 +1881,24 @@ static inline int bio_split_rw_at(struct bio *bio, return bio_split_io_at(bio, lim, segs, max_bytes, lim->dma_alignment); } +/* + * Maximum contiguous integrity buffer allocation. + */ +#define BLK_INTEGRITY_MAX_SIZE SZ_2M + +/* + * Maximum size of I/O that needs a block layer integrity buffer. Limited + * by the number of intervals for which we can fit the integrity buffer into + * the buffer size. Because the buffer is a single segment it is also limited + * by the maximum segment size. + */ +static inline unsigned int max_integrity_io_size(struct queue_limits *lim) +{ + return min_t(unsigned int, lim->max_segment_size, + (BLK_INTEGRITY_MAX_SIZE / lim->integrity.metadata_size) << + lim->integrity.interval_exp); +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ -- cgit v1.2.3 From 0bde8a12b5540572a7fd6d2867bee6de15e4f289 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:06 -0800 Subject: block: add fs_bio_integrity helpers Add a set of helpers for file system initiated integrity information. These include mempool backed allocations and verifying based on a passed in sector and size which is often available from file system completion routines. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- block/Makefile | 2 +- block/bio-integrity-fs.c | 81 +++++++++++++++++++++++++++++++++++++++++++ include/linux/bio-integrity.h | 6 ++++ 3 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 block/bio-integrity-fs.c (limited to 'include/linux') diff --git a/block/Makefile b/block/Makefile index c65f4da93702..7dce2e44276c 100644 --- a/block/Makefile +++ b/block/Makefile @@ -26,7 +26,7 @@ bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o \ - bio-integrity-auto.o + bio-integrity-auto.o bio-integrity-fs.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o diff --git a/block/bio-integrity-fs.c b/block/bio-integrity-fs.c new file mode 100644 index 000000000000..acb1e5f270d2 --- /dev/null +++ b/block/bio-integrity-fs.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2025 Christoph Hellwig. + */ +#include +#include +#include "blk.h" + +struct fs_bio_integrity_buf { + struct bio_integrity_payload bip; + struct bio_vec bvec; +}; + +static struct kmem_cache *fs_bio_integrity_cache; +static mempool_t fs_bio_integrity_pool; + +unsigned int fs_bio_integrity_alloc(struct bio *bio) +{ + struct fs_bio_integrity_buf *iib; + unsigned int action; + + action = bio_integrity_action(bio); + if (!action) + return 0; + + iib = mempool_alloc(&fs_bio_integrity_pool, GFP_NOIO); + bio_integrity_init(bio, &iib->bip, &iib->bvec, 1); + + bio_integrity_alloc_buf(bio, action & BI_ACT_ZERO); + if (action & BI_ACT_CHECK) + bio_integrity_setup_default(bio); + return action; +} + +void fs_bio_integrity_free(struct bio *bio) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + + bio_integrity_free_buf(bip); + mempool_free(container_of(bip, struct fs_bio_integrity_buf, bip), + &fs_bio_integrity_pool); + + bio->bi_integrity = NULL; + bio->bi_opf &= ~REQ_INTEGRITY; +} + +void fs_bio_integrity_generate(struct bio *bio) +{ + if (fs_bio_integrity_alloc(bio)) + bio_integrity_generate(bio); +} +EXPORT_SYMBOL_GPL(fs_bio_integrity_generate); + +int fs_bio_integrity_verify(struct bio *bio, sector_t sector, unsigned int size) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + + /* + * Reinitialize bip->bip_iter. + * + * This is for use in the submitter after the driver is done with the + * bio. Requires the submitter to remember the sector and the size. + */ + memset(&bip->bip_iter, 0, sizeof(bip->bip_iter)); + bip->bip_iter.bi_sector = sector; + bip->bip_iter.bi_size = bio_integrity_bytes(bi, size >> SECTOR_SHIFT); + return blk_status_to_errno(bio_integrity_verify(bio, &bip->bip_iter)); +} + +static int __init fs_bio_integrity_init(void) +{ + fs_bio_integrity_cache = kmem_cache_create("fs_bio_integrity", + sizeof(struct fs_bio_integrity_buf), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + if (mempool_init_slab_pool(&fs_bio_integrity_pool, BIO_POOL_SIZE, + fs_bio_integrity_cache)) + panic("fs_bio_integrity: can't create pool\n"); + return 0; +} +fs_initcall(fs_bio_integrity_init); diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 232b86b9bbcb..af5178434ec6 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -145,4 +145,10 @@ void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer); void bio_integrity_free_buf(struct bio_integrity_payload *bip); void bio_integrity_setup_default(struct bio *bio); +unsigned int fs_bio_integrity_alloc(struct bio *bio); +void fs_bio_integrity_free(struct bio *bio); +void fs_bio_integrity_generate(struct bio *bio); +int fs_bio_integrity_verify(struct bio *bio, sector_t sector, + unsigned int size); + #endif /* _LINUX_BIO_INTEGRITY_H */ -- cgit v1.2.3 From a9aa6045abde87b94168c3ba034b953417e27272 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:07 -0800 Subject: block: pass a maxlen argument to bio_iov_iter_bounce Allow the file system to limit the size processed in a single bounce operation. This is needed when generating integrity data so that the size of a single integrity segment can't overflow. Signed-off-by: Christoph Hellwig Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Reviewed-by: Martin K. Petersen Reviewed-by: Darrick J. Wong Tested-by: Anuj Gupta Signed-off-by: Jens Axboe --- block/bio.c | 17 ++++++++++------- fs/iomap/direct-io.c | 2 +- include/linux/bio.h | 2 +- 3 files changed, 12 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index d80d5d26804e..784d2a66d3ae 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1327,9 +1327,10 @@ static void bio_free_folios(struct bio *bio) } } -static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter) +static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter, + size_t maxlen) { - size_t total_len = iov_iter_count(iter); + size_t total_len = min(maxlen, iov_iter_count(iter)); if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return -EINVAL; @@ -1367,9 +1368,10 @@ static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter) return 0; } -static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter) +static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter, + size_t maxlen) { - size_t len = min(iov_iter_count(iter), SZ_1M); + size_t len = min3(iov_iter_count(iter), maxlen, SZ_1M); struct folio *folio; folio = folio_alloc_greedy(GFP_KERNEL, &len); @@ -1408,6 +1410,7 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter) * bio_iov_iter_bounce - bounce buffer data from an iter into a bio * @bio: bio to send * @iter: iter to read from / write into + * @maxlen: maximum size to bounce * * Helper for direct I/O implementations that need to bounce buffer because * we need to checksum the data or perform other operations that require @@ -1415,11 +1418,11 @@ static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter) * copies the data into it. Needs to be paired with bio_iov_iter_unbounce() * called on completion. */ -int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter) +int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen) { if (op_is_write(bio_op(bio))) - return bio_iov_iter_bounce_write(bio, iter); - return bio_iov_iter_bounce_read(bio, iter); + return bio_iov_iter_bounce_write(bio, iter, maxlen); + return bio_iov_iter_bounce_read(bio, iter, maxlen); } static void bvec_unpin(struct bio_vec *bv, bool mark_dirty) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 95254aa1b654..21d4fad2eeb8 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -338,7 +338,7 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, bio->bi_end_io = iomap_dio_bio_end_io; if (dio->flags & IOMAP_DIO_BOUNCE) - ret = bio_iov_iter_bounce(bio, dio->submit.iter); + ret = bio_iov_iter_bounce(bio, dio->submit.iter, BIO_MAX_SIZE); else ret = bio_iov_iter_get_pages(bio, dio->submit.iter, alignment - 1); diff --git a/include/linux/bio.h b/include/linux/bio.h index 36a3f2275ecd..9693a0d6fefe 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -474,7 +474,7 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter); +int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter, size_t maxlen); void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty); extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, -- cgit v1.2.3 From b9e0180b2e6a48532eb80e5cd8793157196586cf Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:43 +0100 Subject: fbdev: Declare src parameter of fb_pad_ helpers as constant Fbdev's padding helpers do not modify the source buffer. Declare the parameter as 'const'. Fbcon's font-rendering code calls these helpers with the font data. Declaring src as const will allow for making the font data constant as well. While at it, also remove the extern qualifier from the function declarations in the header file. Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/fbdev/core/fbmem.c | 6 +++--- include/linux/fb.h | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index cf199038f069..30f2b59c47bf 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -91,14 +91,14 @@ EXPORT_SYMBOL(fb_get_color_depth); /* * Data padding functions. */ -void fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 s_pitch, u32 height) +void fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, const u8 *src, u32 s_pitch, u32 height) { __fb_pad_aligned_buffer(dst, d_pitch, src, s_pitch, height); } EXPORT_SYMBOL(fb_pad_aligned_buffer); -void fb_pad_unaligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 idx, u32 height, - u32 shift_high, u32 shift_low, u32 mod) +void fb_pad_unaligned_buffer(u8 *dst, u32 d_pitch, const u8 *src, u32 idx, u32 height, + u32 shift_high, u32 shift_low, u32 mod) { u8 mask = (u8) (0xff << shift_high), tmp; int i, j; diff --git a/include/linux/fb.h b/include/linux/fb.h index b27943719fab..5178a33c752c 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -602,9 +602,9 @@ extern int register_framebuffer(struct fb_info *fb_info); extern void unregister_framebuffer(struct fb_info *fb_info); extern int devm_register_framebuffer(struct device *dev, struct fb_info *fb_info); extern char* fb_get_buffer_offset(struct fb_info *info, struct fb_pixmap *buf, u32 size); -extern void fb_pad_unaligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 idx, - u32 height, u32 shift_high, u32 shift_low, u32 mod); -extern void fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, u8 *src, u32 s_pitch, u32 height); +void fb_pad_unaligned_buffer(u8 *dst, u32 d_pitch, const u8 *src, u32 idx, u32 height, + u32 shift_high, u32 shift_low, u32 mod); +void fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, const u8 *src, u32 s_pitch, u32 height); extern void fb_set_suspend(struct fb_info *info, int state); extern int fb_get_color_depth(struct fb_var_screeninfo *var, struct fb_fix_screeninfo *fix); @@ -630,8 +630,8 @@ static inline struct device *dev_of_fbinfo(const struct fb_info *info) #endif } -static inline void __fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, - u8 *src, u32 s_pitch, u32 height) +static inline void __fb_pad_aligned_buffer(u8 *dst, u32 d_pitch, const u8 *src, u32 s_pitch, + u32 height) { u32 i, j; -- cgit v1.2.3 From 982f8b002aadef2b5169147b3a60a9eb62f908df Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:44 +0100 Subject: vt: Remove trailing whitespaces Fix coding style. No functional changes. Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/console_struct.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index 13b35637bd5a..ebdb9750d348 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -120,7 +120,7 @@ struct vc_data { unsigned short vc_complement_mask; /* [#] Xor mask for mouse pointer */ unsigned short vc_s_complement_mask; /* Saved mouse pointer mask */ unsigned long vc_pos; /* Cursor address */ - /* fonts */ + /* fonts */ unsigned short vc_hi_font_mask; /* [#] Attribute set for upper 256 chars of font or 0 if not supported */ struct console_font vc_font; /* Current VC font set */ unsigned short vc_video_erase_char; /* Background erase character */ -- cgit v1.2.3 From 61912c607fa9955dcf3fc018b227baa98a6776dc Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:45 +0100 Subject: vt: Store font in struct vc_font Replace struct console_font with struct vc_font for the type of the vc_font field of struct vc_data. Struct console_font is UAPI, which prevents further changes. Hence a new data type is required. Struct console_font has a documented vertical pitch of 32 bytes. This is not the case after the font data has been loaded into the kernel. Changing the type of vc_font addresses this inconsistency. The font data is now declared as constant, as it might come from the kernel's read-only section. There's some fallout throughout the console code where non-const variables refer to it. Fix them. A later update will declare the font data to a dedicated data type. v3: - fix typos Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/fbdev/core/bitblit.c | 11 +++++------ drivers/video/fbdev/core/fbcon.c | 4 ++-- drivers/video/fbdev/core/fbcon.h | 4 ++-- include/linux/console_struct.h | 29 +++++++++++++++++++++++++++-- 4 files changed, 36 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/fbdev/core/bitblit.c b/drivers/video/fbdev/core/bitblit.c index 085ffb44c51a..7478accea8ec 100644 --- a/drivers/video/fbdev/core/bitblit.c +++ b/drivers/video/fbdev/core/bitblit.c @@ -22,8 +22,7 @@ /* * Accelerated handlers. */ -static void update_attr(u8 *dst, u8 *src, int attribute, - struct vc_data *vc) +static void update_attr(u8 *dst, const u8 *src, int attribute, struct vc_data *vc) { int i, offset = (vc->vc_font.height < 10) ? 1 : 2; int width = DIV_ROUND_UP(vc->vc_font.width, 8); @@ -81,7 +80,7 @@ static inline void bit_putcs_aligned(struct vc_data *vc, struct fb_info *info, u16 charmask = vc->vc_hi_font_mask ? 0x1ff : 0xff; unsigned int charcnt = vc->vc_font.charcount; u32 idx = vc->vc_font.width >> 3; - u8 *src; + const u8 *src; while (cnt--) { u16 ch = scr_readw(s++) & charmask; @@ -120,7 +119,7 @@ static inline void bit_putcs_unaligned(struct vc_data *vc, u32 shift_low = 0, mod = vc->vc_font.width % 8; u32 shift_high = 8; u32 idx = vc->vc_font.width >> 3; - u8 *src; + const u8 *src; while (cnt--) { u16 ch = scr_readw(s++) & charmask; @@ -267,7 +266,7 @@ static void bit_cursor(struct vc_data *vc, struct fb_info *info, bool enable, int y = real_y(par->p, vc->state.y); int attribute, use_sw = vc->vc_cursor_type & CUR_SW; int err = 1; - char *src; + const u8 *src; cursor.set = 0; @@ -278,7 +277,7 @@ static void bit_cursor(struct vc_data *vc, struct fb_info *info, bool enable, attribute = get_attribute(info, c); src = vc->vc_font.data + ((c & charmask) * (w * vc->vc_font.height)); - if (par->cursor_state.image.data != src || + if (par->cursor_state.image.data != (const char *)src || par->cursor_reset) { par->cursor_state.image.data = src; cursor.set |= FB_CUR_SETIMAGE; diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 666261ae59d8..247bb90c08d3 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2286,7 +2286,7 @@ static bool fbcon_blank(struct vc_data *vc, enum vesa_blank_mode blank, static int fbcon_get_font(struct vc_data *vc, struct console_font *font, unsigned int vpitch) { - u8 *fontdata = vc->vc_font.data; + const u8 *fontdata = vc->vc_font.data; u8 *data = font->data; int i, j; @@ -2417,7 +2417,7 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h, int charcount, struct fbcon_par *par = info->fbcon_par; struct fbcon_display *p = &fb_display[vc->vc_num]; int resize, ret, old_userfont, old_width, old_height, old_charcount; - u8 *old_data = vc->vc_font.data; + const u8 *old_data = vc->vc_font.data; resize = (w != vc->vc_font.width) || (h != vc->vc_font.height); vc->vc_font.data = (void *)(p->fontdata = data); diff --git a/drivers/video/fbdev/core/fbcon.h b/drivers/video/fbdev/core/fbcon.h index fca14e9b729b..3f4386a40237 100644 --- a/drivers/video/fbdev/core/fbcon.h +++ b/drivers/video/fbdev/core/fbcon.h @@ -82,8 +82,8 @@ struct fbcon_par { int rotate; int cur_rotate; char *cursor_data; - u8 *fontbuffer; - u8 *fontdata; + u8 *fontbuffer; + const u8 *fontdata; u8 *cursor_src; u32 cursor_size; u32 fd_size; diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index ebdb9750d348..ea0cdf4278a3 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -13,8 +13,9 @@ #ifndef _LINUX_CONSOLE_STRUCT_H #define _LINUX_CONSOLE_STRUCT_H -#include +#include #include +#include #include struct uni_pagedict; @@ -58,6 +59,30 @@ struct vc_state { bool reverse; }; +/** + * struct vc_font - Describes a font + * @width: The width of a single glyph in bits + * @height: The height of a single glyph in scanlines + * @charcount: The number of glyphs in the font + * @data: The raw font data + * + * Font data is organized as an array of glyphs. Each glyph is a bitmap with + * set bits indicating the foreground color. Unset bits indicate background + * color. The fields @width and @height store a single glyph's number of + * horizontal bits and vertical scanlines. If width is not a multiple of 8, + * there are trailing bits to fill up the byte. These bits should not be drawn. + * + * The field @data points to the first glyph's first byte. The value @charcount + * gives the number of glyphs in the font. There are no empty scanlines between + * two adjacent glyphs. + */ +struct vc_font { + unsigned int width; + unsigned int height; + unsigned int charcount; + const unsigned char *data; +}; + /* * Example: vc_data of a console that was scrolled 3 lines down. * @@ -122,7 +147,7 @@ struct vc_data { unsigned long vc_pos; /* Cursor address */ /* fonts */ unsigned short vc_hi_font_mask; /* [#] Attribute set for upper 256 chars of font or 0 if not supported */ - struct console_font vc_font; /* Current VC font set */ + struct vc_font vc_font; /* Current VC font set */ unsigned short vc_video_erase_char; /* Background erase character */ /* VT terminal data */ unsigned int vc_state; /* Escape sequence parser state */ -- cgit v1.2.3 From e370d84b79ad28ecf9a9e1dad967aa64dbfbd8d8 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:46 +0100 Subject: vt: Calculate font-buffer size with vc_font_size() In fbcon, fbcon_resize() computes the size of the font buffer from the values stored in vc_font. Move these calculations to the dedicated helpers vc_font_pitch() and vc_font_size(). Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/fbdev/core/fbcon.c | 9 ++------- include/linux/console_struct.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 247bb90c08d3..103e91c8d874 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2037,7 +2037,6 @@ static void updatescrollmode(struct fbcon_display *p, } #define PITCH(w) (((w) + 7) >> 3) -#define CALC_FONTSZ(h, p, c) ((h) * (p) * (c)) /* size = height * pitch * charcount */ static int fbcon_resize(struct vc_data *vc, unsigned int width, unsigned int height, bool from_user) @@ -2049,8 +2048,7 @@ static int fbcon_resize(struct vc_data *vc, unsigned int width, int x_diff, y_diff, virt_w, virt_h, virt_fw, virt_fh; if (p->userfont && FNTSIZE(vc->vc_font.data)) { - int size; - int pitch = PITCH(vc->vc_font.width); + unsigned int size = vc_font_size(&vc->vc_font); /* * If user font, ensure that a possible change to user font @@ -2059,10 +2057,7 @@ static int fbcon_resize(struct vc_data *vc, unsigned int width, * charcount can change and cannot be used to determine the * font data allocated size. */ - if (pitch <= 0) - return -EINVAL; - size = CALC_FONTSZ(vc->vc_font.height, pitch, vc->vc_font.charcount); - if (size > FNTSIZE(vc->vc_font.data)) + if (!size || size > FNTSIZE(vc->vc_font.data)) return -EINVAL; } diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index ea0cdf4278a3..771cba16cb54 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -83,6 +83,34 @@ struct vc_font { const unsigned char *data; }; +/** + * vc_font_pitch - Calculates the number of bytes between two adjacent scanlines + * @font: The VC font + * + * Returns: + * The number of bytes between two adjacent scanlines in the font data + */ +static inline unsigned int vc_font_pitch(const struct vc_font *font) +{ + return DIV_ROUND_UP(font->width, 8); +} + +/** + * vc_font_size - Calculates the size of the font data in bytes + * @font: The VC font + * + * vc_font_size() calculates the number of bytes of font data in the + * font specified by @font. The function calculates the size from the + * font parameters. + * + * Returns: + * The size of the font data in bytes. + */ +static inline unsigned int vc_font_size(const struct vc_font *font) +{ + return font->height * vc_font_pitch(font) * font->charcount; +} + /* * Example: vc_data of a console that was scrolled 3 lines down. * -- cgit v1.2.3 From 773ac24c44614ad1d5a96258534160c4a0d48d72 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:48 +0100 Subject: lib/fonts: Remove FNTCHARCNT() The character count in the font data is unused. The internal fonts also do not set it. Remove FNTCHARCNT(). Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/console/newport_con.c | 1 - include/linux/font.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c index 337e04236d6d..259a63fc7789 100644 --- a/drivers/video/console/newport_con.c +++ b/drivers/video/console/newport_con.c @@ -516,7 +516,6 @@ static int newport_set_font(int unit, const struct console_font *op, new_data += FONT_EXTRA_WORDS * sizeof(int); FNTSIZE(new_data) = size; - FNTCHARCNT(new_data) = op->charcount; REFCOUNT(new_data) = 0; /* usage counter */ FNTSUM(new_data) = 0; diff --git a/include/linux/font.h b/include/linux/font.h index fd8625cd76b2..d929c5fa32ca 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -68,7 +68,6 @@ extern const struct font_desc *get_default_font(int xres, int yres, /* Extra word getters */ #define REFCOUNT(fd) (((int *)(fd))[-1]) #define FNTSIZE(fd) (((int *)(fd))[-2]) -#define FNTCHARCNT(fd) (((int *)(fd))[-3]) #define FNTSUM(fd) (((int *)(fd))[-4]) #define FONT_EXTRA_WORDS 4 -- cgit v1.2.3 From 04bd5abc8cbebc1bd7e02471a8e3af51b8aad029 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:49 +0100 Subject: lib/fonts: Store font data as font_data_t; update consoles Store font data as pointer to font_data_t instead of unsigned char. Update consoles. Pointers to font data refer to the raw data. There is a hidden header before the data that contains additional state. Document the existing layout and semantics of font_data_t. The data field in struct vc_font can be used by any console. Therefore it still points to plain data without the additional header. Fbcon sets its value from struct fbcon_display.fontdata. Hence, update the size test in fbcon_resize() to use struct fbcon_display.fontdata instead of struct vc_font.data. v3: - fix typos (Helge) v2: - 'Font lookup' -> 'Font description' in Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/console/newport_con.c | 17 +++++++------- drivers/video/fbdev/core/fbcon.c | 44 ++++++++++++++++++++-------------- drivers/video/fbdev/core/fbcon.h | 3 ++- include/linux/font.h | 47 ++++++++++++++++++++++++++++++++++++- 4 files changed, 84 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c index 259a63fc7789..857b85df360c 100644 --- a/drivers/video/console/newport_con.c +++ b/drivers/video/console/newport_con.c @@ -33,9 +33,9 @@ #define NEWPORT_LEN 0x10000 -#define FONT_DATA ((unsigned char *)font_vga_8x16.data) +#define FONT_DATA font_vga_8x16.data -static unsigned char *font_data[MAX_NR_CONSOLES]; +static font_data_t *font_data[MAX_NR_CONSOLES]; static struct newport_regs *npregs; static unsigned long newport_addr; @@ -370,9 +370,9 @@ static void newport_clear(struct vc_data *vc, unsigned int sy, unsigned int sx, static void newport_putc(struct vc_data *vc, u16 charattr, unsigned int ypos, unsigned int xpos) { - unsigned char *p; + const unsigned char *p; - p = &font_data[vc->vc_num][(charattr & 0xff) << 4]; + p = &font_data_buf(font_data[vc->vc_num])[(charattr & 0xff) << 4]; charattr = (charattr >> 8) & 0xff; xpos <<= 3; ypos <<= 4; @@ -400,7 +400,7 @@ static void newport_putcs(struct vc_data *vc, const u16 *s, unsigned int count, unsigned int ypos, unsigned int xpos) { - unsigned char *p; + const unsigned char *p; unsigned int i; u16 charattr; @@ -424,7 +424,7 @@ static void newport_putcs(struct vc_data *vc, const u16 *s, NPORT_DMODE0_L32); for (i = 0; i < count; i++, xpos += 8) { - p = &font_data[vc->vc_num][(scr_readw(s++) & 0xff) << 4]; + p = &font_data_buf(font_data[vc->vc_num])[(scr_readw(s++) & 0xff) << 4]; newport_wait(npregs); @@ -503,7 +503,8 @@ static int newport_set_font(int unit, const struct console_font *op, int h = op->height; int size = h * op->charcount; int i; - unsigned char *new_data, *data = op->data, *p; + font_data_t *new_data; + unsigned char *data = op->data, *p; /* ladis: when I grow up, there will be a day... and more sizes will * be supported ;-) */ @@ -519,7 +520,7 @@ static int newport_set_font(int unit, const struct console_font *op, REFCOUNT(new_data) = 0; /* usage counter */ FNTSUM(new_data) = 0; - p = new_data; + p = (unsigned char *)font_data_buf(new_data); for (i = 0; i < op->charcount; i++) { memcpy(p, data, h); data += 32; diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 103e91c8d874..8d7840b9ebad 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -1019,8 +1019,10 @@ static const char *fbcon_startup(void) info->pixmap.blit_y); vc->vc_font.width = font->width; vc->vc_font.height = font->height; - vc->vc_font.data = (void *)(p->fontdata = font->data); + vc->vc_font.data = font_data_buf(font->data); vc->vc_font.charcount = font->charcount; + + p->fontdata = font->data; } cols = FBCON_SWAP(par->rotate, info->var.xres, info->var.yres); @@ -1078,11 +1080,12 @@ static void fbcon_init(struct vc_data *vc, bool init) if (t->fontdata) { struct vc_data *fvc = vc_cons[fg_console].d; - vc->vc_font.data = (void *)(p->fontdata = - fvc->vc_font.data); + vc->vc_font.data = fvc->vc_font.data; vc->vc_font.width = fvc->vc_font.width; vc->vc_font.height = fvc->vc_font.height; vc->vc_font.charcount = fvc->vc_font.charcount; + + p->fontdata = t->fontdata; p->userfont = t->userfont; if (p->userfont) @@ -1097,8 +1100,10 @@ static void fbcon_init(struct vc_data *vc, bool init) info->pixmap.blit_y); vc->vc_font.width = font->width; vc->vc_font.height = font->height; - vc->vc_font.data = (void *)(p->fontdata = font->data); + vc->vc_font.data = font_data_buf(font->data); vc->vc_font.charcount = font->charcount; + + p->fontdata = font->data; } } @@ -1409,11 +1414,12 @@ static void fbcon_set_disp(struct fb_info *info, struct fb_var_screeninfo *var, svc = *default_mode; t = &fb_display[svc->vc_num]; - if (!vc->vc_font.data) { - vc->vc_font.data = (void *)(p->fontdata = t->fontdata); + if (!p->fontdata) { + vc->vc_font.data = font_data_buf(t->fontdata); vc->vc_font.width = (*default_mode)->vc_font.width; vc->vc_font.height = (*default_mode)->vc_font.height; vc->vc_font.charcount = (*default_mode)->vc_font.charcount; + p->fontdata = t->fontdata; p->userfont = t->userfont; if (p->userfont) REFCOUNT(p->fontdata)++; @@ -2047,7 +2053,7 @@ static int fbcon_resize(struct vc_data *vc, unsigned int width, struct fb_var_screeninfo var = info->var; int x_diff, y_diff, virt_w, virt_h, virt_fw, virt_fh; - if (p->userfont && FNTSIZE(vc->vc_font.data)) { + if (p->userfont && FNTSIZE(p->fontdata)) { unsigned int size = vc_font_size(&vc->vc_font); /* @@ -2057,7 +2063,7 @@ static int fbcon_resize(struct vc_data *vc, unsigned int width, * charcount can change and cannot be used to determine the * font data allocated size. */ - if (!size || size > FNTSIZE(vc->vc_font.data)) + if (!size || size > FNTSIZE(p->fontdata)) return -EINVAL; } @@ -2281,7 +2287,8 @@ static bool fbcon_blank(struct vc_data *vc, enum vesa_blank_mode blank, static int fbcon_get_font(struct vc_data *vc, struct console_font *font, unsigned int vpitch) { - const u8 *fontdata = vc->vc_font.data; + struct fbcon_display *p = &fb_display[vc->vc_num]; + font_data_t *fontdata = p->fontdata; u8 *data = font->data; int i, j; @@ -2406,16 +2413,18 @@ static void set_vc_hi_font(struct vc_data *vc, bool set) } static int fbcon_do_set_font(struct vc_data *vc, int w, int h, int charcount, - const u8 * data, int userfont) + font_data_t *data, int userfont) { struct fb_info *info = fbcon_info_from_console(vc->vc_num); struct fbcon_par *par = info->fbcon_par; struct fbcon_display *p = &fb_display[vc->vc_num]; int resize, ret, old_userfont, old_width, old_height, old_charcount; + font_data_t *old_fontdata = p->fontdata; const u8 *old_data = vc->vc_font.data; resize = (w != vc->vc_font.width) || (h != vc->vc_font.height); - vc->vc_font.data = (void *)(p->fontdata = data); + p->fontdata = data; + vc->vc_font.data = font_data_buf(p->fontdata); old_userfont = p->userfont; if ((p->userfont = userfont)) REFCOUNT(data)++; @@ -2448,12 +2457,12 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h, int charcount, update_screen(vc); } - if (old_userfont && (--REFCOUNT(old_data) == 0)) - kfree(old_data - FONT_EXTRA_WORDS * sizeof(int)); + if (old_userfont && (--REFCOUNT(old_fontdata) == 0)) + kfree(old_fontdata - FONT_EXTRA_WORDS * sizeof(int)); return 0; err_out: - p->fontdata = old_data; + p->fontdata = old_fontdata; vc->vc_font.data = old_data; if (userfont) { @@ -2483,7 +2492,8 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, int h = font->height; int size, alloc_size; int i, csum; - u8 *new_data, *data = font->data; + font_data_t *new_data; + u8 *data = font->data; int pitch = PITCH(font->width); /* Is there a reason why fbconsole couldn't handle any charcount >256? @@ -2522,13 +2532,13 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, if (!new_data) return -ENOMEM; - memset(new_data, 0, FONT_EXTRA_WORDS * sizeof(int)); + memset((u8 *)new_data, 0, FONT_EXTRA_WORDS * sizeof(int)); new_data += FONT_EXTRA_WORDS * sizeof(int); FNTSIZE(new_data) = size; REFCOUNT(new_data) = 0; /* usage counter */ for (i=0; i< charcount; i++) { - memcpy(new_data + i*h*pitch, data + i*vpitch*pitch, h*pitch); + memcpy((u8 *)new_data + i * h * pitch, data + i * vpitch * pitch, h * pitch); } /* Since linux has a nice crc32 function use it for counting font diff --git a/drivers/video/fbdev/core/fbcon.h b/drivers/video/fbdev/core/fbcon.h index 3f4386a40237..d26ee7860cf5 100644 --- a/drivers/video/fbdev/core/fbcon.h +++ b/drivers/video/fbdev/core/fbcon.h @@ -11,6 +11,7 @@ #ifndef _VIDEO_FBCON_H #define _VIDEO_FBCON_H +#include #include #include #include @@ -25,7 +26,7 @@ struct fbcon_display { /* Filled in by the low-level console driver */ - const u_char *fontdata; + font_data_t *fontdata; int userfont; /* != 0 if fontdata kmalloc()ed */ #ifdef CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION u_short scrollmode; /* Scroll Method, use fb_scrollmode() */ diff --git a/include/linux/font.h b/include/linux/font.h index d929c5fa32ca..746a0996a018 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -13,12 +13,57 @@ #include +/* + * font_data_t and helpers + */ + +/** + * font_data_t - Raw font data + * + * Values of type font_data_t store a pointer to raw font data. The format + * is monochrome. Each bit sets a pixel of a stored glyph. Font data does + * not store geometry information for the individual glyphs. Users of the + * font have to store glyph size, pitch and character count separately. + * + * Font data in font_data_t is not equivalent to raw u8. Each pointer stores + * an additional hidden header before the font data. The layout is + * + * +------+-----------------------------+ + * | -16 | CRC32 Checksum (optional) | + * | -12 | | + * | -8 | Number of data bytes | + * | -4 | Reference count | + * +------+-----------------------------+ + * | 0 | Data buffer | + * | ... | | + * +------+-----------------------------+ + * + * Use helpers to access font_data_t. Use font_data_buf() to get the stored data. + */ +typedef const unsigned char font_data_t; + +/** + * font_data_buf() - Returns the font data as raw bytes + * @fd: The font data + * + * Returns: + * The raw font data. The provided buffer is read-only. + */ +static inline const unsigned char *font_data_buf(font_data_t *fd) +{ + return (const unsigned char *)fd; +} + +/* + * Font description + */ + struct font_desc { int idx; const char *name; unsigned int width, height; unsigned int charcount; - const void *data; + font_data_t *data; int pref; }; -- cgit v1.2.3 From e2e000a0b22036a72474088d3399097ff47ace02 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:50 +0100 Subject: lib/fonts: Read font size with font_data_size() Add font_data_size() and update consoles to use it. Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/console/newport_con.c | 2 +- drivers/video/fbdev/core/fbcon.c | 14 +++++++------- include/linux/font.h | 2 ++ lib/fonts/fonts.c | 21 +++++++++++++++++++++ 4 files changed, 31 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c index 857b85df360c..00ace2940aa0 100644 --- a/drivers/video/console/newport_con.c +++ b/drivers/video/console/newport_con.c @@ -530,7 +530,7 @@ static int newport_set_font(int unit, const struct console_font *op, /* check if font is already used by other console */ for (i = 0; i < MAX_NR_CONSOLES; i++) { if (font_data[i] != FONT_DATA - && FNTSIZE(font_data[i]) == size + && font_data_size(font_data[i]) == size && !memcmp(font_data[i], new_data, size)) { kfree(new_data - FONT_EXTRA_WORDS * sizeof(int)); /* current font is the same as the new one */ diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 8d7840b9ebad..fa8f3e4196de 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2053,7 +2053,7 @@ static int fbcon_resize(struct vc_data *vc, unsigned int width, struct fb_var_screeninfo var = info->var; int x_diff, y_diff, virt_w, virt_h, virt_fw, virt_fh; - if (p->userfont && FNTSIZE(p->fontdata)) { + if (p->userfont && font_data_size(p->fontdata)) { unsigned int size = vc_font_size(&vc->vc_font); /* @@ -2063,7 +2063,7 @@ static int fbcon_resize(struct vc_data *vc, unsigned int width, * charcount can change and cannot be used to determine the * font data allocated size. */ - if (!size || size > FNTSIZE(p->fontdata)) + if (!size || size > font_data_size(p->fontdata)) return -EINVAL; } @@ -2302,7 +2302,7 @@ static int fbcon_get_font(struct vc_data *vc, struct console_font *font, unsigne if (font->width <= 8) { j = vc->vc_font.height; - if (font->charcount * j > FNTSIZE(fontdata)) + if (font->charcount * j > font_data_size(fontdata)) return -EINVAL; for (i = 0; i < font->charcount; i++) { @@ -2313,7 +2313,7 @@ static int fbcon_get_font(struct vc_data *vc, struct console_font *font, unsigne } } else if (font->width <= 16) { j = vc->vc_font.height * 2; - if (font->charcount * j > FNTSIZE(fontdata)) + if (font->charcount * j > font_data_size(fontdata)) return -EINVAL; for (i = 0; i < font->charcount; i++) { @@ -2323,7 +2323,7 @@ static int fbcon_get_font(struct vc_data *vc, struct console_font *font, unsigne fontdata += j; } } else if (font->width <= 24) { - if (font->charcount * (vc->vc_font.height * sizeof(u32)) > FNTSIZE(fontdata)) + if (font->charcount * (vc->vc_font.height * sizeof(u32)) > font_data_size(fontdata)) return -EINVAL; for (i = 0; i < font->charcount; i++) { @@ -2338,7 +2338,7 @@ static int fbcon_get_font(struct vc_data *vc, struct console_font *font, unsigne } } else { j = vc->vc_font.height * 4; - if (font->charcount * j > FNTSIZE(fontdata)) + if (font->charcount * j > font_data_size(fontdata)) return -EINVAL; for (i = 0; i < font->charcount; i++) { @@ -2553,7 +2553,7 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, if (fb_display[i].userfont && fb_display[i].fontdata && FNTSUM(fb_display[i].fontdata) == csum && - FNTSIZE(fb_display[i].fontdata) == size && + font_data_size(fb_display[i].fontdata) == size && tmp->vc_font.width == w && !memcmp(fb_display[i].fontdata, new_data, size)) { kfree(new_data - FONT_EXTRA_WORDS * sizeof(int)); diff --git a/include/linux/font.h b/include/linux/font.h index 746a0996a018..5b8557813c5c 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -54,6 +54,8 @@ static inline const unsigned char *font_data_buf(font_data_t *fd) return (const unsigned char *)fd; } +unsigned int font_data_size(font_data_t *fd); + /* * Font description */ diff --git a/lib/fonts/fonts.c b/lib/fonts/fonts.c index a7f118b30171..8c9a6762061c 100644 --- a/lib/fonts/fonts.c +++ b/lib/fonts/fonts.c @@ -20,6 +20,27 @@ #endif #include +/* + * Helpers for font_data_t + */ + +/** + * font_data_size - Return size of the font data in bytes + * @fd: Font data + * + * Returns: + * The number of bytes in the given font data. + */ +unsigned int font_data_size(font_data_t *fd) +{ + return FNTSIZE(fd); +} +EXPORT_SYMBOL_GPL(font_data_size); + +/* + * Font lookup + */ + static const struct font_desc *fonts[] = { #ifdef CONFIG_FONT_8x8 &font_vga_8x8, -- cgit v1.2.3 From 1de371b1f1b02dc82da598f9707089229699a604 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:51 +0100 Subject: lib/fonts: Manage font-data lifetime with font_data_get/_put() Add font_data_get() and font_data_put(). Update consoles to use them over REFCOUNT() and plain kfree(). Newly allocated font data starts with a reference count of 1. Loading the font puts the previously loaded font. If the reference count reaches zero, font_data_put() frees the font data. The kernel stores a refcount of zero for internal font data. Invoking font_data_get() and font_data_put() tests this internally and returns success without further operation. From the caller's perspective, getting and putting works the same for all font data. Fbcon used the userfont flag distinguish between internal fonts and fonts loaded by user space. Only the latter where refcounted. With the new helper's automatic handling of internal font data, remove the userfont flag from fbcon. Newport_con uses a default font, FONT_DATA, until user space loads custom font data. Remove all special cases for FONT_DATA, as the get and put calls' read-only handling also covers this case. v3: - fix module linker error wrt font symbols (Nathan, Arnd) - fix typos Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/console/newport_con.c | 23 ++++------- drivers/video/fbdev/core/fbcon.c | 65 +++++++++++++++---------------- drivers/video/fbdev/core/fbcon.h | 1 - include/linux/font.h | 2 + lib/fonts/fonts.c | 77 ++++++++++++++++++++++++++++++++++++- 5 files changed, 115 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c index 00ace2940aa0..dbbb787fc46e 100644 --- a/drivers/video/console/newport_con.c +++ b/drivers/video/console/newport_con.c @@ -517,7 +517,7 @@ static int newport_set_font(int unit, const struct console_font *op, new_data += FONT_EXTRA_WORDS * sizeof(int); FNTSIZE(new_data) = size; - REFCOUNT(new_data) = 0; /* usage counter */ + REFCOUNT(new_data) = 1; /* usage counter */ FNTSUM(new_data) = 0; p = (unsigned char *)font_data_buf(new_data); @@ -532,21 +532,17 @@ static int newport_set_font(int unit, const struct console_font *op, if (font_data[i] != FONT_DATA && font_data_size(font_data[i]) == size && !memcmp(font_data[i], new_data, size)) { - kfree(new_data - FONT_EXTRA_WORDS * sizeof(int)); + font_data_put(new_data); /* current font is the same as the new one */ if (i == unit) return 0; new_data = font_data[i]; + font_data_get(new_data); break; } } - /* old font is user font */ - if (font_data[unit] != FONT_DATA) { - if (--REFCOUNT(font_data[unit]) == 0) - kfree(font_data[unit] - - FONT_EXTRA_WORDS * sizeof(int)); - } - REFCOUNT(new_data)++; + + font_data_put(font_data[unit]); font_data[unit] = new_data; return 0; @@ -554,12 +550,9 @@ static int newport_set_font(int unit, const struct console_font *op, static int newport_set_def_font(int unit, struct console_font *op) { - if (font_data[unit] != FONT_DATA) { - if (--REFCOUNT(font_data[unit]) == 0) - kfree(font_data[unit] - - FONT_EXTRA_WORDS * sizeof(int)); - font_data[unit] = FONT_DATA; - } + font_data_put(font_data[unit]); + font_data[unit] = FONT_DATA; + font_data_get(font_data[unit]); return 0; } diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index fa8f3e4196de..ac59480c98cb 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -1023,6 +1023,7 @@ static const char *fbcon_startup(void) vc->vc_font.charcount = font->charcount; p->fontdata = font->data; + font_data_get(p->fontdata); } cols = FBCON_SWAP(par->rotate, info->var.xres, info->var.yres); @@ -1086,10 +1087,7 @@ static void fbcon_init(struct vc_data *vc, bool init) vc->vc_font.charcount = fvc->vc_font.charcount; p->fontdata = t->fontdata; - p->userfont = t->userfont; - - if (p->userfont) - REFCOUNT(p->fontdata)++; + font_data_get(p->fontdata); } else { const struct font_desc *font = NULL; @@ -1104,6 +1102,7 @@ static void fbcon_init(struct vc_data *vc, bool init) vc->vc_font.charcount = font->charcount; p->fontdata = font->data; + font_data_get(p->fontdata); } } @@ -1194,10 +1193,10 @@ static void fbcon_init(struct vc_data *vc, bool init) static void fbcon_free_font(struct fbcon_display *p) { - if (p->userfont && p->fontdata && (--REFCOUNT(p->fontdata) == 0)) - kfree(p->fontdata - FONT_EXTRA_WORDS * sizeof(int)); - p->fontdata = NULL; - p->userfont = 0; + if (p->fontdata) { + font_data_put(p->fontdata); + p->fontdata = NULL; + } } static void set_vc_hi_font(struct vc_data *vc, bool set); @@ -1420,9 +1419,7 @@ static void fbcon_set_disp(struct fb_info *info, struct fb_var_screeninfo *var, vc->vc_font.height = (*default_mode)->vc_font.height; vc->vc_font.charcount = (*default_mode)->vc_font.charcount; p->fontdata = t->fontdata; - p->userfont = t->userfont; - if (p->userfont) - REFCOUNT(p->fontdata)++; + font_data_get(p->fontdata); } var->activate = FB_ACTIVATE_NOW; @@ -2053,7 +2050,7 @@ static int fbcon_resize(struct vc_data *vc, unsigned int width, struct fb_var_screeninfo var = info->var; int x_diff, y_diff, virt_w, virt_h, virt_fw, virt_fh; - if (p->userfont && font_data_size(p->fontdata)) { + if (font_data_size(p->fontdata)) { unsigned int size = vc_font_size(&vc->vc_font); /* @@ -2413,21 +2410,20 @@ static void set_vc_hi_font(struct vc_data *vc, bool set) } static int fbcon_do_set_font(struct vc_data *vc, int w, int h, int charcount, - font_data_t *data, int userfont) + font_data_t *data) { struct fb_info *info = fbcon_info_from_console(vc->vc_num); struct fbcon_par *par = info->fbcon_par; struct fbcon_display *p = &fb_display[vc->vc_num]; - int resize, ret, old_userfont, old_width, old_height, old_charcount; + int resize, ret, old_width, old_height, old_charcount; font_data_t *old_fontdata = p->fontdata; const u8 *old_data = vc->vc_font.data; + font_data_get(data); + resize = (w != vc->vc_font.width) || (h != vc->vc_font.height); p->fontdata = data; vc->vc_font.data = font_data_buf(p->fontdata); - old_userfont = p->userfont; - if ((p->userfont = userfont)) - REFCOUNT(data)++; old_width = vc->vc_font.width; old_height = vc->vc_font.height; @@ -2457,24 +2453,20 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h, int charcount, update_screen(vc); } - if (old_userfont && (--REFCOUNT(old_fontdata) == 0)) - kfree(old_fontdata - FONT_EXTRA_WORDS * sizeof(int)); + if (old_fontdata) + font_data_put(old_fontdata); + return 0; err_out: p->fontdata = old_fontdata; vc->vc_font.data = old_data; - - if (userfont) { - p->userfont = old_userfont; - if (--REFCOUNT(data) == 0) - kfree(data - FONT_EXTRA_WORDS * sizeof(int)); - } - vc->vc_font.width = old_width; vc->vc_font.height = old_height; vc->vc_font.charcount = old_charcount; + font_data_put(data); + return ret; } @@ -2491,9 +2483,9 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, int w = font->width; int h = font->height; int size, alloc_size; - int i, csum; + int i, csum, ret; font_data_t *new_data; - u8 *data = font->data; + const u8 *data = font->data; int pitch = PITCH(font->width); /* Is there a reason why fbconsole couldn't handle any charcount >256? @@ -2536,7 +2528,7 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, new_data += FONT_EXTRA_WORDS * sizeof(int); FNTSIZE(new_data) = size; - REFCOUNT(new_data) = 0; /* usage counter */ + REFCOUNT(new_data) = 1; /* usage counter */ for (i=0; i< charcount; i++) { memcpy((u8 *)new_data + i * h * pitch, data + i * vpitch * pitch, h * pitch); } @@ -2550,18 +2542,21 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, for (i = first_fb_vc; i <= last_fb_vc; i++) { struct vc_data *tmp = vc_cons[i].d; - if (fb_display[i].userfont && - fb_display[i].fontdata && + if (fb_display[i].fontdata && FNTSUM(fb_display[i].fontdata) == csum && font_data_size(fb_display[i].fontdata) == size && tmp->vc_font.width == w && !memcmp(fb_display[i].fontdata, new_data, size)) { - kfree(new_data - FONT_EXTRA_WORDS * sizeof(int)); - new_data = (u8 *)fb_display[i].fontdata; + font_data_get(fb_display[i].fontdata); + font_data_put(new_data); + new_data = fb_display[i].fontdata; break; } } - return fbcon_do_set_font(vc, font->width, font->height, charcount, new_data, 1); + ret = fbcon_do_set_font(vc, font->width, font->height, charcount, new_data); + font_data_put(new_data); + + return ret; } static int fbcon_set_def_font(struct vc_data *vc, struct console_font *font, @@ -2578,7 +2573,7 @@ static int fbcon_set_def_font(struct vc_data *vc, struct console_font *font, font->width = f->width; font->height = f->height; - return fbcon_do_set_font(vc, f->width, f->height, f->charcount, f->data, 0); + return fbcon_do_set_font(vc, f->width, f->height, f->charcount, f->data); } static u16 palette_red[16]; diff --git a/drivers/video/fbdev/core/fbcon.h b/drivers/video/fbdev/core/fbcon.h index d26ee7860cf5..1e3c1ef84762 100644 --- a/drivers/video/fbdev/core/fbcon.h +++ b/drivers/video/fbdev/core/fbcon.h @@ -27,7 +27,6 @@ struct fbcon_display { /* Filled in by the low-level console driver */ font_data_t *fontdata; - int userfont; /* != 0 if fontdata kmalloc()ed */ #ifdef CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION u_short scrollmode; /* Scroll Method, use fb_scrollmode() */ #endif diff --git a/include/linux/font.h b/include/linux/font.h index 5b8557813c5c..dd319d0f0201 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -54,6 +54,8 @@ static inline const unsigned char *font_data_buf(font_data_t *fd) return (const unsigned char *)fd; } +void font_data_get(font_data_t *fd); +bool font_data_put(font_data_t *fd); unsigned int font_data_size(font_data_t *fd); /* diff --git a/lib/fonts/fonts.c b/lib/fonts/fonts.c index 8c9a6762061c..d25efd8d6c31 100644 --- a/lib/fonts/fonts.c +++ b/lib/fonts/fonts.c @@ -12,18 +12,91 @@ * for more details. */ +#include +#include #include -#include +#include #include +#include + #if defined(__mc68000__) #include #endif -#include /* * Helpers for font_data_t */ +static struct font_data *to_font_data_struct(font_data_t *fd) +{ + return container_of(fd, struct font_data, data[0]); +} + +static bool font_data_is_internal(font_data_t *fd) +{ + return !REFCOUNT(fd); /* internal fonts have no reference counting */ +} + +static void font_data_free(font_data_t *fd) +{ + kfree(to_font_data_struct(fd)); +} + +/** + * font_data_get - Acquires a reference on font data + * @fd: Font data + * + * Font data from user space is reference counted. The helper + * font_data_get() increases the reference counter by one. Invoke + * font_data_put() to release the reference. + * + * Internal font data is located in read-only memory. In this case + * the helper returns success without modifying the counter field. + * It is still required to call font_data_put() on internal font data. + */ +void font_data_get(font_data_t *fd) +{ + if (font_data_is_internal(fd)) + return; /* never ref static data */ + + if (WARN_ON(!REFCOUNT(fd))) + return; /* should never be 0 */ + ++REFCOUNT(fd); +} +EXPORT_SYMBOL_GPL(font_data_get); + +/** + * font_data_put - Release a reference on font data + * @fd: Font data + * + * Font data from user space is reference counted. The helper + * font_data_put() decreases the reference counter by one. If this was + * the final reference, it frees the allocated memory. + * + * Internal font data is located in read-only memory. In this case + * the helper returns success without modifying the counter field. + * + * Returns: + * True if the font data's memory buffer has been freed, false otherwise. + */ +bool font_data_put(font_data_t *fd) +{ + unsigned int count; + + if (font_data_is_internal(fd)) + return false; /* never unref static data */ + + if (WARN_ON(!REFCOUNT(fd))) + return false; /* should never be 0 */ + + count = --REFCOUNT(fd); + if (!count) + font_data_free(fd); + + return !count; +} +EXPORT_SYMBOL_GPL(font_data_put); + /** * font_data_size - Return size of the font data in bytes * @fd: Font data -- cgit v1.2.3 From 1e3c49aa03fbfeea595ca0c9a4ebaf1e9cc078af Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:52 +0100 Subject: lib/fonts: Compare font data for equality with font_data_is_equal() Add font_data_is_equal() and update consoles to use it. Font data is equal if it has the same size and contains the same values on all bytes. Only fbcon uses a crc32 checksum. If set in both operands the checksums have to be equal. The new helper also guarantees to not compare internal fonts against fonts from user space. Internal fonts cannot be ref-counted, so making them equal to user-space fonts with the same byte sequence results in undefined behavior. The test only compares data buffers. Their interpretation is up each console. Therefore remove a width test in fbcon_set_font(). v3: - rebase onto font_data_{get,put}() Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/console/newport_con.c | 4 +--- drivers/video/fbdev/core/fbcon.c | 7 +------ include/linux/font.h | 1 + lib/fonts/fonts.c | 26 ++++++++++++++++++++++++++ 4 files changed, 29 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c index dbbb787fc46e..db0228bce00e 100644 --- a/drivers/video/console/newport_con.c +++ b/drivers/video/console/newport_con.c @@ -529,9 +529,7 @@ static int newport_set_font(int unit, const struct console_font *op, /* check if font is already used by other console */ for (i = 0; i < MAX_NR_CONSOLES; i++) { - if (font_data[i] != FONT_DATA - && font_data_size(font_data[i]) == size - && !memcmp(font_data[i], new_data, size)) { + if (font_data_is_equal(font_data[i], new_data)) { font_data_put(new_data); /* current font is the same as the new one */ if (i == unit) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index ac59480c98cb..00255ac92e42 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2540,13 +2540,8 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, FNTSUM(new_data) = csum; /* Check if the same font is on some other console already */ for (i = first_fb_vc; i <= last_fb_vc; i++) { - struct vc_data *tmp = vc_cons[i].d; - if (fb_display[i].fontdata && - FNTSUM(fb_display[i].fontdata) == csum && - font_data_size(fb_display[i].fontdata) == size && - tmp->vc_font.width == w && - !memcmp(fb_display[i].fontdata, new_data, size)) { + font_data_is_equal(fb_display[i].fontdata, new_data)) { font_data_get(fb_display[i].fontdata); font_data_put(new_data); new_data = fb_display[i].fontdata; diff --git a/include/linux/font.h b/include/linux/font.h index dd319d0f0201..58bf3c64cabb 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -57,6 +57,7 @@ static inline const unsigned char *font_data_buf(font_data_t *fd) void font_data_get(font_data_t *fd); bool font_data_put(font_data_t *fd); unsigned int font_data_size(font_data_t *fd); +bool font_data_is_equal(font_data_t *lhs, font_data_t *rhs); /* * Font description diff --git a/lib/fonts/fonts.c b/lib/fonts/fonts.c index d25efd8d6c31..3fb76d185647 100644 --- a/lib/fonts/fonts.c +++ b/lib/fonts/fonts.c @@ -110,6 +110,32 @@ unsigned int font_data_size(font_data_t *fd) } EXPORT_SYMBOL_GPL(font_data_size); +/** + * font_data_is_equal - Compares font data for equality + * @lhs: Left-hand side font data + * @rhs: Right-hand-size font data + * + * Font data is equal if is constain the same sequence of values. The + * helper also use the checksum, if both arguments contain it. Font data + * coming from different origins, internal or from user space, is never + * equal. Allowing this would break reference counting. + * + * Returns: + * True if the given font data is equal, false otherwise. + */ +bool font_data_is_equal(font_data_t *lhs, font_data_t *rhs) +{ + if (font_data_is_internal(lhs) != font_data_is_internal(rhs)) + return false; + if (font_data_size(lhs) != font_data_size(rhs)) + return false; + if (FNTSUM(lhs) && FNTSUM(rhs) && FNTSUM(lhs) != FNTSUM(rhs)) + return false; + + return !memcmp(lhs, rhs, FNTSIZE(lhs)); +} +EXPORT_SYMBOL_GPL(font_data_is_equal); + /* * Font lookup */ -- cgit v1.2.3 From 514d0de7cf403144d3e6c5b9fabb1ce4c15974ca Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:53 +0100 Subject: lib/fonts: Create font_data_t from struct console_font with font_data_import() Add font_data_import() and update consoles to use it. The implementation of font_data_import() is based on code from fbcon, which supports overflow checks and crc32 checksums. Fbcon uses the crc32 checksum. Newport_con now implements the same overflow checks as fbcon. As before, this console does not support checksums, which are optional. Newport_con can now also handle input font data with a vertical pitch other than 32 bytes. (The vertical pitch is the offset between two glyphs in the font data.) As an internal change, remove the const qualifier from the data field if struct font_data. This allows font_data_import() to write the data without type casting. For all users of the font data via font_data_t, the stored data is still read only. v3: - fix typos Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- drivers/video/console/newport_con.c | 22 +++---------- drivers/video/fbdev/core/fbcon.c | 38 +++-------------------- include/linux/font.h | 6 +++- lib/fonts/fonts.c | 62 +++++++++++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c index db0228bce00e..e88ff3a93b77 100644 --- a/drivers/video/console/newport_con.c +++ b/drivers/video/console/newport_con.c @@ -501,31 +501,17 @@ static int newport_set_font(int unit, const struct console_font *op, { int w = op->width; int h = op->height; - int size = h * op->charcount; int i; font_data_t *new_data; - unsigned char *data = op->data, *p; /* ladis: when I grow up, there will be a day... and more sizes will * be supported ;-) */ - if ((w != 8) || (h != 16) || (vpitch != 32) - || (op->charcount != 256 && op->charcount != 512)) + if (w != 8 || h != 16 || (op->charcount != 256 && op->charcount != 512)) return -EINVAL; - if (!(new_data = kmalloc(FONT_EXTRA_WORDS * sizeof(int) + size, - GFP_USER))) return -ENOMEM; - - new_data += FONT_EXTRA_WORDS * sizeof(int); - FNTSIZE(new_data) = size; - REFCOUNT(new_data) = 1; /* usage counter */ - FNTSUM(new_data) = 0; - - p = (unsigned char *)font_data_buf(new_data); - for (i = 0; i < op->charcount; i++) { - memcpy(p, data, h); - data += 32; - p += h; - } + new_data = font_data_import(op, vpitch, NULL); + if (IS_ERR(new_data)) + return PTR_ERR(new_data); /* check if font is already used by other console */ for (i = 0; i < MAX_NR_CONSOLES; i++) { diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 00255ac92e42..53677c09a0ec 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2039,8 +2039,6 @@ static void updatescrollmode(struct fbcon_display *p, updatescrollmode_accel(p, info, vc); } -#define PITCH(w) (((w) + 7) >> 3) - static int fbcon_resize(struct vc_data *vc, unsigned int width, unsigned int height, bool from_user) { @@ -2424,7 +2422,6 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h, int charcount, resize = (w != vc->vc_font.width) || (h != vc->vc_font.height); p->fontdata = data; vc->vc_font.data = font_data_buf(p->fontdata); - old_width = vc->vc_font.width; old_height = vc->vc_font.height; old_charcount = vc->vc_font.charcount; @@ -2482,11 +2479,8 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, unsigned charcount = font->charcount; int w = font->width; int h = font->height; - int size, alloc_size; - int i, csum, ret; + int i, ret; font_data_t *new_data; - const u8 *data = font->data; - int pitch = PITCH(font->width); /* Is there a reason why fbconsole couldn't handle any charcount >256? * If not this check should be changed to charcount < 256 */ @@ -2510,34 +2504,10 @@ static int fbcon_set_font(struct vc_data *vc, const struct console_font *font, if (fbcon_invalid_charcount(info, charcount)) return -EINVAL; - /* Check for integer overflow in font size calculation */ - if (check_mul_overflow(h, pitch, &size) || - check_mul_overflow(size, charcount, &size)) - return -EINVAL; - - /* Check for overflow in allocation size calculation */ - if (check_add_overflow(FONT_EXTRA_WORDS * sizeof(int), size, &alloc_size)) - return -EINVAL; - - new_data = kmalloc(alloc_size, GFP_USER); - - if (!new_data) - return -ENOMEM; - - memset((u8 *)new_data, 0, FONT_EXTRA_WORDS * sizeof(int)); - - new_data += FONT_EXTRA_WORDS * sizeof(int); - FNTSIZE(new_data) = size; - REFCOUNT(new_data) = 1; /* usage counter */ - for (i=0; i< charcount; i++) { - memcpy((u8 *)new_data + i * h * pitch, data + i * vpitch * pitch, h * pitch); - } - - /* Since linux has a nice crc32 function use it for counting font - * checksums. */ - csum = crc32(0, new_data, size); + new_data = font_data_import(font, vpitch, crc32); + if (IS_ERR(new_data)) + return PTR_ERR(new_data); - FNTSUM(new_data) = csum; /* Check if the same font is on some other console already */ for (i = first_fb_vc; i <= last_fb_vc; i++) { if (fb_display[i].fontdata && diff --git a/include/linux/font.h b/include/linux/font.h index 58bf3c64cabb..3eb4818402c5 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -13,6 +13,8 @@ #include +struct console_font; + /* * font_data_t and helpers */ @@ -54,6 +56,8 @@ static inline const unsigned char *font_data_buf(font_data_t *fd) return (const unsigned char *)fd; } +font_data_t *font_data_import(const struct console_font *font, unsigned int vpitch, + u32 (*calc_csum)(u32, const void *, size_t)); void font_data_get(font_data_t *fd); bool font_data_put(font_data_t *fd); unsigned int font_data_size(font_data_t *fd); @@ -124,7 +128,7 @@ extern const struct font_desc *get_default_font(int xres, int yres, struct font_data { unsigned int extra[FONT_EXTRA_WORDS]; - const unsigned char data[]; + unsigned char data[]; } __packed; #endif /* _VIDEO_FONT_H */ diff --git a/lib/fonts/fonts.c b/lib/fonts/fonts.c index 3fb76d185647..16e75c3d2a0f 100644 --- a/lib/fonts/fonts.c +++ b/lib/fonts/fonts.c @@ -14,7 +14,9 @@ #include #include +#include #include +#include #include #include #include @@ -23,6 +25,8 @@ #include #endif +#define console_font_pitch(font) DIV_ROUND_UP((font)->width, 8) + /* * Helpers for font_data_t */ @@ -42,6 +46,64 @@ static void font_data_free(font_data_t *fd) kfree(to_font_data_struct(fd)); } +/** + * font_data_import - Allocates and initializes font data from user space + * @font: A font from user space + * @vpitch: The size of a single glyph in @font in bytes + * @calc_csum: An optional helper to calculate a chechsum + * + * Font data from user space must be translated to the kernel's format. The + * font's glyph geometry and data is provided in @font. The parameter @vpitch + * gives the number of bytes per glyph, including trailing bytes. + * + * The parameter @calc_csum is optional. Fbcon passes crc32() to calculate the + * font data's checksum. + * + * Returns: + * Newly initialized font data on success, or a pointer-encoded errno value otherwise. + */ +font_data_t *font_data_import(const struct console_font *font, unsigned int vpitch, + u32 (*calc_csum)(u32, const void *, size_t)) +{ + unsigned int pitch = console_font_pitch(font); + unsigned int h = font->height; + unsigned int charcount = font->charcount; + const unsigned char *data = font->data; + u32 csum = 0; + struct font_data *font_data; + int size, alloc_size; + unsigned int i; + font_data_t *fd; + + /* Check for integer overflow in font-size calculation */ + if (check_mul_overflow(h, pitch, &size) || + check_mul_overflow(size, charcount, &size)) + return ERR_PTR(-EINVAL); + + /* Check for overflow in allocation size calculation */ + if (check_add_overflow(sizeof(*font_data), size, &alloc_size)) + return ERR_PTR(-EINVAL); + + font_data = kmalloc(alloc_size, GFP_USER); + if (!font_data) + return ERR_PTR(-ENOMEM); + memset(font_data->extra, 0, sizeof(font_data->extra)); + + for (i = 0; i < charcount; ++i) + memcpy(font_data->data + i * h * pitch, data + i * vpitch * pitch, h * pitch); + + if (calc_csum) + csum = calc_csum(0, font_data->data, size); + + fd = font_data->data; + REFCOUNT(fd) = 1; /* start with reference acquired */ + FNTSIZE(fd) = size; + FNTSUM(fd) = csum; + + return fd; +} +EXPORT_SYMBOL_GPL(font_data_import); + /** * font_data_get - Acquires a reference on font data * @fd: Font data -- cgit v1.2.3 From c37bd7c8d36f760c064de2639423866dc0270997 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:54 +0100 Subject: lib/fonts: Store font data for user space with font_data_export() Add font_data_export() and update consoles to use it. The helper font_data_export() is based on code in fbcon_get_font(). It extends the size of a single glyph to match the requested vpitch, which us usually 32 bytes for fonts from user space. Internal fonts have a pitch according to the glyph's height. The implementation of font_data_export() differs in several ways from the original code. The original implementation distinguished between different pitches of the font data. This is not necessary as the pitch is a parameter in the copying. There was also special handling for a font pitch of 3 bytes, which got expanded to 4 bytes (with trailing bits on each scanline). The logic originated from long before git history exists even in the historical tree. So it is not clear why this was implemented. It is not what user space expects. The setfont utitlity loads font with 3-bytes pitches and expects to read such fonts with a 3-byte pitch. For any font width, the font pitch is always the width extended to the next multiple of 8. See [1] for the user-space font-reading code. With the changes to handling the font pitches, font_data_export() replaces the original code's various special cases with a single copying logic. v3: - fix typos (Helge) Signed-off-by: Thomas Zimmermann Link: https://github.com/legionus/kbd/blob/v2.9.0/src/libkfont/kdfontop.c#L73 # [1] Signed-off-by: Helge Deller --- drivers/video/fbdev/core/fbcon.c | 57 ++-------------------------------------- include/linux/font.h | 1 + lib/fonts/fonts.c | 40 ++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index 53677c09a0ec..8641b0b3edc4 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2282,68 +2282,15 @@ static bool fbcon_blank(struct vc_data *vc, enum vesa_blank_mode blank, static int fbcon_get_font(struct vc_data *vc, struct console_font *font, unsigned int vpitch) { - struct fbcon_display *p = &fb_display[vc->vc_num]; - font_data_t *fontdata = p->fontdata; - u8 *data = font->data; - int i, j; + const struct fbcon_display *p = &fb_display[vc->vc_num]; font->width = vc->vc_font.width; font->height = vc->vc_font.height; if (font->height > vpitch) return -ENOSPC; font->charcount = vc->vc_hi_font_mask ? 512 : 256; - if (!font->data) - return 0; - - if (font->width <= 8) { - j = vc->vc_font.height; - if (font->charcount * j > font_data_size(fontdata)) - return -EINVAL; - for (i = 0; i < font->charcount; i++) { - memcpy(data, fontdata, j); - memset(data + j, 0, vpitch - j); - data += vpitch; - fontdata += j; - } - } else if (font->width <= 16) { - j = vc->vc_font.height * 2; - if (font->charcount * j > font_data_size(fontdata)) - return -EINVAL; - - for (i = 0; i < font->charcount; i++) { - memcpy(data, fontdata, j); - memset(data + j, 0, 2*vpitch - j); - data += 2*vpitch; - fontdata += j; - } - } else if (font->width <= 24) { - if (font->charcount * (vc->vc_font.height * sizeof(u32)) > font_data_size(fontdata)) - return -EINVAL; - - for (i = 0; i < font->charcount; i++) { - for (j = 0; j < vc->vc_font.height; j++) { - *data++ = fontdata[0]; - *data++ = fontdata[1]; - *data++ = fontdata[2]; - fontdata += sizeof(u32); - } - memset(data, 0, 3 * (vpitch - j)); - data += 3 * (vpitch - j); - } - } else { - j = vc->vc_font.height * 4; - if (font->charcount * j > font_data_size(fontdata)) - return -EINVAL; - - for (i = 0; i < font->charcount; i++) { - memcpy(data, fontdata, j); - memset(data + j, 0, 4 * vpitch - j); - data += 4 * vpitch; - fontdata += j; - } - } - return 0; + return font_data_export(p->fontdata, font, vpitch); } /* set/clear vc_hi_font_mask and update vc attrs accordingly */ diff --git a/include/linux/font.h b/include/linux/font.h index 3eb4818402c5..d80db66a5c17 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -62,6 +62,7 @@ void font_data_get(font_data_t *fd); bool font_data_put(font_data_t *fd); unsigned int font_data_size(font_data_t *fd); bool font_data_is_equal(font_data_t *lhs, font_data_t *rhs); +int font_data_export(font_data_t *fd, struct console_font *font, unsigned int vpitch); /* * Font description diff --git a/lib/fonts/fonts.c b/lib/fonts/fonts.c index 16e75c3d2a0f..a861b375e35d 100644 --- a/lib/fonts/fonts.c +++ b/lib/fonts/fonts.c @@ -198,6 +198,46 @@ bool font_data_is_equal(font_data_t *lhs, font_data_t *rhs) } EXPORT_SYMBOL_GPL(font_data_is_equal); +/** + * font_data_export - Stores font data for user space + * @fd: Font data + * @font: A font for user space + * @vpitch: The size of a single glyph in @font in bytes + * + * Store the font data given in @fd to the font in @font. Values and + * pointers in @font are pre-initialized. This helper mostly checks some + * corner cases and translates glyph sizes according to the value given + * @vpitch. + * + * Returns: + * 0 on success, or a negative errno code otherwise. + */ +int font_data_export(font_data_t *fd, struct console_font *font, unsigned int vpitch) +{ + const unsigned char *font_data = font_data_buf(fd); + unsigned char *data = font->data; + unsigned int pitch = console_font_pitch(font); + unsigned int glyphsize, i; + + if (!font->width || !font->height || !font->charcount || !font->data) + return 0; + + glyphsize = font->height * pitch; + + if (font->charcount * glyphsize > font_data_size(fd)) + return -EINVAL; + + for (i = 0; i < font->charcount; i++) { + memcpy(data, font_data, glyphsize); + memset(data + glyphsize, 0, pitch * vpitch - glyphsize); + data += pitch * vpitch; + font_data += glyphsize; + } + + return 0; +} +EXPORT_SYMBOL_GPL(font_data_export); + /* * Font lookup */ -- cgit v1.2.3 From db65872b38dc9f18a62669d6ae1e4ec7868a85a9 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 9 Mar 2026 15:14:55 +0100 Subject: lib/fonts: Remove internal symbols and macros from public header file Define access macros for font_data_t in fonts.c. Define struct font_data and declare most of the font symbols in the internal header font.h, where they can only be seen by the font code. Also move font indices into internal font.h. They appear to be unused though. There is m86k assembly code that operates on the idx field, so leave them in place for now. List all built-in fonts in a separate section in the public header file. v2: - do not add config guards around font symbols (Helge) - keep declaration of built-in fonts in public header Signed-off-by: Thomas Zimmermann Signed-off-by: Helge Deller --- include/linux/font.h | 57 ++++++++++++++-------------------------------- lib/fonts/font.h | 38 +++++++++++++++++++++++++++++++ lib/fonts/font_10x18.c | 2 +- lib/fonts/font_6x10.c | 3 ++- lib/fonts/font_6x11.c | 2 +- lib/fonts/font_6x8.c | 3 ++- lib/fonts/font_7x14.c | 2 +- lib/fonts/font_8x16.c | 3 ++- lib/fonts/font_8x8.c | 2 +- lib/fonts/font_acorn_8x8.c | 2 +- lib/fonts/font_mini_4x6.c | 2 +- lib/fonts/font_pearl_8x8.c | 2 +- lib/fonts/font_sun12x22.c | 3 ++- lib/fonts/font_sun8x16.c | 3 ++- lib/fonts/font_ter10x18.c | 4 +++- lib/fonts/font_ter16x32.c | 4 +++- lib/fonts/fonts.c | 8 ++++++- 17 files changed, 85 insertions(+), 55 deletions(-) create mode 100644 lib/fonts/font.h (limited to 'include/linux') diff --git a/include/linux/font.h b/include/linux/font.h index d80db66a5c17..5401f07dd6ce 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -77,36 +77,6 @@ struct font_desc { int pref; }; -#define VGA8x8_IDX 0 -#define VGA8x16_IDX 1 -#define PEARL8x8_IDX 2 -#define VGA6x11_IDX 3 -#define FONT7x14_IDX 4 -#define FONT10x18_IDX 5 -#define SUN8x16_IDX 6 -#define SUN12x22_IDX 7 -#define ACORN8x8_IDX 8 -#define MINI4x6_IDX 9 -#define FONT6x10_IDX 10 -#define TER16x32_IDX 11 -#define FONT6x8_IDX 12 -#define TER10x18_IDX 13 - -extern const struct font_desc font_vga_8x8, - font_vga_8x16, - font_pearl_8x8, - font_vga_6x11, - font_7x14, - font_10x18, - font_sun_8x16, - font_sun_12x22, - font_acorn_8x8, - font_mini_4x6, - font_6x10, - font_ter_16x32, - font_6x8, - font_ter_10x18; - /* Find a font with a specific name */ extern const struct font_desc *find_font(const char *name); @@ -120,16 +90,23 @@ extern const struct font_desc *get_default_font(int xres, int yres, /* Max. length for the name of a predefined font */ #define MAX_FONT_NAME 32 -/* Extra word getters */ -#define REFCOUNT(fd) (((int *)(fd))[-1]) -#define FNTSIZE(fd) (((int *)(fd))[-2]) -#define FNTSUM(fd) (((int *)(fd))[-4]) - -#define FONT_EXTRA_WORDS 4 +/* + * Built-in fonts + */ -struct font_data { - unsigned int extra[FONT_EXTRA_WORDS]; - unsigned char data[]; -} __packed; +extern const struct font_desc font_10x18; +extern const struct font_desc font_6x10; +extern const struct font_desc font_6x8; +extern const struct font_desc font_7x14; +extern const struct font_desc font_acorn_8x8; +extern const struct font_desc font_mini_4x6; +extern const struct font_desc font_pearl_8x8; +extern const struct font_desc font_sun_12x22; +extern const struct font_desc font_sun_8x16; +extern const struct font_desc font_ter_10x18; +extern const struct font_desc font_ter_16x32; +extern const struct font_desc font_vga_6x11; +extern const struct font_desc font_vga_8x16; +extern const struct font_desc font_vga_8x8; #endif /* _VIDEO_FONT_H */ diff --git a/lib/fonts/font.h b/lib/fonts/font.h new file mode 100644 index 000000000000..4f1adf0b6b54 --- /dev/null +++ b/lib/fonts/font.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LIB_FONTS_FONT_H +#define _LIB_FONTS_FONT_H + +#include + +/* + * Font data + */ + +#define FONT_EXTRA_WORDS 4 + +struct font_data { + unsigned int extra[FONT_EXTRA_WORDS]; + unsigned char data[]; +} __packed; + +/* + * Built-in fonts + */ + +#define VGA8x8_IDX 0 +#define VGA8x16_IDX 1 +#define PEARL8x8_IDX 2 +#define VGA6x11_IDX 3 +#define FONT7x14_IDX 4 +#define FONT10x18_IDX 5 +#define SUN8x16_IDX 6 +#define SUN12x22_IDX 7 +#define ACORN8x8_IDX 8 +#define MINI4x6_IDX 9 +#define FONT6x10_IDX 10 +#define TER16x32_IDX 11 +#define FONT6x8_IDX 12 +#define TER10x18_IDX 13 + +#endif diff --git a/lib/fonts/font_10x18.c b/lib/fonts/font_10x18.c index 5d940db626e7..10edebc4bb74 100644 --- a/lib/fonts/font_10x18.c +++ b/lib/fonts/font_10x18.c @@ -4,7 +4,7 @@ * by Jurriaan Kalkman 06-2005 * ********************************/ -#include +#include "font.h" #define FONTDATAMAX 9216 diff --git a/lib/fonts/font_6x10.c b/lib/fonts/font_6x10.c index e65df019e0d2..660d3a371b30 100644 --- a/lib/fonts/font_6x10.c +++ b/lib/fonts/font_6x10.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include + +#include "font.h" #define FONTDATAMAX 2560 diff --git a/lib/fonts/font_6x11.c b/lib/fonts/font_6x11.c index bd76b3f6b635..671487ccc172 100644 --- a/lib/fonts/font_6x11.c +++ b/lib/fonts/font_6x11.c @@ -5,7 +5,7 @@ /* */ /**********************************************/ -#include +#include "font.h" #define FONTDATAMAX (11*256) diff --git a/lib/fonts/font_6x8.c b/lib/fonts/font_6x8.c index 06ace7792521..5811ee07f4d8 100644 --- a/lib/fonts/font_6x8.c +++ b/lib/fonts/font_6x8.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include + +#include "font.h" #define FONTDATAMAX 2048 diff --git a/lib/fonts/font_7x14.c b/lib/fonts/font_7x14.c index a2f561c9fa04..0c7475d643c8 100644 --- a/lib/fonts/font_7x14.c +++ b/lib/fonts/font_7x14.c @@ -4,7 +4,7 @@ /* by Jurriaan Kalkman 05-2005 */ /**************************************/ -#include +#include "font.h" #define FONTDATAMAX 3584 diff --git a/lib/fonts/font_8x16.c b/lib/fonts/font_8x16.c index 06ae14088514..523e95c75569 100644 --- a/lib/fonts/font_8x16.c +++ b/lib/fonts/font_8x16.c @@ -5,9 +5,10 @@ /* */ /**********************************************/ -#include #include +#include "font.h" + #define FONTDATAMAX 4096 static const struct font_data fontdata_8x16 = { diff --git a/lib/fonts/font_8x8.c b/lib/fonts/font_8x8.c index 69570b8c31af..e5b697fc9675 100644 --- a/lib/fonts/font_8x8.c +++ b/lib/fonts/font_8x8.c @@ -5,7 +5,7 @@ /* */ /**********************************************/ -#include +#include "font.h" #define FONTDATAMAX 2048 diff --git a/lib/fonts/font_acorn_8x8.c b/lib/fonts/font_acorn_8x8.c index af5fa72aa8b7..36c51016769d 100644 --- a/lib/fonts/font_acorn_8x8.c +++ b/lib/fonts/font_acorn_8x8.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Acorn-like font definition, with PC graphics characters */ -#include +#include "font.h" #define FONTDATAMAX 2048 diff --git a/lib/fonts/font_mini_4x6.c b/lib/fonts/font_mini_4x6.c index cc21dc70cfd1..dc919c160dde 100644 --- a/lib/fonts/font_mini_4x6.c +++ b/lib/fonts/font_mini_4x6.c @@ -39,7 +39,7 @@ __END__; MSBit to LSBit = left to right. */ -#include +#include "font.h" #define FONTDATAMAX 1536 diff --git a/lib/fonts/font_pearl_8x8.c b/lib/fonts/font_pearl_8x8.c index ae98ca17982e..2438b374acea 100644 --- a/lib/fonts/font_pearl_8x8.c +++ b/lib/fonts/font_pearl_8x8.c @@ -10,7 +10,7 @@ /* */ /**********************************************/ -#include +#include "font.h" #define FONTDATAMAX 2048 diff --git a/lib/fonts/font_sun12x22.c b/lib/fonts/font_sun12x22.c index 91daf5ab8b6b..2afbc144bea8 100644 --- a/lib/fonts/font_sun12x22.c +++ b/lib/fonts/font_sun12x22.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include + +#include "font.h" #define FONTDATAMAX 11264 diff --git a/lib/fonts/font_sun8x16.c b/lib/fonts/font_sun8x16.c index 81bb4eeae04e..2b7b2d8e548a 100644 --- a/lib/fonts/font_sun8x16.c +++ b/lib/fonts/font_sun8x16.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include + +#include "font.h" #define FONTDATAMAX 4096 diff --git a/lib/fonts/font_ter10x18.c b/lib/fonts/font_ter10x18.c index 80356e9d56c7..3f30b4a211ab 100644 --- a/lib/fonts/font_ter10x18.c +++ b/lib/fonts/font_ter10x18.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 -#include + #include +#include "font.h" + #define FONTDATAMAX 9216 static const struct font_data fontdata_ter10x18 = { diff --git a/lib/fonts/font_ter16x32.c b/lib/fonts/font_ter16x32.c index 5baedc573dd6..93616cffe642 100644 --- a/lib/fonts/font_ter16x32.c +++ b/lib/fonts/font_ter16x32.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 -#include + #include +#include "font.h" + #define FONTDATAMAX 16384 static const struct font_data fontdata_ter16x32 = { diff --git a/lib/fonts/fonts.c b/lib/fonts/fonts.c index a861b375e35d..5938f542906b 100644 --- a/lib/fonts/fonts.c +++ b/lib/fonts/fonts.c @@ -13,7 +13,6 @@ */ #include -#include #include #include #include @@ -25,12 +24,19 @@ #include #endif +#include "font.h" + #define console_font_pitch(font) DIV_ROUND_UP((font)->width, 8) /* * Helpers for font_data_t */ +/* Extra word getters */ +#define REFCOUNT(fd) (((int *)(fd))[-1]) +#define FNTSIZE(fd) (((int *)(fd))[-2]) +#define FNTSUM(fd) (((int *)(fd))[-4]) + static struct font_data *to_font_data_struct(font_data_t *fd) { return container_of(fd, struct font_data, data[0]); -- cgit v1.2.3 From 2e67fabd8b77c4f482df9b211bca1b495c6c2c24 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:49 +0000 Subject: ring-buffer: Introduce ring-buffer remotes Add ring-buffer remotes to support entities outside of the kernel (such as firmware or a hypervisor) that writes events into a ring-buffer using the tracefs format Require a description of the ring-buffer pages (struct trace_buffer_desc) and callbacks (swap_reader_page and reset) to set up the ring-buffer on the kernel side. Expect the remote entity to maintain and update the meta-page. Link: https://patch.msgid.link/20260309162516.2623589-4-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer.h | 58 +++++++++++ kernel/trace/ring_buffer.c | 233 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 283 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index d862fa610270..994f52b34344 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -251,4 +251,62 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu); int ring_buffer_unmap(struct trace_buffer *buffer, int cpu); int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu); + +struct ring_buffer_desc { + int cpu; + unsigned int nr_page_va; /* excludes the meta page */ + unsigned long meta_va; + unsigned long page_va[] __counted_by(nr_page_va); +}; + +struct trace_buffer_desc { + int nr_cpus; + size_t struct_len; + char __data[]; /* list of ring_buffer_desc */ +}; + +static inline struct ring_buffer_desc *__next_ring_buffer_desc(struct ring_buffer_desc *desc) +{ + size_t len = struct_size(desc, page_va, desc->nr_page_va); + + return (struct ring_buffer_desc *)((void *)desc + len); +} + +static inline struct ring_buffer_desc *__first_ring_buffer_desc(struct trace_buffer_desc *desc) +{ + return (struct ring_buffer_desc *)(&desc->__data[0]); +} + +static inline size_t trace_buffer_desc_size(size_t buffer_size, unsigned int nr_cpus) +{ + unsigned int nr_pages = max(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 2UL) + 1; + struct ring_buffer_desc *rbdesc; + + return size_add(offsetof(struct trace_buffer_desc, __data), + size_mul(nr_cpus, struct_size(rbdesc, page_va, nr_pages))); +} + +#define for_each_ring_buffer_desc(__pdesc, __cpu, __trace_pdesc) \ + for (__pdesc = __first_ring_buffer_desc(__trace_pdesc), __cpu = 0; \ + (__cpu) < (__trace_pdesc)->nr_cpus; \ + (__cpu)++, __pdesc = __next_ring_buffer_desc(__pdesc)) + +struct ring_buffer_remote { + struct trace_buffer_desc *desc; + int (*swap_reader_page)(unsigned int cpu, void *priv); + int (*reset)(unsigned int cpu, void *priv); + void *priv; +}; + +int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu); + +struct trace_buffer * +__ring_buffer_alloc_remote(struct ring_buffer_remote *remote, + struct lock_class_key *key); + +#define ring_buffer_alloc_remote(remote) \ +({ \ + static struct lock_class_key __key; \ + __ring_buffer_alloc_remote(remote, &__key); \ +}) #endif /* _LINUX_RING_BUFFER_H */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3d2804a7e8ab..88218377fa29 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -559,6 +559,8 @@ struct ring_buffer_per_cpu { struct trace_buffer_meta *meta_page; struct ring_buffer_cpu_meta *ring_meta; + struct ring_buffer_remote *remote; + /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ @@ -581,6 +583,8 @@ struct trace_buffer { struct ring_buffer_per_cpu **buffers; + struct ring_buffer_remote *remote; + struct hlist_node node; u64 (*clock)(void); @@ -2238,6 +2242,40 @@ static void rb_meta_buffer_update(struct ring_buffer_per_cpu *cpu_buffer, } } +static struct ring_buffer_desc *ring_buffer_desc(struct trace_buffer_desc *trace_desc, int cpu) +{ + struct ring_buffer_desc *desc, *end; + size_t len; + int i; + + if (!trace_desc) + return NULL; + + if (cpu >= trace_desc->nr_cpus) + return NULL; + + end = (struct ring_buffer_desc *)((void *)trace_desc + trace_desc->struct_len); + desc = __first_ring_buffer_desc(trace_desc); + len = struct_size(desc, page_va, desc->nr_page_va); + desc = (struct ring_buffer_desc *)((void *)desc + (len * cpu)); + + if (desc < end && desc->cpu == cpu) + return desc; + + /* Missing CPUs, need to linear search */ + for_each_ring_buffer_desc(desc, i, trace_desc) { + if (desc->cpu == cpu) + return desc; + } + + return NULL; +} + +static void *ring_buffer_desc_page(struct ring_buffer_desc *desc, int page_id) +{ + return page_id > desc->nr_page_va ? NULL : (void *)desc->page_va[page_id]; +} + static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, long nr_pages, struct list_head *pages) { @@ -2245,6 +2283,7 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_cpu_meta *meta = NULL; struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; + struct ring_buffer_desc *desc = NULL; long i; /* @@ -2273,6 +2312,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, if (buffer->range_addr_start) meta = rb_range_meta(buffer, nr_pages, cpu_buffer->cpu); + if (buffer->remote) { + desc = ring_buffer_desc(buffer->remote->desc, cpu_buffer->cpu); + if (!desc || WARN_ON(desc->nr_page_va != (nr_pages + 1))) + return -EINVAL; + } + for (i = 0; i < nr_pages; i++) { bpage = alloc_cpu_page(cpu_buffer->cpu); @@ -2297,6 +2342,16 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; bpage->id = i + 1; + } else if (desc) { + void *p = ring_buffer_desc_page(desc, i + 1); + + if (WARN_ON(!p)) + goto free_pages; + + bpage->page = p; + bpage->range = 1; /* bpage->page can't be freed */ + bpage->id = i + 1; + cpu_buffer->subbuf_ids[i + 1] = bpage; } else { int order = cpu_buffer->buffer->subbuf_order; bpage->page = alloc_cpu_data(cpu_buffer->cpu, order); @@ -2394,6 +2449,30 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) if (cpu_buffer->ring_meta->head_buffer) rb_meta_buffer_update(cpu_buffer, bpage); bpage->range = 1; + } else if (buffer->remote) { + struct ring_buffer_desc *desc = ring_buffer_desc(buffer->remote->desc, cpu); + + if (!desc) + goto fail_free_reader; + + cpu_buffer->remote = buffer->remote; + cpu_buffer->meta_page = (struct trace_buffer_meta *)(void *)desc->meta_va; + cpu_buffer->nr_pages = nr_pages; + cpu_buffer->subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, + sizeof(*cpu_buffer->subbuf_ids), GFP_KERNEL); + if (!cpu_buffer->subbuf_ids) + goto fail_free_reader; + + /* Remote buffers are read-only and immutable */ + atomic_inc(&cpu_buffer->record_disabled); + atomic_inc(&cpu_buffer->resize_disabled); + + bpage->page = ring_buffer_desc_page(desc, cpu_buffer->meta_page->reader.id); + if (!bpage->page) + goto fail_free_reader; + + bpage->range = 1; + cpu_buffer->subbuf_ids[0] = bpage; } else { int order = cpu_buffer->buffer->subbuf_order; bpage->page = alloc_cpu_data(cpu, order); @@ -2453,6 +2532,9 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) irq_work_sync(&cpu_buffer->irq_work.work); + if (cpu_buffer->remote) + kfree(cpu_buffer->subbuf_ids); + free_buffer_page(cpu_buffer->reader_page); if (head) { @@ -2475,7 +2557,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, int order, unsigned long start, unsigned long end, unsigned long scratch_size, - struct lock_class_key *key) + struct lock_class_key *key, + struct ring_buffer_remote *remote) { struct trace_buffer *buffer __free(kfree) = NULL; long nr_pages; @@ -2515,6 +2598,8 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, if (!buffer->buffers) goto fail_free_cpumask; + cpu = raw_smp_processor_id(); + /* If start/end are specified, then that overrides size */ if (start && end) { unsigned long buffers_start; @@ -2570,6 +2655,15 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, buffer->range_addr_end = end; rb_range_meta_init(buffer, nr_pages, scratch_size); + } else if (remote) { + struct ring_buffer_desc *desc = ring_buffer_desc(remote->desc, cpu); + + buffer->remote = remote; + /* The writer is remote. This ring-buffer is read-only */ + atomic_inc(&buffer->record_disabled); + nr_pages = desc->nr_page_va - 1; + if (nr_pages < 2) + goto fail_free_buffers; } else { /* need at least two pages */ @@ -2578,7 +2672,6 @@ static struct trace_buffer *alloc_buffer(unsigned long size, unsigned flags, nr_pages = 2; } - cpu = raw_smp_processor_id(); cpumask_set_cpu(cpu, buffer->cpumask); buffer->buffers[cpu] = rb_allocate_cpu_buffer(buffer, nr_pages, cpu); if (!buffer->buffers[cpu]) @@ -2620,7 +2713,7 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { /* Default buffer page size - one system page */ - return alloc_buffer(size, flags, 0, 0, 0, 0, key); + return alloc_buffer(size, flags, 0, 0, 0, 0, key, NULL); } EXPORT_SYMBOL_GPL(__ring_buffer_alloc); @@ -2647,7 +2740,18 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flag struct lock_class_key *key) { return alloc_buffer(size, flags, order, start, start + range_size, - scratch_size, key); + scratch_size, key, NULL); +} + +/** + * __ring_buffer_alloc_remote - allocate a new ring_buffer from a remote + * @remote: Contains a description of the ring-buffer pages and remote callbacks. + * @key: ring buffer reader_lock_key. + */ +struct trace_buffer *__ring_buffer_alloc_remote(struct ring_buffer_remote *remote, + struct lock_class_key *key) +{ + return alloc_buffer(0, 0, 0, 0, 0, 0, key, remote); } void *ring_buffer_meta_scratch(struct trace_buffer *buffer, unsigned int *size) @@ -5274,6 +5378,16 @@ unsigned long ring_buffer_overruns(struct trace_buffer *buffer) } EXPORT_SYMBOL_GPL(ring_buffer_overruns); +static bool rb_read_remote_meta_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + local_set(&cpu_buffer->entries, READ_ONCE(cpu_buffer->meta_page->entries)); + local_set(&cpu_buffer->overrun, READ_ONCE(cpu_buffer->meta_page->overrun)); + local_set(&cpu_buffer->pages_touched, READ_ONCE(cpu_buffer->meta_page->pages_touched)); + local_set(&cpu_buffer->pages_lost, READ_ONCE(cpu_buffer->meta_page->pages_lost)); + + return rb_num_of_entries(cpu_buffer); +} + static void rb_iter_reset(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; @@ -5428,7 +5542,43 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, } static struct buffer_page * -rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +__rb_get_reader_page_from_remote(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *new_reader, *prev_reader; + + if (!rb_read_remote_meta_page(cpu_buffer)) + return NULL; + + /* More to read on the reader page */ + if (cpu_buffer->reader_page->read < rb_page_size(cpu_buffer->reader_page)) { + if (!cpu_buffer->reader_page->read) + cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; + return cpu_buffer->reader_page; + } + + prev_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id]; + + WARN_ON_ONCE(cpu_buffer->remote->swap_reader_page(cpu_buffer->cpu, + cpu_buffer->remote->priv)); + /* nr_pages doesn't include the reader page */ + if (WARN_ON_ONCE(cpu_buffer->meta_page->reader.id > cpu_buffer->nr_pages)) + return NULL; + + new_reader = cpu_buffer->subbuf_ids[cpu_buffer->meta_page->reader.id]; + + WARN_ON_ONCE(prev_reader == new_reader); + + cpu_buffer->reader_page->page = new_reader->page; + cpu_buffer->reader_page->id = new_reader->id; + cpu_buffer->reader_page->read = 0; + cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; + cpu_buffer->lost_events = cpu_buffer->meta_page->reader.lost_events; + + return rb_page_size(cpu_buffer->reader_page) ? cpu_buffer->reader_page : NULL; +} + +static struct buffer_page * +__rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *reader = NULL; unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size); @@ -5598,6 +5748,13 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) return reader; } +static struct buffer_page * +rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + return cpu_buffer->remote ? __rb_get_reader_page_from_remote(cpu_buffer) : + __rb_get_reader_page(cpu_buffer); +} + static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_event *event; @@ -5998,7 +6155,7 @@ ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags) struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; - if (!cpumask_test_cpu(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->remote) return NULL; iter = kzalloc_obj(*iter, flags); @@ -6166,6 +6323,23 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) { struct buffer_page *page; + if (cpu_buffer->remote) { + if (!cpu_buffer->remote->reset) + return; + + cpu_buffer->remote->reset(cpu_buffer->cpu, cpu_buffer->remote->priv); + rb_read_remote_meta_page(cpu_buffer); + + /* Read related values, not covered by the meta-page */ + local_set(&cpu_buffer->pages_read, 0); + cpu_buffer->read = 0; + cpu_buffer->read_bytes = 0; + cpu_buffer->last_overrun = 0; + cpu_buffer->reader_page->read = 0; + + return; + } + rb_head_page_deactivate(cpu_buffer); cpu_buffer->head_page @@ -6396,6 +6570,46 @@ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); +int ring_buffer_poll_remote(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (cpu != RING_BUFFER_ALL_CPUS) { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + + cpu_buffer = buffer->buffers[cpu]; + + guard(raw_spinlock)(&cpu_buffer->reader_lock); + if (rb_read_remote_meta_page(cpu_buffer)) + rb_wakeups(buffer, cpu_buffer); + + return 0; + } + + guard(cpus_read_lock)(); + + /* + * Make sure all the ring buffers are up to date before we start reading + * them. + */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + guard(raw_spinlock)(&cpu_buffer->reader_lock); + rb_read_remote_meta_page(cpu_buffer); + } + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + + if (rb_num_of_entries(cpu_buffer)) + rb_wakeups(buffer, cpu_buffer); + } + + return 0; +} + #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP /** * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers @@ -6634,6 +6848,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, unsigned int commit; unsigned int read; u64 save_timestamp; + bool force_memcpy; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return -1; @@ -6671,6 +6886,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer, /* Check if any events were dropped */ missed_events = cpu_buffer->lost_events; + force_memcpy = cpu_buffer->mapped || cpu_buffer->remote; + /* * If this page has been partially read or * if len is not big enough to read the rest of the page or @@ -6680,7 +6897,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer, */ if (read || (len < (commit - read)) || cpu_buffer->reader_page == cpu_buffer->commit_page || - cpu_buffer->mapped) { + force_memcpy) { struct buffer_data_page *rpage = cpu_buffer->reader_page->page; unsigned int rpos = read; unsigned int pos = 0; @@ -7259,7 +7476,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu, unsigned long flags; int err; - if (!cpumask_test_cpu(cpu, buffer->cpumask)) + if (!cpumask_test_cpu(cpu, buffer->cpumask) || buffer->remote) return -EINVAL; cpu_buffer = buffer->buffers[cpu]; -- cgit v1.2.3 From 96e43537af5461b26f50904c6055046ba65d742f Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:51 +0000 Subject: tracing: Introduce trace remotes A trace remote relies on ring-buffer remotes to read and control compatible tracing buffers, written by entity such as firmware or hypervisor. Add a Tracefs directory remotes/ that contains all instances of trace remotes. Each instance follows the same hierarchy as any other to ease the support by existing user-space tools. This currently does not provide any event support, which will come later. Link: https://patch.msgid.link/20260309162516.2623589-6-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote.h | 36 +++ kernel/trace/Kconfig | 3 + kernel/trace/Makefile | 1 + kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 6 + kernel/trace/trace_remote.c | 619 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 666 insertions(+), 1 deletion(-) create mode 100644 include/linux/trace_remote.h create mode 100644 kernel/trace/trace_remote.c (limited to 'include/linux') diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h new file mode 100644 index 000000000000..65b7e7b8267c --- /dev/null +++ b/include/linux/trace_remote.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_TRACE_REMOTE_H +#define _LINUX_TRACE_REMOTE_H + +#include + +/** + * struct trace_remote_callbacks - Callbacks used by Tracefs to control the remote + * @load_trace_buffer: Called before Tracefs accesses the trace buffer for the first + * time. Must return a &trace_buffer_desc + * (most likely filled with trace_remote_alloc_buffer()) + * @unload_trace_buffer: + * Called once Tracefs has no use for the trace buffer + * (most likely call trace_remote_free_buffer()) + * @enable_tracing: Called on Tracefs tracing_on. It is expected from the + * remote to allow writing. + * @swap_reader_page: Called when Tracefs consumes a new page from a + * ring-buffer. It is expected from the remote to isolate a + * new reader-page from the @cpu ring-buffer. + */ +struct trace_remote_callbacks { + struct trace_buffer_desc *(*load_trace_buffer)(unsigned long size, void *priv); + void (*unload_trace_buffer)(struct trace_buffer_desc *desc, void *priv); + int (*enable_tracing)(bool enable, void *priv); + int (*swap_reader_page)(unsigned int cpu, void *priv); +}; + +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv); + +int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size, + const struct cpumask *cpumask); + +void trace_remote_free_buffer(struct trace_buffer_desc *desc); + +#endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 49de13cae428..384dd36c8e29 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1281,4 +1281,7 @@ config HIST_TRIGGERS_DEBUG source "kernel/trace/rv/Kconfig" +config TRACE_REMOTE + bool + endif # FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 04096c21d06b..318923ce39f5 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -128,4 +128,5 @@ obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o obj-$(CONFIG_RV) += rv/ +obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ebd996f8710e..e33cb3c39f6e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8589,7 +8589,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) return tr->percpu_dir; } -static struct dentry * +struct dentry * trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, void *data, long cpu, const struct file_operations *fops) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b8f3804586a0..d6af4405b341 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -689,6 +689,12 @@ struct dentry *trace_create_file(const char *name, struct dentry *parent, void *data, const struct file_operations *fops); +struct dentry *trace_create_cpu_file(const char *name, + umode_t mode, + struct dentry *parent, + void *data, + long cpu, + const struct file_operations *fops); /** diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c new file mode 100644 index 000000000000..8b06f730376e --- /dev/null +++ b/kernel/trace/trace_remote.c @@ -0,0 +1,619 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 - Google LLC + * Author: Vincent Donnefort + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +#define TRACEFS_DIR "remotes" +#define TRACEFS_MODE_WRITE 0640 +#define TRACEFS_MODE_READ 0440 + +struct trace_remote_iterator { + struct trace_remote *remote; + struct trace_seq seq; + struct delayed_work poll_work; + unsigned long lost_events; + u64 ts; + int cpu; + int evt_cpu; +}; + +struct trace_remote { + struct trace_remote_callbacks *cbs; + void *priv; + struct trace_buffer *trace_buffer; + struct trace_buffer_desc *trace_buffer_desc; + unsigned long trace_buffer_size; + struct ring_buffer_remote rb_remote; + struct mutex lock; + unsigned int nr_readers; + unsigned int poll_ms; + bool tracing_on; +}; + +static bool trace_remote_loaded(struct trace_remote *remote) +{ + return !!remote->trace_buffer; +} + +static int trace_remote_load(struct trace_remote *remote) +{ + struct ring_buffer_remote *rb_remote = &remote->rb_remote; + struct trace_buffer_desc *desc; + + lockdep_assert_held(&remote->lock); + + if (trace_remote_loaded(remote)) + return 0; + + desc = remote->cbs->load_trace_buffer(remote->trace_buffer_size, remote->priv); + if (IS_ERR(desc)) + return PTR_ERR(desc); + + rb_remote->desc = desc; + rb_remote->swap_reader_page = remote->cbs->swap_reader_page; + rb_remote->priv = remote->priv; + remote->trace_buffer = ring_buffer_alloc_remote(rb_remote); + if (!remote->trace_buffer) { + remote->cbs->unload_trace_buffer(desc, remote->priv); + return -ENOMEM; + } + + remote->trace_buffer_desc = desc; + + return 0; +} + +static void trace_remote_try_unload(struct trace_remote *remote) +{ + lockdep_assert_held(&remote->lock); + + if (!trace_remote_loaded(remote)) + return; + + /* The buffer is being read or writable */ + if (remote->nr_readers || remote->tracing_on) + return; + + /* The buffer has readable data */ + if (!ring_buffer_empty(remote->trace_buffer)) + return; + + ring_buffer_free(remote->trace_buffer); + remote->trace_buffer = NULL; + remote->cbs->unload_trace_buffer(remote->trace_buffer_desc, remote->priv); +} + +static int trace_remote_enable_tracing(struct trace_remote *remote) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (remote->tracing_on) + return 0; + + ret = trace_remote_load(remote); + if (ret) + return ret; + + ret = remote->cbs->enable_tracing(true, remote->priv); + if (ret) { + trace_remote_try_unload(remote); + return ret; + } + + remote->tracing_on = true; + + return 0; +} + +static int trace_remote_disable_tracing(struct trace_remote *remote) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (!remote->tracing_on) + return 0; + + ret = remote->cbs->enable_tracing(false, remote->priv); + if (ret) + return ret; + + ring_buffer_poll_remote(remote->trace_buffer, RING_BUFFER_ALL_CPUS); + remote->tracing_on = false; + trace_remote_try_unload(remote); + + return 0; +} + +static ssize_t +tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_remote *remote = filp->private_data; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + guard(mutex)(&remote->lock); + + ret = val ? trace_remote_enable_tracing(remote) : trace_remote_disable_tracing(remote); + if (ret) + return ret; + + return cnt; +} +static int tracing_on_show(struct seq_file *s, void *unused) +{ + struct trace_remote *remote = s->private; + + seq_printf(s, "%d\n", remote->tracing_on); + + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(tracing_on); + +static ssize_t buffer_size_kb_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct trace_remote *remote = filp->private_data; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* KiB to Bytes */ + if (!val || check_shl_overflow(val, 10, &val)) + return -EINVAL; + + guard(mutex)(&remote->lock); + + if (trace_remote_loaded(remote)) + return -EBUSY; + + remote->trace_buffer_size = val; + + return cnt; +} + +static int buffer_size_kb_show(struct seq_file *s, void *unused) +{ + struct trace_remote *remote = s->private; + + seq_printf(s, "%lu (%s)\n", remote->trace_buffer_size >> 10, + trace_remote_loaded(remote) ? "loaded" : "unloaded"); + + return 0; +} +DEFINE_SHOW_STORE_ATTRIBUTE(buffer_size_kb); + +static int trace_remote_get(struct trace_remote *remote, int cpu) +{ + int ret; + + if (remote->nr_readers == UINT_MAX) + return -EBUSY; + + ret = trace_remote_load(remote); + if (ret) + return ret; + + remote->nr_readers++; + + return 0; +} + +static void trace_remote_put(struct trace_remote *remote) +{ + if (WARN_ON(!remote->nr_readers)) + return; + + remote->nr_readers--; + if (remote->nr_readers) + return; + + trace_remote_try_unload(remote); +} + +static void __poll_remote(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct trace_remote_iterator *iter; + + iter = container_of(dwork, struct trace_remote_iterator, poll_work); + ring_buffer_poll_remote(iter->remote->trace_buffer, iter->cpu); + schedule_delayed_work((struct delayed_work *)work, + msecs_to_jiffies(iter->remote->poll_ms)); +} + +static struct trace_remote_iterator *trace_remote_iter(struct trace_remote *remote, int cpu) +{ + struct trace_remote_iterator *iter = NULL; + int ret; + + lockdep_assert_held(&remote->lock); + + + ret = trace_remote_get(remote, cpu); + if (ret) + return ERR_PTR(ret); + + /* Test the CPU */ + ret = ring_buffer_poll_remote(remote->trace_buffer, cpu); + if (ret) + goto err; + + iter = kzalloc_obj(*iter); + if (iter) { + iter->remote = remote; + iter->cpu = cpu; + trace_seq_init(&iter->seq); + INIT_DELAYED_WORK(&iter->poll_work, __poll_remote); + schedule_delayed_work(&iter->poll_work, msecs_to_jiffies(remote->poll_ms)); + + return iter; + } + ret = -ENOMEM; + +err: + kfree(iter); + trace_remote_put(remote); + + return ERR_PTR(ret); +} + +static void trace_remote_iter_free(struct trace_remote_iterator *iter) +{ + struct trace_remote *remote; + + if (!iter) + return; + + remote = iter->remote; + + lockdep_assert_held(&remote->lock); + + kfree(iter); + trace_remote_put(remote); +} + +static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter) +{ + struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + int cpu = iter->cpu; + + if (cpu != RING_BUFFER_ALL_CPUS) { + if (ring_buffer_empty_cpu(trace_buffer, cpu)) + return false; + + if (!ring_buffer_peek(trace_buffer, cpu, &iter->ts, &iter->lost_events)) + return false; + + iter->evt_cpu = cpu; + return true; + } + + iter->ts = U64_MAX; + for_each_possible_cpu(cpu) { + unsigned long lost_events; + u64 ts; + + if (ring_buffer_empty_cpu(trace_buffer, cpu)) + continue; + + if (!ring_buffer_peek(trace_buffer, cpu, &ts, &lost_events)) + continue; + + if (ts >= iter->ts) + continue; + + iter->ts = ts; + iter->evt_cpu = cpu; + iter->lost_events = lost_events; + } + + return iter->ts != U64_MAX; +} + +static int trace_remote_iter_print_event(struct trace_remote_iterator *iter) +{ + unsigned long usecs_rem; + u64 ts = iter->ts; + + if (iter->lost_events) + trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->evt_cpu, iter->lost_events); + + do_div(ts, 1000); + usecs_rem = do_div(ts, USEC_PER_SEC); + + trace_seq_printf(&iter->seq, "[%03d]\t%5llu.%06lu: ", iter->evt_cpu, + ts, usecs_rem); + + return trace_seq_has_overflowed(&iter->seq) ? -EOVERFLOW : 0; +} + +static int trace_pipe_open(struct inode *inode, struct file *filp) +{ + struct trace_remote *remote = inode->i_private; + struct trace_remote_iterator *iter; + int cpu = RING_BUFFER_ALL_CPUS; + + if (inode->i_cdev) + cpu = (long)inode->i_cdev - 1; + + guard(mutex)(&remote->lock); + iter = trace_remote_iter(remote, cpu); + filp->private_data = iter; + + return IS_ERR(iter) ? PTR_ERR(iter) : 0; +} + +static int trace_pipe_release(struct inode *inode, struct file *filp) +{ + struct trace_remote_iterator *iter = filp->private_data; + struct trace_remote *remote = iter->remote; + + guard(mutex)(&remote->lock); + + trace_remote_iter_free(iter); + + return 0; +} + +static ssize_t trace_pipe_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct trace_remote_iterator *iter = filp->private_data; + struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + int ret; + +copy_to_user: + ret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (ret != -EBUSY) + return ret; + + trace_seq_init(&iter->seq); + + ret = ring_buffer_wait(trace_buffer, iter->cpu, 0, NULL, NULL); + if (ret < 0) + return ret; + + while (trace_remote_iter_read_event(iter)) { + int prev_len = iter->seq.seq.len; + + if (trace_remote_iter_print_event(iter)) { + iter->seq.seq.len = prev_len; + break; + } + + ring_buffer_consume(trace_buffer, iter->evt_cpu, NULL, NULL); + } + + goto copy_to_user; +} + +static const struct file_operations trace_pipe_fops = { + .open = trace_pipe_open, + .read = trace_pipe_read, + .release = trace_pipe_release, +}; + +static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote) +{ + struct dentry *remote_d, *percpu_d, *d; + static struct dentry *root; + static DEFINE_MUTEX(lock); + bool root_inited = false; + int cpu; + + guard(mutex)(&lock); + + if (!root) { + root = tracefs_create_dir(TRACEFS_DIR, NULL); + if (!root) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"\n"); + return -ENOMEM; + } + root_inited = true; + } + + remote_d = tracefs_create_dir(name, root); + if (!remote_d) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/\n", name); + goto err; + } + + d = trace_create_file("tracing_on", TRACEFS_MODE_WRITE, remote_d, remote, &tracing_on_fops); + if (!d) + goto err; + + d = trace_create_file("buffer_size_kb", TRACEFS_MODE_WRITE, remote_d, remote, + &buffer_size_kb_fops); + if (!d) + goto err; + + d = trace_create_file("trace_pipe", TRACEFS_MODE_READ, remote_d, remote, &trace_pipe_fops); + if (!d) + goto err; + + percpu_d = tracefs_create_dir("per_cpu", remote_d); + if (!percpu_d) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/per_cpu/\n", name); + goto err; + } + + for_each_possible_cpu(cpu) { + struct dentry *cpu_d; + char cpu_name[16]; + + snprintf(cpu_name, sizeof(cpu_name), "cpu%d", cpu); + cpu_d = tracefs_create_dir(cpu_name, percpu_d); + if (!cpu_d) { + pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/percpu/cpu%d\n", + name, cpu); + goto err; + } + + d = trace_create_cpu_file("trace_pipe", TRACEFS_MODE_READ, cpu_d, remote, cpu, + &trace_pipe_fops); + if (!d) + goto err; + } + + return 0; + +err: + if (root_inited) { + tracefs_remove(root); + root = NULL; + } else { + tracefs_remove(remote_d); + } + + return -ENOMEM; +} + +/** + * trace_remote_register() - Register a Tracefs remote + * @name: Name of the remote, used for the Tracefs remotes/ directory. + * @cbs: Set of callbacks used to control the remote. + * @priv: Private data, passed to each callback from @cbs. + * @events: Array of events. &remote_event.name and &remote_event.id must be + * filled by the caller. + * @nr_events: Number of events in the @events array. + * + * A trace remote is an entity, outside of the kernel (most likely firmware or + * hypervisor) capable of writing events into a Tracefs compatible ring-buffer. + * The kernel would then act as a reader. + * + * The registered remote will be found under the Tracefs directory + * remotes/. + * + * Return: 0 on success, negative error code on failure. + */ +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv) +{ + struct trace_remote *remote; + + remote = kzalloc_obj(*remote); + if (!remote) + return -ENOMEM; + + remote->cbs = cbs; + remote->priv = priv; + remote->trace_buffer_size = 7 << 10; + remote->poll_ms = 100; + mutex_init(&remote->lock); + + if (trace_remote_init_tracefs(name, remote)) { + kfree(remote); + return -ENOMEM; + } + + return 0; +} +EXPORT_SYMBOL_GPL(trace_remote_register); + +/** + * trace_remote_free_buffer() - Free trace buffer allocated with trace_remote_alloc_buffer() + * @desc: Descriptor of the per-CPU ring-buffers, originally filled by + * trace_remote_alloc_buffer() + * + * Most likely called from &trace_remote_callbacks.unload_trace_buffer. + */ +void trace_remote_free_buffer(struct trace_buffer_desc *desc) +{ + struct ring_buffer_desc *rb_desc; + int cpu; + + for_each_ring_buffer_desc(rb_desc, cpu, desc) { + unsigned int id; + + free_page(rb_desc->meta_va); + + for (id = 0; id < rb_desc->nr_page_va; id++) + free_page(rb_desc->page_va[id]); + } +} +EXPORT_SYMBOL_GPL(trace_remote_free_buffer); + +/** + * trace_remote_alloc_buffer() - Dynamically allocate a trace buffer + * @desc: Uninitialized trace_buffer_desc + * @desc_size: Size of the trace_buffer_desc. Must be at least equal to + * trace_buffer_desc_size() + * @buffer_size: Size in bytes of each per-CPU ring-buffer + * @cpumask: CPUs to allocate a ring-buffer for + * + * Helper to dynamically allocate a set of pages (enough to cover @buffer_size) + * for each CPU from @cpumask and fill @desc. Most likely called from + * &trace_remote_callbacks.load_trace_buffer. + * + * Return: 0 on success, negative error code on failure. + */ +int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size, + const struct cpumask *cpumask) +{ + unsigned int nr_pages = max(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 2UL) + 1; + void *desc_end = desc + desc_size; + struct ring_buffer_desc *rb_desc; + int cpu, ret = -ENOMEM; + + if (desc_size < struct_size(desc, __data, 0)) + return -EINVAL; + + desc->nr_cpus = 0; + desc->struct_len = struct_size(desc, __data, 0); + + rb_desc = (struct ring_buffer_desc *)&desc->__data[0]; + + for_each_cpu(cpu, cpumask) { + unsigned int id; + + if ((void *)rb_desc + struct_size(rb_desc, page_va, nr_pages) > desc_end) { + ret = -EINVAL; + goto err; + } + + rb_desc->cpu = cpu; + rb_desc->nr_page_va = 0; + rb_desc->meta_va = (unsigned long)__get_free_page(GFP_KERNEL); + if (!rb_desc->meta_va) + goto err; + + for (id = 0; id < nr_pages; id++) { + rb_desc->page_va[id] = (unsigned long)__get_free_page(GFP_KERNEL); + if (!rb_desc->page_va[id]) + goto err; + + rb_desc->nr_page_va++; + } + desc->nr_cpus++; + desc->struct_len += offsetof(struct ring_buffer_desc, page_va); + desc->struct_len += struct_size(rb_desc, page_va, rb_desc->nr_page_va); + rb_desc = __next_ring_buffer_desc(rb_desc); + } + + return 0; + +err: + trace_remote_free_buffer(desc); + return ret; +} +EXPORT_SYMBOL_GPL(trace_remote_alloc_buffer); -- cgit v1.2.3 From 9af4ab0e11e336e2671d303ffcc6578e3546d9fc Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:52 +0000 Subject: tracing: Add reset to trace remotes Allow to reset the trace remote buffer by writing to the Tracefs "trace" file. This is similar to the regular Tracefs interface. Link: https://patch.msgid.link/20260309162516.2623589-7-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote.h | 3 +++ kernel/trace/trace_remote.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'include/linux') diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h index 65b7e7b8267c..10ca03dc192b 100644 --- a/include/linux/trace_remote.h +++ b/include/linux/trace_remote.h @@ -17,6 +17,8 @@ * remote to allow writing. * @swap_reader_page: Called when Tracefs consumes a new page from a * ring-buffer. It is expected from the remote to isolate a + * @reset: Called on `echo 0 > trace`. It is expected from the + * remote to reset all ring-buffer pages. * new reader-page from the @cpu ring-buffer. */ struct trace_remote_callbacks { @@ -24,6 +26,7 @@ struct trace_remote_callbacks { void (*unload_trace_buffer)(struct trace_buffer_desc *desc, void *priv); int (*enable_tracing)(bool enable, void *priv); int (*swap_reader_page)(unsigned int cpu, void *priv); + int (*reset)(unsigned int cpu, void *priv); }; int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv); diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c index 8b06f730376e..a7b94736dd38 100644 --- a/kernel/trace/trace_remote.c +++ b/kernel/trace/trace_remote.c @@ -63,6 +63,7 @@ static int trace_remote_load(struct trace_remote *remote) rb_remote->desc = desc; rb_remote->swap_reader_page = remote->cbs->swap_reader_page; rb_remote->priv = remote->priv; + rb_remote->reset = remote->cbs->reset; remote->trace_buffer = ring_buffer_alloc_remote(rb_remote); if (!remote->trace_buffer) { remote->cbs->unload_trace_buffer(desc, remote->priv); @@ -138,6 +139,21 @@ static int trace_remote_disable_tracing(struct trace_remote *remote) return 0; } +static void trace_remote_reset(struct trace_remote *remote, int cpu) +{ + lockdep_assert_held(&remote->lock); + + if (!trace_remote_loaded(remote)) + return; + + if (cpu == RING_BUFFER_ALL_CPUS) + ring_buffer_reset(remote->trace_buffer); + else + ring_buffer_reset_cpu(remote->trace_buffer, cpu); + + trace_remote_try_unload(remote); +} + static ssize_t tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -414,6 +430,26 @@ static const struct file_operations trace_pipe_fops = { .release = trace_pipe_release, }; +static ssize_t trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct inode *inode = file_inode(filp); + struct trace_remote *remote = inode->i_private; + int cpu = RING_BUFFER_ALL_CPUS; + + if (inode->i_cdev) + cpu = (long)inode->i_cdev - 1; + + guard(mutex)(&remote->lock); + + trace_remote_reset(remote, cpu); + + return cnt; +} + +static const struct file_operations trace_fops = { + .write = trace_write, +}; + static int trace_remote_init_tracefs(const char *name, struct trace_remote *remote) { struct dentry *remote_d, *percpu_d, *d; @@ -452,6 +488,10 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo if (!d) goto err; + d = trace_create_file("trace", TRACEFS_MODE_WRITE, remote_d, remote, &trace_fops); + if (!d) + goto err; + percpu_d = tracefs_create_dir("per_cpu", remote_d); if (!percpu_d) { pr_err("Failed to create tracefs dir "TRACEFS_DIR"%s/per_cpu/\n", name); @@ -474,6 +514,11 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo &trace_pipe_fops); if (!d) goto err; + + d = trace_create_cpu_file("trace", TRACEFS_MODE_WRITE, cpu_d, remote, cpu, + &trace_fops); + if (!d) + goto err; } return 0; -- cgit v1.2.3 From bf2ba0f8ca1af14aaaa765cbb93caf564d383aad Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:54 +0000 Subject: tracing: Add init callback to trace remotes Add a .init call back so the trace remote callers can add entries to the tracefs directory. Link: https://patch.msgid.link/20260309162516.2623589-9-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote.h | 4 ++++ kernel/trace/trace_remote.c | 7 ++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h index 10ca03dc192b..090c58b7d92b 100644 --- a/include/linux/trace_remote.h +++ b/include/linux/trace_remote.h @@ -3,10 +3,13 @@ #ifndef _LINUX_TRACE_REMOTE_H #define _LINUX_TRACE_REMOTE_H +#include #include /** * struct trace_remote_callbacks - Callbacks used by Tracefs to control the remote + * @init: Called once the remote has been registered. Allows the + * caller to extend the Tracefs remote directory * @load_trace_buffer: Called before Tracefs accesses the trace buffer for the first * time. Must return a &trace_buffer_desc * (most likely filled with trace_remote_alloc_buffer()) @@ -22,6 +25,7 @@ * new reader-page from the @cpu ring-buffer. */ struct trace_remote_callbacks { + int (*init)(struct dentry *d, void *priv); struct trace_buffer_desc *(*load_trace_buffer)(unsigned long size, void *priv); void (*unload_trace_buffer)(struct trace_buffer_desc *desc, void *priv); int (*enable_tracing)(bool enable, void *priv); diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c index 039ba71c3b3e..294d051dcef1 100644 --- a/kernel/trace/trace_remote.c +++ b/kernel/trace/trace_remote.c @@ -863,6 +863,7 @@ err: int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv) { struct trace_remote *remote; + int ret; remote = kzalloc_obj(*remote); if (!remote) @@ -880,7 +881,11 @@ int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, return -ENOMEM; } - return 0; + ret = cbs->init ? cbs->init(remote->dentry, priv) : 0; + if (ret) + pr_err("Init failed for trace remote '%s' (%d)\n", name, ret); + + return ret; } EXPORT_SYMBOL_GPL(trace_remote_register); -- cgit v1.2.3 From 072529158e604cc964feb78dcf094c6975828146 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:55 +0000 Subject: tracing: Add events to trace remotes An event is predefined point in the writer code that allows to log data. Following the same scheme as kernel events, add remote events, described to user-space within the events/ tracefs directory found in the corresponding trace remote. Remote events are expected to be described during the trace remote registration. Add also a .enable_event callback for trace_remote to toggle the event logging, if supported. Link: https://patch.msgid.link/20260309162516.2623589-10-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote.h | 7 +- include/linux/trace_remote_event.h | 23 ++++ kernel/trace/trace_remote.c | 264 ++++++++++++++++++++++++++++++++++++- 3 files changed, 288 insertions(+), 6 deletions(-) create mode 100644 include/linux/trace_remote_event.h (limited to 'include/linux') diff --git a/include/linux/trace_remote.h b/include/linux/trace_remote.h index 090c58b7d92b..fcd1d46ea466 100644 --- a/include/linux/trace_remote.h +++ b/include/linux/trace_remote.h @@ -5,6 +5,7 @@ #include #include +#include /** * struct trace_remote_callbacks - Callbacks used by Tracefs to control the remote @@ -23,6 +24,8 @@ * @reset: Called on `echo 0 > trace`. It is expected from the * remote to reset all ring-buffer pages. * new reader-page from the @cpu ring-buffer. + * @enable_event: Called on events/event_name/enable. It is expected from + * the remote to allow the writing event @id. */ struct trace_remote_callbacks { int (*init)(struct dentry *d, void *priv); @@ -31,9 +34,11 @@ struct trace_remote_callbacks { int (*enable_tracing)(bool enable, void *priv); int (*swap_reader_page)(unsigned int cpu, void *priv); int (*reset)(unsigned int cpu, void *priv); + int (*enable_event)(unsigned short id, bool enable, void *priv); }; -int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv); +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv, + struct remote_event *events, size_t nr_events); int trace_remote_alloc_buffer(struct trace_buffer_desc *desc, size_t desc_size, size_t buffer_size, const struct cpumask *cpumask); diff --git a/include/linux/trace_remote_event.h b/include/linux/trace_remote_event.h new file mode 100644 index 000000000000..a4449008a075 --- /dev/null +++ b/include/linux/trace_remote_event.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_TRACE_REMOTE_EVENTS_H +#define _LINUX_TRACE_REMOTE_EVENTS_H + +struct trace_remote; +struct trace_event_fields; + +struct remote_event_hdr { + unsigned short id; +}; + +#define REMOTE_EVENT_NAME_MAX 30 +struct remote_event { + char name[REMOTE_EVENT_NAME_MAX]; + unsigned short id; + bool enabled; + struct trace_remote *remote; + struct trace_event_fields *fields; + char *print_fmt; + void (*print)(void *evt, struct trace_seq *seq); +}; +#endif diff --git a/kernel/trace/trace_remote.c b/kernel/trace/trace_remote.c index 294d051dcef1..0d0af53c0ce9 100644 --- a/kernel/trace/trace_remote.c +++ b/kernel/trace/trace_remote.c @@ -31,6 +31,7 @@ struct trace_remote_iterator { u64 ts; struct ring_buffer_iter *rb_iter; struct ring_buffer_iter **rb_iters; + struct remote_event_hdr *evt; int cpu; int evt_cpu; loff_t pos; @@ -42,6 +43,10 @@ struct trace_remote { void *priv; struct trace_buffer *trace_buffer; struct trace_buffer_desc *trace_buffer_desc; + struct dentry *dentry; + struct eventfs_inode *eventfs; + struct remote_event *events; + unsigned long nr_events; unsigned long trace_buffer_size; struct ring_buffer_remote rb_remote; struct mutex lock; @@ -168,7 +173,8 @@ static void trace_remote_reset(struct trace_remote *remote, int cpu) static ssize_t tracing_on_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_remote *remote = filp->private_data; + struct seq_file *seq = filp->private_data; + struct trace_remote *remote = seq->private; unsigned long val; int ret; @@ -197,7 +203,8 @@ DEFINE_SHOW_STORE_ATTRIBUTE(tracing_on); static ssize_t buffer_size_kb_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct trace_remote *remote = filp->private_data; + struct seq_file *seq = filp->private_data; + struct trace_remote *remote = seq->private; unsigned long val; int ret; @@ -484,16 +491,19 @@ __peek_event(struct trace_remote_iterator *iter, int cpu, u64 *ts, unsigned long static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter) { struct trace_buffer *trace_buffer = iter->remote->trace_buffer; + struct ring_buffer_event *rb_evt; int cpu = iter->cpu; if (cpu != RING_BUFFER_ALL_CPUS) { if (ring_buffer_empty_cpu(trace_buffer, cpu)) return false; - if (!__peek_event(iter, cpu, &iter->ts, &iter->lost_events)) + rb_evt = __peek_event(iter, cpu, &iter->ts, &iter->lost_events); + if (!rb_evt) return false; iter->evt_cpu = cpu; + iter->evt = ring_buffer_event_data(rb_evt); return true; } @@ -505,7 +515,8 @@ static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter) if (ring_buffer_empty_cpu(trace_buffer, cpu)) continue; - if (!__peek_event(iter, cpu, &ts, &lost_events)) + rb_evt = __peek_event(iter, cpu, &ts, &lost_events); + if (!rb_evt) continue; if (ts >= iter->ts) @@ -513,6 +524,7 @@ static bool trace_remote_iter_read_event(struct trace_remote_iterator *iter) iter->ts = ts; iter->evt_cpu = cpu; + iter->evt = ring_buffer_event_data(rb_evt); iter->lost_events = lost_events; } @@ -533,8 +545,11 @@ static void trace_remote_iter_move(struct trace_remote_iterator *iter) } } +static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id); + static int trace_remote_iter_print_event(struct trace_remote_iterator *iter) { + struct remote_event *evt; unsigned long usecs_rem; u64 ts = iter->ts; @@ -548,6 +563,12 @@ static int trace_remote_iter_print_event(struct trace_remote_iterator *iter) trace_seq_printf(&iter->seq, "[%03d]\t%5llu.%06lu: ", iter->evt_cpu, ts, usecs_rem); + evt = trace_remote_find_event(iter->remote, iter->evt->id); + if (!evt) + trace_seq_printf(&iter->seq, "UNKNOWN id=%d\n", iter->evt->id); + else + evt->print(iter->evt, &iter->seq); + return trace_seq_has_overflowed(&iter->seq) ? -EOVERFLOW : 0; } @@ -829,6 +850,8 @@ static int trace_remote_init_tracefs(const char *name, struct trace_remote *remo goto err; } + remote->dentry = remote_d; + return 0; err: @@ -842,6 +865,9 @@ err: return -ENOMEM; } +static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote, + struct remote_event *events, size_t nr_events); + /** * trace_remote_register() - Register a Tracefs remote * @name: Name of the remote, used for the Tracefs remotes/ directory. @@ -860,7 +886,8 @@ err: * * Return: 0 on success, negative error code on failure. */ -int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv) +int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, void *priv, + struct remote_event *events, size_t nr_events) { struct trace_remote *remote; int ret; @@ -881,6 +908,13 @@ int trace_remote_register(const char *name, struct trace_remote_callbacks *cbs, return -ENOMEM; } + ret = trace_remote_register_events(name, remote, events, nr_events); + if (ret) { + pr_err("Failed to register events for trace remote '%s' (%d)\n", + name, ret); + return ret; + } + ret = cbs->init ? cbs->init(remote->dentry, priv) : 0; if (ret) pr_err("Init failed for trace remote '%s' (%d)\n", name, ret); @@ -976,3 +1010,223 @@ err: return ret; } EXPORT_SYMBOL_GPL(trace_remote_alloc_buffer); + +static int +trace_remote_enable_event(struct trace_remote *remote, struct remote_event *evt, bool enable) +{ + int ret; + + lockdep_assert_held(&remote->lock); + + if (evt->enabled == enable) + return 0; + + ret = remote->cbs->enable_event(evt->id, enable, remote->priv); + if (ret) + return ret; + + evt->enabled = enable; + + return 0; +} + +static int remote_event_enable_show(struct seq_file *s, void *unused) +{ + struct remote_event *evt = s->private; + + seq_printf(s, "%d\n", evt->enabled); + + return 0; +} + +static ssize_t remote_event_enable_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct seq_file *seq = filp->private_data; + struct remote_event *evt = seq->private; + struct trace_remote *remote = evt->remote; + u8 enable; + int ret; + + ret = kstrtou8_from_user(ubuf, count, 10, &enable); + if (ret) + return ret; + + guard(mutex)(&remote->lock); + + ret = trace_remote_enable_event(remote, evt, enable); + if (ret) + return ret; + + return count; +} +DEFINE_SHOW_STORE_ATTRIBUTE(remote_event_enable); + +static int remote_event_id_show(struct seq_file *s, void *unused) +{ + struct remote_event *evt = s->private; + + seq_printf(s, "%d\n", evt->id); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(remote_event_id); + +static int remote_event_format_show(struct seq_file *s, void *unused) +{ + size_t offset = sizeof(struct remote_event_hdr); + struct remote_event *evt = s->private; + struct trace_event_fields *field; + + seq_printf(s, "name: %s\n", evt->name); + seq_printf(s, "ID: %d\n", evt->id); + seq_puts(s, + "format:\n\tfield:unsigned short common_type;\toffset:0;\tsize:2;\tsigned:0;\n\n"); + + field = &evt->fields[0]; + while (field->name) { + seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%u;\tsigned:%d;\n", + field->type, field->name, offset, field->size, + field->is_signed); + offset += field->size; + field++; + } + + if (field != &evt->fields[0]) + seq_puts(s, "\n"); + + seq_printf(s, "print fmt: %s\n", evt->print_fmt); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(remote_event_format); + +static int remote_event_callback(const char *name, umode_t *mode, void **data, + const struct file_operations **fops) +{ + if (!strcmp(name, "enable")) { + *mode = TRACEFS_MODE_WRITE; + *fops = &remote_event_enable_fops; + return 1; + } + + if (!strcmp(name, "id")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_event_id_fops; + return 1; + } + + if (!strcmp(name, "format")) { + *mode = TRACEFS_MODE_READ; + *fops = &remote_event_format_fops; + return 1; + } + + return 0; +} + +static int trace_remote_init_eventfs(const char *remote_name, struct trace_remote *remote, + struct remote_event *evt) +{ + struct eventfs_inode *eventfs = remote->eventfs; + static struct eventfs_entry entries[] = { + { + .name = "enable", + .callback = remote_event_callback, + }, { + .name = "id", + .callback = remote_event_callback, + }, { + .name = "format", + .callback = remote_event_callback, + } + }; + bool eventfs_create = false; + + if (!eventfs) { + eventfs = eventfs_create_events_dir("events", remote->dentry, NULL, 0, NULL); + if (IS_ERR(eventfs)) + return PTR_ERR(eventfs); + + /* + * Create similar hierarchy as local events even if a single system is supported at + * the moment + */ + eventfs = eventfs_create_dir(remote_name, eventfs, NULL, 0, NULL); + if (IS_ERR(eventfs)) + return PTR_ERR(eventfs); + + remote->eventfs = eventfs; + eventfs_create = true; + } + + eventfs = eventfs_create_dir(evt->name, eventfs, entries, ARRAY_SIZE(entries), evt); + if (IS_ERR(eventfs)) { + if (eventfs_create) { + eventfs_remove_events_dir(remote->eventfs); + remote->eventfs = NULL; + } + return PTR_ERR(eventfs); + } + + return 0; +} + +static int trace_remote_attach_events(struct trace_remote *remote, struct remote_event *events, + size_t nr_events) +{ + int i; + + for (i = 0; i < nr_events; i++) { + struct remote_event *evt = &events[i]; + + if (evt->remote) + return -EEXIST; + + evt->remote = remote; + + /* We need events to be sorted for efficient lookup */ + if (i && evt->id <= events[i - 1].id) + return -EINVAL; + } + + remote->events = events; + remote->nr_events = nr_events; + + return 0; +} + +static int trace_remote_register_events(const char *remote_name, struct trace_remote *remote, + struct remote_event *events, size_t nr_events) +{ + int i, ret; + + ret = trace_remote_attach_events(remote, events, nr_events); + if (ret) + return ret; + + for (i = 0; i < nr_events; i++) { + struct remote_event *evt = &events[i]; + + ret = trace_remote_init_eventfs(remote_name, remote, evt); + if (ret) + pr_warn("Failed to init eventfs for event '%s' (%d)", + evt->name, ret); + } + + return 0; +} + +static int __cmp_events(const void *key, const void *data) +{ + const struct remote_event *evt = data; + int id = (int)((long)key); + + return id - (int)evt->id; +} + +static struct remote_event *trace_remote_find_event(struct trace_remote *remote, unsigned short id) +{ + return bsearch((const void *)(unsigned long)id, remote->events, remote->nr_events, + sizeof(*remote->events), __cmp_events); +} -- cgit v1.2.3 From 5f3efd1dcebc35d44cce39630ae00980a45d9247 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:57 +0000 Subject: tracing: Add helpers to create trace remote events Declaring remote events can be cumbersome let's add a set of macros to simplify developers life. The declaration of a remote event is very similar to kernel's events: REMOTE_EVENT(name, id, RE_STRUCT( re_field(u64 foo) ), RE_PRINTK("foo=%llu", __entry->foo) ) Link: https://patch.msgid.link/20260309162516.2623589-12-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_remote_event.h | 10 +++++ include/trace/define_remote_events.h | 73 ++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 include/trace/define_remote_events.h (limited to 'include/linux') diff --git a/include/linux/trace_remote_event.h b/include/linux/trace_remote_event.h index a4449008a075..c8ae1e1f5e72 100644 --- a/include/linux/trace_remote_event.h +++ b/include/linux/trace_remote_event.h @@ -5,6 +5,7 @@ struct trace_remote; struct trace_event_fields; +struct trace_seq; struct remote_event_hdr { unsigned short id; @@ -20,4 +21,13 @@ struct remote_event { char *print_fmt; void (*print)(void *evt, struct trace_seq *seq); }; + +#define RE_STRUCT(__args...) __args +#define re_field(__type, __field) __type __field; + +#define REMOTE_EVENT_FORMAT(__name, __struct) \ + struct remote_event_format_##__name { \ + struct remote_event_hdr hdr; \ + __struct \ + } #endif diff --git a/include/trace/define_remote_events.h b/include/trace/define_remote_events.h new file mode 100644 index 000000000000..676e803dc144 --- /dev/null +++ b/include/trace/define_remote_events.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include +#include + +#define REMOTE_EVENT_INCLUDE(__file) __stringify(../../__file) + +#ifdef REMOTE_EVENT_SECTION +# define __REMOTE_EVENT_SECTION(__name) __used __section(REMOTE_EVENT_SECTION"."#__name) +#else +# define __REMOTE_EVENT_SECTION(__name) +#endif + +#define REMOTE_PRINTK_COUNT_ARGS(__args...) \ + __COUNT_ARGS(, ##__args, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0) + +#define __remote_printk0() \ + trace_seq_putc(seq, '\n') + +#define __remote_printk1(__fmt) \ + trace_seq_puts(seq, " " __fmt "\n") \ + +#define __remote_printk2(__fmt, __args...) \ +do { \ + trace_seq_putc(seq, ' '); \ + trace_seq_printf(seq, __fmt, __args); \ + trace_seq_putc(seq, '\n'); \ +} while (0) + +/* Apply the appropriate trace_seq sequence according to the number of arguments */ +#define remote_printk(__args...) \ + CONCATENATE(__remote_printk, REMOTE_PRINTK_COUNT_ARGS(__args))(__args) + +#define RE_PRINTK(__args...) __args + +#define REMOTE_EVENT(__name, __id, __struct, __printk) \ + REMOTE_EVENT_FORMAT(__name, __struct); \ + static void remote_event_print_##__name(void *evt, struct trace_seq *seq) \ + { \ + struct remote_event_format_##__name __maybe_unused *__entry = evt; \ + trace_seq_puts(seq, #__name); \ + remote_printk(__printk); \ + } +#include REMOTE_EVENT_INCLUDE(REMOTE_EVENT_INCLUDE_FILE) + +#undef REMOTE_EVENT +#undef RE_PRINTK +#undef re_field +#define re_field(__type, __field) \ + { \ + .type = #__type, .name = #__field, \ + .size = sizeof(__type), .align = __alignof__(__type), \ + .is_signed = is_signed_type(__type), \ + }, +#define __entry REC +#define RE_PRINTK(__fmt, __args...) "\"" __fmt "\", " __stringify(__args) +#define REMOTE_EVENT(__name, __id, __struct, __printk) \ + static struct trace_event_fields remote_event_fields_##__name[] = { \ + __struct \ + {} \ + }; \ + static char remote_event_print_fmt_##__name[] = __printk; \ + static struct remote_event __REMOTE_EVENT_SECTION(__name) \ + remote_event_##__name = { \ + .name = #__name, \ + .id = __id, \ + .fields = remote_event_fields_##__name, \ + .print_fmt = remote_event_print_fmt_##__name, \ + .print = remote_event_print_##__name, \ + } +#include REMOTE_EVENT_INCLUDE(REMOTE_EVENT_INCLUDE_FILE) -- cgit v1.2.3 From 93ae1b76fff9e745f870a2f2cd32f472328c4a8f Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:58 +0000 Subject: ring-buffer: Export buffer_data_page and macros In preparation for allowing the writing of ring-buffer compliant pages outside of ring_buffer.c, move buffer_data_page and timestamps encoding macros into the publicly available ring_buffer_types.h. Link: https://patch.msgid.link/20260309162516.2623589-13-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/ring_buffer_types.h | 41 +++++++++++++++++++++++++++++++++++++++ kernel/trace/ring_buffer.c | 36 +--------------------------------- 2 files changed, 42 insertions(+), 35 deletions(-) create mode 100644 include/linux/ring_buffer_types.h (limited to 'include/linux') diff --git a/include/linux/ring_buffer_types.h b/include/linux/ring_buffer_types.h new file mode 100644 index 000000000000..54577021a49d --- /dev/null +++ b/include/linux/ring_buffer_types.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RING_BUFFER_TYPES_H +#define _LINUX_RING_BUFFER_TYPES_H + +#include + +#define TS_SHIFT 27 +#define TS_MASK ((1ULL << TS_SHIFT) - 1) +#define TS_DELTA_TEST (~TS_MASK) + +/* + * We need to fit the time_stamp delta into 27 bits. + */ +static inline bool test_time_stamp(u64 delta) +{ + return !!(delta & TS_DELTA_TEST); +} + +#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) + +#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) +#define RB_ALIGNMENT 4U +#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) +#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ + +#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS +# define RB_FORCE_8BYTE_ALIGNMENT 0 +# define RB_ARCH_ALIGNMENT RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT 1 +# define RB_ARCH_ALIGNMENT 8U +#endif + +#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) + +struct buffer_data_page { + u64 time_stamp; /* page time stamp */ + local_t commit; /* write committed index */ + unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ +}; +#endif diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 605142e06863..96e0d80d492b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * * Copyright (C) 2008 Steven Rostedt */ +#include #include #include #include @@ -157,23 +158,6 @@ int ring_buffer_print_entry_header(struct trace_seq *s) /* Used for individual buffers (after the counter) */ #define RB_BUFFER_OFF (1 << 20) -#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) - -#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) -#define RB_ALIGNMENT 4U -#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) -#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ - -#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS -# define RB_FORCE_8BYTE_ALIGNMENT 0 -# define RB_ARCH_ALIGNMENT RB_ALIGNMENT -#else -# define RB_FORCE_8BYTE_ALIGNMENT 1 -# define RB_ARCH_ALIGNMENT 8U -#endif - -#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) - /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -316,10 +300,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); #define for_each_online_buffer_cpu(buffer, cpu) \ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask) -#define TS_SHIFT 27 -#define TS_MASK ((1ULL << TS_SHIFT) - 1) -#define TS_DELTA_TEST (~TS_MASK) - static u64 rb_event_time_stamp(struct ring_buffer_event *event) { u64 ts; @@ -338,12 +318,6 @@ static u64 rb_event_time_stamp(struct ring_buffer_event *event) #define RB_MISSED_MASK (3 << 30) -struct buffer_data_page { - u64 time_stamp; /* page time stamp */ - local_t commit; /* write committed index */ - unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */ -}; - struct buffer_data_read_page { unsigned order; /* order of the page */ struct buffer_data_page *data; /* actual data, stored in this page */ @@ -437,14 +411,6 @@ static struct buffer_data_page *alloc_cpu_data(int cpu, int order) return dpage; } -/* - * We need to fit the time_stamp delta into 27 bits. - */ -static inline bool test_time_stamp(u64 delta) -{ - return !!(delta & TS_DELTA_TEST); -} - struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; -- cgit v1.2.3 From 34e5b958bdad0f9cf16306368bbc2dc5b2a50143 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:24:59 +0000 Subject: tracing: Introduce simple_ring_buffer Add a simple implementation of the kernel ring-buffer. This intends to be used later by ring-buffer remotes such as the pKVM hypervisor, hence the need for a cut down version (write only) without any dependency. Link: https://patch.msgid.link/20260309162516.2623589-14-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/simple_ring_buffer.h | 57 +++++ kernel/trace/Kconfig | 3 + kernel/trace/Makefile | 1 + kernel/trace/simple_ring_buffer.c | 464 +++++++++++++++++++++++++++++++++++++ 4 files changed, 525 insertions(+) create mode 100644 include/linux/simple_ring_buffer.h create mode 100644 kernel/trace/simple_ring_buffer.c (limited to 'include/linux') diff --git a/include/linux/simple_ring_buffer.h b/include/linux/simple_ring_buffer.h new file mode 100644 index 000000000000..2c4c0ae336bc --- /dev/null +++ b/include/linux/simple_ring_buffer.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SIMPLE_RING_BUFFER_H +#define _LINUX_SIMPLE_RING_BUFFER_H + +#include +#include +#include +#include + +/* + * Ideally those struct would stay private but the caller needs to know + * the allocation size for simple_ring_buffer_init(). + */ +struct simple_buffer_page { + struct list_head link; + struct buffer_data_page *page; + u64 entries; + u32 write; + u32 id; +}; + +struct simple_rb_per_cpu { + struct simple_buffer_page *tail_page; + struct simple_buffer_page *reader_page; + struct simple_buffer_page *head_page; + struct simple_buffer_page *bpages; + struct trace_buffer_meta *meta; + u32 nr_pages; + +#define SIMPLE_RB_UNAVAILABLE 0 +#define SIMPLE_RB_READY 1 +#define SIMPLE_RB_WRITING 2 + u32 status; + + u64 last_overrun; + u64 write_stamp; + + struct simple_rb_cbs *cbs; +}; + +int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc); + +void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer); + +void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, + u64 timestamp); + +void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer); + +int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable); + +int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer); + +int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer); + +#endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 384dd36c8e29..edbdd7b38f61 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1284,4 +1284,7 @@ source "kernel/trace/rv/Kconfig" config TRACE_REMOTE bool +config SIMPLE_RING_BUFFER + bool + endif # FTRACE diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 318923ce39f5..2e39b09398b3 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -129,4 +129,5 @@ obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o obj-$(CONFIG_RV) += rv/ obj-$(CONFIG_TRACE_REMOTE) += trace_remote.o +obj-$(CONFIG_SIMPLE_RING_BUFFER) += simple_ring_buffer.o libftrace-y := ftrace.o diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c new file mode 100644 index 000000000000..15df9781411b --- /dev/null +++ b/kernel/trace/simple_ring_buffer.c @@ -0,0 +1,464 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 - Google LLC + * Author: Vincent Donnefort + */ + +#include +#include + +#include +#include + +enum simple_rb_link_type { + SIMPLE_RB_LINK_NORMAL = 0, + SIMPLE_RB_LINK_HEAD = 1, + SIMPLE_RB_LINK_HEAD_MOVING +}; + +#define SIMPLE_RB_LINK_MASK ~(SIMPLE_RB_LINK_HEAD | SIMPLE_RB_LINK_HEAD_MOVING) + +static void simple_bpage_set_head_link(struct simple_buffer_page *bpage) +{ + unsigned long link = (unsigned long)bpage->link.next; + + link &= SIMPLE_RB_LINK_MASK; + link |= SIMPLE_RB_LINK_HEAD; + + /* + * Paired with simple_rb_find_head() to order access between the head + * link and overrun. It ensures we always report an up-to-date value + * after swapping the reader page. + */ + smp_store_release(&bpage->link.next, (struct list_head *)link); +} + +static bool simple_bpage_unset_head_link(struct simple_buffer_page *bpage, + struct simple_buffer_page *dst, + enum simple_rb_link_type new_type) +{ + unsigned long *link = (unsigned long *)(&bpage->link.next); + unsigned long old = (*link & SIMPLE_RB_LINK_MASK) | SIMPLE_RB_LINK_HEAD; + unsigned long new = (unsigned long)(&dst->link) | new_type; + + return try_cmpxchg(link, &old, new); +} + +static void simple_bpage_set_normal_link(struct simple_buffer_page *bpage) +{ + unsigned long link = (unsigned long)bpage->link.next; + + WRITE_ONCE(bpage->link.next, (struct list_head *)(link & SIMPLE_RB_LINK_MASK)); +} + +static struct simple_buffer_page *simple_bpage_from_link(struct list_head *link) +{ + unsigned long ptr = (unsigned long)link & SIMPLE_RB_LINK_MASK; + + return container_of((struct list_head *)ptr, struct simple_buffer_page, link); +} + +static struct simple_buffer_page *simple_bpage_next_page(struct simple_buffer_page *bpage) +{ + return simple_bpage_from_link(bpage->link.next); +} + +static void simple_bpage_reset(struct simple_buffer_page *bpage) +{ + bpage->write = 0; + bpage->entries = 0; + + local_set(&bpage->page->commit, 0); +} + +static void simple_bpage_init(struct simple_buffer_page *bpage, unsigned long page) +{ + INIT_LIST_HEAD(&bpage->link); + bpage->page = (struct buffer_data_page *)page; + + simple_bpage_reset(bpage); +} + +#define simple_rb_meta_inc(__meta, __inc) \ + WRITE_ONCE((__meta), (__meta + __inc)) + +static bool simple_rb_loaded(struct simple_rb_per_cpu *cpu_buffer) +{ + return !!cpu_buffer->bpages; +} + +static int simple_rb_find_head(struct simple_rb_per_cpu *cpu_buffer) +{ + int retry = cpu_buffer->nr_pages * 2; + struct simple_buffer_page *head; + + head = cpu_buffer->head_page; + + while (retry--) { + unsigned long link; + +spin: + /* See smp_store_release in simple_bpage_set_head_link() */ + link = (unsigned long)smp_load_acquire(&head->link.prev->next); + + switch (link & ~SIMPLE_RB_LINK_MASK) { + /* Found the head */ + case SIMPLE_RB_LINK_HEAD: + cpu_buffer->head_page = head; + return 0; + /* The writer caught the head, we can spin, that won't be long */ + case SIMPLE_RB_LINK_HEAD_MOVING: + goto spin; + } + + head = simple_bpage_next_page(head); + } + + return -EBUSY; +} + +/** + * simple_ring_buffer_swap_reader_page - Swap ring-buffer head with the reader + * @cpu_buffer: A simple_rb_per_cpu + * + * This function enables consuming reading. It ensures the current head page will not be overwritten + * and can be safely read. + * + * Returns 0 on success, -ENODEV if @cpu_buffer was unloaded or -EBUSY if we failed to catch the + * head page. + */ +int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *last, *head, *reader; + unsigned long overrun; + int retry = 8; + int ret; + + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + reader = cpu_buffer->reader_page; + + do { + /* Run after the writer to find the head */ + ret = simple_rb_find_head(cpu_buffer); + if (ret) + return ret; + + head = cpu_buffer->head_page; + + /* Connect the reader page around the header page */ + reader->link.next = head->link.next; + reader->link.prev = head->link.prev; + + /* The last page before the head */ + last = simple_bpage_from_link(head->link.prev); + + /* The reader page points to the new header page */ + simple_bpage_set_head_link(reader); + + overrun = cpu_buffer->meta->overrun; + } while (!simple_bpage_unset_head_link(last, reader, SIMPLE_RB_LINK_NORMAL) && retry--); + + if (!retry) + return -EINVAL; + + cpu_buffer->head_page = simple_bpage_from_link(reader->link.next); + cpu_buffer->head_page->link.prev = &reader->link; + cpu_buffer->reader_page = head; + cpu_buffer->meta->reader.lost_events = overrun - cpu_buffer->last_overrun; + cpu_buffer->meta->reader.id = cpu_buffer->reader_page->id; + cpu_buffer->last_overrun = overrun; + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_swap_reader_page); + +static struct simple_buffer_page *simple_rb_move_tail(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *tail, *new_tail; + + tail = cpu_buffer->tail_page; + new_tail = simple_bpage_next_page(tail); + + if (simple_bpage_unset_head_link(tail, new_tail, SIMPLE_RB_LINK_HEAD_MOVING)) { + /* + * Oh no! we've caught the head. There is none anymore and + * swap_reader will spin until we set the new one. Overrun must + * be written first, to make sure we report the correct number + * of lost events. + */ + simple_rb_meta_inc(cpu_buffer->meta->overrun, new_tail->entries); + simple_rb_meta_inc(cpu_buffer->meta->pages_lost, 1); + + simple_bpage_set_head_link(new_tail); + simple_bpage_set_normal_link(tail); + } + + simple_bpage_reset(new_tail); + cpu_buffer->tail_page = new_tail; + + simple_rb_meta_inc(cpu_buffer->meta->pages_touched, 1); + + return new_tail; +} + +static unsigned long rb_event_size(unsigned long length) +{ + struct ring_buffer_event *event; + + return length + RB_EVNT_HDR_SIZE + sizeof(event->array[0]); +} + +static struct ring_buffer_event * +rb_event_add_ts_extend(struct ring_buffer_event *event, u64 delta) +{ + event->type_len = RINGBUF_TYPE_TIME_EXTEND; + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + + return (struct ring_buffer_event *)((unsigned long)event + 8); +} + +static struct ring_buffer_event * +simple_rb_reserve_next(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, u64 timestamp) +{ + unsigned long ts_ext_size = 0, event_size = rb_event_size(length); + struct simple_buffer_page *tail = cpu_buffer->tail_page; + struct ring_buffer_event *event; + u32 write, prev_write; + u64 time_delta; + + time_delta = timestamp - cpu_buffer->write_stamp; + + if (test_time_stamp(time_delta)) + ts_ext_size = 8; + + prev_write = tail->write; + write = prev_write + event_size + ts_ext_size; + + if (unlikely(write > (PAGE_SIZE - BUF_PAGE_HDR_SIZE))) + tail = simple_rb_move_tail(cpu_buffer); + + if (!tail->entries) { + tail->page->time_stamp = timestamp; + time_delta = 0; + ts_ext_size = 0; + write = event_size; + prev_write = 0; + } + + tail->write = write; + tail->entries++; + + cpu_buffer->write_stamp = timestamp; + + event = (struct ring_buffer_event *)(tail->page->data + prev_write); + if (ts_ext_size) { + event = rb_event_add_ts_extend(event, time_delta); + time_delta = 0; + } + + event->type_len = 0; + event->time_delta = time_delta; + event->array[0] = event_size - RB_EVNT_HDR_SIZE; + + return event; +} + +/** + * simple_ring_buffer_reserve - Reserve an entry in @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * @length: Size of the entry in bytes + * @timestamp: Timestamp of the entry + * + * Returns the address of the entry where to write data or NULL + */ +void *simple_ring_buffer_reserve(struct simple_rb_per_cpu *cpu_buffer, unsigned long length, + u64 timestamp) +{ + struct ring_buffer_event *rb_event; + + if (cmpxchg(&cpu_buffer->status, SIMPLE_RB_READY, SIMPLE_RB_WRITING) != SIMPLE_RB_READY) + return NULL; + + rb_event = simple_rb_reserve_next(cpu_buffer, length, timestamp); + + return &rb_event->array[1]; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_reserve); + +/** + * simple_ring_buffer_commit - Commit the entry reserved with simple_ring_buffer_reserve() + * @cpu_buffer: The simple_rb_per_cpu where the entry has been reserved + */ +void simple_ring_buffer_commit(struct simple_rb_per_cpu *cpu_buffer) +{ + local_set(&cpu_buffer->tail_page->page->commit, + cpu_buffer->tail_page->write); + simple_rb_meta_inc(cpu_buffer->meta->entries, 1); + + /* + * Paired with simple_rb_enable_tracing() to ensure data is + * written to the ring-buffer before teardown. + */ + smp_store_release(&cpu_buffer->status, SIMPLE_RB_READY); +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_commit); + +static u32 simple_rb_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable) +{ + u32 prev_status; + + if (enable) + return cmpxchg(&cpu_buffer->status, SIMPLE_RB_UNAVAILABLE, SIMPLE_RB_READY); + + /* Wait for the buffer to be released */ + do { + prev_status = cmpxchg_acquire(&cpu_buffer->status, + SIMPLE_RB_READY, + SIMPLE_RB_UNAVAILABLE); + } while (prev_status == SIMPLE_RB_WRITING); + + return prev_status; +} + +/** + * simple_ring_buffer_reset - Reset @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * + * This will not clear the content of the data, only reset counters and pointers + * + * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded. + */ +int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer) +{ + struct simple_buffer_page *bpage; + u32 prev_status; + int ret; + + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + prev_status = simple_rb_enable_tracing(cpu_buffer, false); + + ret = simple_rb_find_head(cpu_buffer); + if (ret) + return ret; + + bpage = cpu_buffer->tail_page = cpu_buffer->head_page; + do { + simple_bpage_reset(bpage); + bpage = simple_bpage_next_page(bpage); + } while (bpage != cpu_buffer->head_page); + + simple_bpage_reset(cpu_buffer->reader_page); + + cpu_buffer->last_overrun = 0; + cpu_buffer->write_stamp = 0; + + cpu_buffer->meta->reader.read = 0; + cpu_buffer->meta->reader.lost_events = 0; + cpu_buffer->meta->entries = 0; + cpu_buffer->meta->overrun = 0; + cpu_buffer->meta->read = 0; + cpu_buffer->meta->pages_lost = 0; + cpu_buffer->meta->pages_touched = 0; + + if (prev_status == SIMPLE_RB_READY) + simple_rb_enable_tracing(cpu_buffer, true); + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_reset); + +/** + * simple_ring_buffer_init - Init @cpu_buffer based on @desc + * @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller. + * @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va + * @desc: A ring_buffer_desc + * + * Returns 0 on success or -EINVAL if the content of @desc is invalid + */ +int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc) +{ + struct simple_buffer_page *bpage = bpages; + int i; + + /* At least 1 reader page and two pages in the ring-buffer */ + if (desc->nr_page_va < 3) + return -EINVAL; + + memset(cpu_buffer, 0, sizeof(*cpu_buffer)); + + cpu_buffer->bpages = bpages; + + cpu_buffer->meta = (void *)desc->meta_va; + memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta)); + cpu_buffer->meta->meta_page_size = PAGE_SIZE; + cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages; + + /* The reader page is not part of the ring initially */ + simple_bpage_init(bpage, desc->page_va[0]); + bpage->id = 0; + + cpu_buffer->nr_pages = 1; + + cpu_buffer->reader_page = bpage; + cpu_buffer->tail_page = bpage + 1; + cpu_buffer->head_page = bpage + 1; + + for (i = 1; i < desc->nr_page_va; i++) { + simple_bpage_init(++bpage, desc->page_va[i]); + + bpage->link.next = &(bpage + 1)->link; + bpage->link.prev = &(bpage - 1)->link; + bpage->id = i; + + cpu_buffer->nr_pages = i + 1; + } + + /* Close the ring */ + bpage->link.next = &cpu_buffer->tail_page->link; + cpu_buffer->tail_page->link.prev = &bpage->link; + + /* The last init'ed page points to the head page */ + simple_bpage_set_head_link(bpage); + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_init); + +/** + * simple_ring_buffer_unload - Prepare @cpu_buffer for deletion + * @cpu_buffer: A simple_rb_per_cpu that will be deleted. + */ +void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer) +{ + if (!simple_rb_loaded(cpu_buffer)) + return; + + simple_rb_enable_tracing(cpu_buffer, false); + + cpu_buffer->bpages = NULL; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_unload); + +/** + * simple_ring_buffer_enable_tracing - Enable or disable writing to @cpu_buffer + * @cpu_buffer: A simple_rb_per_cpu + * @enable: True to enable tracing, False to disable it + * + * Returns 0 on success or -ENODEV if @cpu_buffer was unloaded + */ +int simple_ring_buffer_enable_tracing(struct simple_rb_per_cpu *cpu_buffer, bool enable) +{ + if (!simple_rb_loaded(cpu_buffer)) + return -ENODEV; + + simple_rb_enable_tracing(cpu_buffer, enable); + + return 0; +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_enable_tracing); -- cgit v1.2.3 From 635923081c792c830fb87e680d6dd5f348926b3f Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 9 Mar 2026 16:25:03 +0000 Subject: tracing: load/unload page callbacks for simple_ring_buffer Add load/unload callback used for each admitted page in the ring-buffer. This will be later useful for the pKVM hypervisor which uses a different VA space and need to dynamically map/unmap the ring-buffer pages. Link: https://patch.msgid.link/20260309162516.2623589-18-vdonnefort@google.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Vincent Donnefort Signed-off-by: Steven Rostedt (Google) --- include/linux/simple_ring_buffer.h | 8 ++++ kernel/trace/simple_ring_buffer.c | 91 ++++++++++++++++++++++++++++++-------- 2 files changed, 80 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/simple_ring_buffer.h b/include/linux/simple_ring_buffer.h index 2c4c0ae336bc..21aec556293e 100644 --- a/include/linux/simple_ring_buffer.h +++ b/include/linux/simple_ring_buffer.h @@ -54,4 +54,12 @@ int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer); int simple_ring_buffer_swap_reader_page(struct simple_rb_per_cpu *cpu_buffer); +int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer, + struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc, + void *(*load_page)(unsigned long va), + void (*unload_page)(void *va)); + +void simple_ring_buffer_unload_mm(struct simple_rb_per_cpu *cpu_buffer, + void (*unload_page)(void *)); #endif diff --git a/kernel/trace/simple_ring_buffer.c b/kernel/trace/simple_ring_buffer.c index 15df9781411b..02af2297ae5a 100644 --- a/kernel/trace/simple_ring_buffer.c +++ b/kernel/trace/simple_ring_buffer.c @@ -71,7 +71,7 @@ static void simple_bpage_reset(struct simple_buffer_page *bpage) local_set(&bpage->page->commit, 0); } -static void simple_bpage_init(struct simple_buffer_page *bpage, unsigned long page) +static void simple_bpage_init(struct simple_buffer_page *bpage, void *page) { INIT_LIST_HEAD(&bpage->link); bpage->page = (struct buffer_data_page *)page; @@ -372,18 +372,15 @@ int simple_ring_buffer_reset(struct simple_rb_per_cpu *cpu_buffer) } EXPORT_SYMBOL_GPL(simple_ring_buffer_reset); -/** - * simple_ring_buffer_init - Init @cpu_buffer based on @desc - * @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller. - * @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va - * @desc: A ring_buffer_desc - * - * Returns 0 on success or -EINVAL if the content of @desc is invalid - */ -int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, - const struct ring_buffer_desc *desc) +int simple_ring_buffer_init_mm(struct simple_rb_per_cpu *cpu_buffer, + struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc, + void *(*load_page)(unsigned long va), + void (*unload_page)(void *va)) { struct simple_buffer_page *bpage = bpages; + int ret = 0; + void *page; int i; /* At least 1 reader page and two pages in the ring-buffer */ @@ -392,15 +389,22 @@ int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_ memset(cpu_buffer, 0, sizeof(*cpu_buffer)); - cpu_buffer->bpages = bpages; + cpu_buffer->meta = load_page(desc->meta_va); + if (!cpu_buffer->meta) + return -EINVAL; - cpu_buffer->meta = (void *)desc->meta_va; memset(cpu_buffer->meta, 0, sizeof(*cpu_buffer->meta)); cpu_buffer->meta->meta_page_size = PAGE_SIZE; cpu_buffer->meta->nr_subbufs = cpu_buffer->nr_pages; /* The reader page is not part of the ring initially */ - simple_bpage_init(bpage, desc->page_va[0]); + page = load_page(desc->page_va[0]); + if (!page) { + unload_page(cpu_buffer->meta); + return -EINVAL; + } + + simple_bpage_init(bpage, page); bpage->id = 0; cpu_buffer->nr_pages = 1; @@ -410,7 +414,13 @@ int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_ cpu_buffer->head_page = bpage + 1; for (i = 1; i < desc->nr_page_va; i++) { - simple_bpage_init(++bpage, desc->page_va[i]); + page = load_page(desc->page_va[i]); + if (!page) { + ret = -EINVAL; + break; + } + + simple_bpage_init(++bpage, page); bpage->link.next = &(bpage + 1)->link; bpage->link.prev = &(bpage - 1)->link; @@ -419,6 +429,14 @@ int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_ cpu_buffer->nr_pages = i + 1; } + if (ret) { + for (i--; i >= 0; i--) + unload_page((void *)desc->page_va[i]); + unload_page(cpu_buffer->meta); + + return ret; + } + /* Close the ring */ bpage->link.next = &cpu_buffer->tail_page->link; cpu_buffer->tail_page->link.prev = &bpage->link; @@ -426,23 +444,58 @@ int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_ /* The last init'ed page points to the head page */ simple_bpage_set_head_link(bpage); + cpu_buffer->bpages = bpages; + return 0; } -EXPORT_SYMBOL_GPL(simple_ring_buffer_init); + +static void *__load_page(unsigned long page) +{ + return (void *)page; +} + +static void __unload_page(void *page) { } /** - * simple_ring_buffer_unload - Prepare @cpu_buffer for deletion - * @cpu_buffer: A simple_rb_per_cpu that will be deleted. + * simple_ring_buffer_init - Init @cpu_buffer based on @desc + * @cpu_buffer: A simple_rb_per_cpu buffer to init, allocated by the caller. + * @bpages: Array of simple_buffer_pages, with as many elements as @desc->nr_page_va + * @desc: A ring_buffer_desc + * + * Returns 0 on success or -EINVAL if the content of @desc is invalid */ -void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer) +int simple_ring_buffer_init(struct simple_rb_per_cpu *cpu_buffer, struct simple_buffer_page *bpages, + const struct ring_buffer_desc *desc) +{ + return simple_ring_buffer_init_mm(cpu_buffer, bpages, desc, __load_page, __unload_page); +} +EXPORT_SYMBOL_GPL(simple_ring_buffer_init); + +void simple_ring_buffer_unload_mm(struct simple_rb_per_cpu *cpu_buffer, + void (*unload_page)(void *)) { + int p; + if (!simple_rb_loaded(cpu_buffer)) return; simple_rb_enable_tracing(cpu_buffer, false); + unload_page(cpu_buffer->meta); + for (p = 0; p < cpu_buffer->nr_pages; p++) + unload_page(cpu_buffer->bpages[p].page); + cpu_buffer->bpages = NULL; } + +/** + * simple_ring_buffer_unload - Prepare @cpu_buffer for deletion + * @cpu_buffer: A simple_rb_per_cpu that will be deleted. + */ +void simple_ring_buffer_unload(struct simple_rb_per_cpu *cpu_buffer) +{ + return simple_ring_buffer_unload_mm(cpu_buffer, __unload_page); +} EXPORT_SYMBOL_GPL(simple_ring_buffer_unload); /** -- cgit v1.2.3 From c116737e972ea74f4468a1bd0703d623a3c0ee4a Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Mon, 9 Mar 2026 14:15:28 +0100 Subject: workqueue: Add system_dfl_long_wq for long unbound works Currently there are users of queue_delayed_work() who specify system_long_wq, the per-cpu workqueue. This workqueue should be used for long per-cpu works, but queue_delayed_work() queue the work using: queue_delayed_work_on(WORK_CPU_UNBOUND, ...); This would end up calling __queue_delayed_work() that does: if (housekeeping_enabled(HK_TYPE_TIMER)) { // [....] } else { if (likely(cpu == WORK_CPU_UNBOUND)) add_timer_global(timer); else add_timer_on(timer, cpu); } So when cpu == WORK_CPU_UNBOUND the timer is global and is not using a specific CPU. Later, when __queue_work() is called: if (req_cpu == WORK_CPU_UNBOUND) { if (wq->flags & WQ_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); else cpu = raw_smp_processor_id(); } Because the wq is not unbound, it takes the CPU where the timer fired and enqueue the work on that CPU. The consequence of all of this is that the work can run anywhere, depending on where the timer fired. Introduce system_dfl_long_wq in order to change, in a future step, users that are still calling: queue_delayed_work(system_long_wq, ...); with the new system_dfl_long_wq instead, so that the work may benefit from scheduler task placement. Signed-off-by: Marco Crivellari Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 6 ++++++ kernel/workqueue.c | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index fc5744402a66..8e0855d56e74 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -440,6 +440,9 @@ enum wq_consts { * system_long_wq is similar to system_percpu_wq but may host long running * works. Queue flushing might take relatively long. * + * system_dfl_long_wq is similar to system_dfl_wq but it may host long running + * works. + * * system_dfl_wq is unbound workqueue. Workers are not bound to * any specific CPU, not concurrency managed, and all queued works are * executed immediately as long as max_active limit is not reached and @@ -468,6 +471,7 @@ extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; extern struct workqueue_struct *system_bh_wq; extern struct workqueue_struct *system_bh_highpri_wq; +extern struct workqueue_struct *system_dfl_long_wq; void workqueue_softirq_action(bool highpri); void workqueue_softirq_dead(unsigned int cpu); @@ -783,6 +787,8 @@ extern void __warn_flushing_systemwide_wq(void) _wq == system_highpri_wq) || \ (__builtin_constant_p(_wq == system_long_wq) && \ _wq == system_long_wq) || \ + (__builtin_constant_p(_wq == system_dfl_long_wq) && \ + _wq == system_dfl_long_wq) || \ (__builtin_constant_p(_wq == system_dfl_wq) && \ _wq == system_dfl_wq) || \ (__builtin_constant_p(_wq == system_freezable_wq) && \ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2f95cb0d2f1b..2d8ff903f113 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -530,6 +530,8 @@ struct workqueue_struct *system_bh_wq; EXPORT_SYMBOL_GPL(system_bh_wq); struct workqueue_struct *system_bh_highpri_wq; EXPORT_SYMBOL_GPL(system_bh_highpri_wq); +struct workqueue_struct *system_dfl_long_wq __ro_after_init; +EXPORT_SYMBOL_GPL(system_dfl_long_wq); static int worker_thread(void *__worker); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); @@ -7954,11 +7956,12 @@ void __init workqueue_init_early(void) system_bh_wq = alloc_workqueue("events_bh", WQ_BH | WQ_PERCPU, 0); system_bh_highpri_wq = alloc_workqueue("events_bh_highpri", WQ_BH | WQ_HIGHPRI | WQ_PERCPU, 0); + system_dfl_long_wq = alloc_workqueue("events_dfl_long", WQ_UNBOUND, WQ_MAX_ACTIVE); BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq || !system_unbound_wq || !system_freezable_wq || !system_dfl_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq || - !system_bh_wq || !system_bh_highpri_wq); + !system_bh_wq || !system_bh_highpri_wq || !system_dfl_long_wq); } static void __init wq_cpu_intensive_thresh_init(void) -- cgit v1.2.3 From 878004e2852bc22ce0687c5597d6fe3909fb59f3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 6 Mar 2026 14:10:32 -0800 Subject: iopoll: fix function parameter names in read_poll_timeout_atomic() Correct the function parameter names to avoid kernel-doc warnings and to emphasize this function is atomic (non-sleeping). Warning: include/linux/iopoll.h:169 function parameter 'sleep_us' not described in 'read_poll_timeout_atomic' Warning: ../include/linux/iopoll.h:169 function parameter 'sleep_before_read' not described in 'read_poll_timeout_atomic' Fixes: 9df8043a546d ("iopoll: Generalize read_poll_timeout() into poll_timeout_us()") Signed-off-by: Randy Dunlap Reviewed-by: Jani Nikula Link: https://patch.msgid.link/20260306221033.2357305-1-rdunlap@infradead.org Signed-off-by: Jani Nikula --- include/linux/iopoll.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h index bdd2e0652bc3..53edd69acb9b 100644 --- a/include/linux/iopoll.h +++ b/include/linux/iopoll.h @@ -159,7 +159,7 @@ * * This macro does not rely on timekeeping. Hence it is safe to call even when * timekeeping is suspended, at the expense of an underestimation of wall clock - * time, which is rather minimal with a non-zero delay_us. + * time, which is rather minimal with a non-zero @delay_us. * * When available, you'll probably want to use one of the specialized * macros defined below rather than this macro directly. @@ -167,9 +167,9 @@ * Returns: 0 on success and -ETIMEDOUT upon a timeout. In either * case, the last read value at @args is stored in @val. */ -#define read_poll_timeout_atomic(op, val, cond, sleep_us, timeout_us, \ - sleep_before_read, args...) \ - poll_timeout_us_atomic((val) = op(args), cond, sleep_us, timeout_us, sleep_before_read) +#define read_poll_timeout_atomic(op, val, cond, delay_us, timeout_us, \ + delay_before_read, args...) \ + poll_timeout_us_atomic((val) = op(args), cond, delay_us, timeout_us, delay_before_read) /** * readx_poll_timeout - Periodically poll an address until a condition is met or a timeout occurs -- cgit v1.2.3 From aca086ff27c3f67e81617e4b063d1126544a4f19 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:17:58 +0100 Subject: sed-opal: add IOC_OPAL_REACTIVATE_LSP. This adds the 'Reactivate' method as described in the "TCG Storage Opal SSC Feature Set: Single User Mode" document (ch. 3.1.1.1). The method enables switching an already active SED OPAL2 device, with appropriate firmware support for Single User Mode (SUM), to or from SUM. Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/opal_proto.h | 1 + block/sed-opal.c | 99 +++++++++++++++++++++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 14 ++++++ 4 files changed, 115 insertions(+) (limited to 'include/linux') diff --git a/block/opal_proto.h b/block/opal_proto.h index 3ccee5977c10..d138785b8198 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -155,6 +155,7 @@ enum opal_method { OPAL_AUTHENTICATE, OPAL_RANDOM, OPAL_ERASE, + OPAL_REACTIVATE, }; enum opal_token { diff --git a/block/sed-opal.c b/block/sed-opal.c index 83bee47aa29f..5d06f5f433bf 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -220,6 +220,8 @@ static const u8 opalmethod[][OPAL_METHOD_LENGTH] = { { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x06, 0x01 }, [OPAL_ERASE] = { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 }, + [OPAL_REACTIVATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x01 }, }; static int end_opal_session_error(struct opal_dev *dev); @@ -2287,6 +2289,74 @@ static int activate_lsp(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } +static int reactivate_lsp(struct opal_dev *dev, void *data) +{ + struct opal_lr_react *opal_react = data; + u8 user_lr[OPAL_UID_LENGTH]; + int err, i; + + err = cmd_start(dev, opaluid[OPAL_THISSP_UID], + opalmethod[OPAL_REACTIVATE]); + + if (err) { + pr_debug("Error building Reactivate LockingSP command.\n"); + return err; + } + + /* + * If neither 'entire_table' nor 'num_lrs' is set, the device + * gets reactivated with SUM disabled. Only Admin1PIN will change + * if set. + */ + if (opal_react->entire_table) { + /* Entire Locking table (all locking ranges) will be put in SUM. */ + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_SUM_SET_LIST); + add_token_bytestring(&err, dev, opaluid[OPAL_LOCKING_TABLE], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + } else if (opal_react->num_lrs) { + /* Subset of Locking table (selected locking range(s)) to be put in SUM */ + err = build_locking_range(user_lr, sizeof(user_lr), + opal_react->lr[0]); + if (err) + return err; + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_SUM_SET_LIST); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); + for (i = 1; i < opal_react->num_lrs; i++) { + user_lr[7] = opal_react->lr[i]; + add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); + } + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + } + + /* Skipping the rangle policy parameter is same as setting its value to zero */ + if (opal_react->range_policy && (opal_react->num_lrs || opal_react->entire_table)) { + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_SUM_RANGE_POLICY); + add_token_u8(&err, dev, 1); + add_token_u8(&err, dev, OPAL_ENDNAME); + } + + /* + * Optional parameter. If set, it changes the Admin1 PIN even when SUM + * is being disabled. + */ + if (opal_react->new_admin_key.key_len) { + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_SUM_ADMIN1_PIN); + add_token_bytestring(&err, dev, opal_react->new_admin_key.key, + opal_react->new_admin_key.key_len); + add_token_u8(&err, dev, OPAL_ENDNAME); + } + + return finalize_and_send(dev, parse_and_check_status); +} + /* Determine if we're in the Manufactured Inactive or Active state */ static int get_lsp_lifecycle(struct opal_dev *dev, void *data) { @@ -2957,6 +3027,32 @@ static int opal_activate_lsp(struct opal_dev *dev, return ret; } +static int opal_reactivate_lsp(struct opal_dev *dev, + struct opal_lr_react *opal_lr_react) +{ + const struct opal_step active_steps[] = { + { start_admin1LSP_opal_session, &opal_lr_react->key }, + { reactivate_lsp, opal_lr_react }, + /* No end_opal_session. The controller terminates the session */ + }; + int ret; + + /* use either 'entire_table' parameter or set of locking ranges */ + if (opal_lr_react->num_lrs > OPAL_MAX_LRS || + (opal_lr_react->num_lrs && opal_lr_react->entire_table)) + return -EINVAL; + + ret = opal_get_key(dev, &opal_lr_react->key); + if (ret) + return ret; + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + static int opal_setup_locking_range(struct opal_dev *dev, struct opal_user_lr_setup *opal_lrs) { @@ -3315,6 +3411,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_SET_SID_PW: ret = opal_set_new_sid_pw(dev, p); break; + case IOC_OPAL_REACTIVATE_LSP: + ret = opal_reactivate_lsp(dev, p); + break; default: break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 80f33a93f944..2ae5e6b0ac21 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -53,6 +53,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_DISCOVERY: case IOC_OPAL_REVERT_LSP: case IOC_OPAL_SET_SID_PW: + case IOC_OPAL_REACTIVATE_LSP: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 9025dd5a4f0f..d03e590b6501 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -74,6 +74,19 @@ struct opal_lr_act { __u8 align[2]; /* Align to 8 byte boundary */ }; +struct opal_lr_react { + struct opal_key key; + struct opal_key new_admin_key; /* Set new Admin1 PIN if key_len is > 0 */ + __u8 num_lrs; /* + * Configure selected ranges (from lr[]) in SUM. + * If num_lrs > 0 the 'entire_table' must be 0 + */ + __u8 lr[OPAL_MAX_LRS]; + __u8 range_policy; /* Set RangeStartRangeLengthPolicy parameter */ + __u8 entire_table; /* Set all locking objects in SUM */ + __u8 align[4]; /* Align to 8 byte boundary */ +}; + struct opal_session_info { __u32 sum; __u32 who; @@ -216,5 +229,6 @@ struct opal_revert_lsp { #define IOC_OPAL_DISCOVERY _IOW('p', 239, struct opal_discovery) #define IOC_OPAL_REVERT_LSP _IOW('p', 240, struct opal_revert_lsp) #define IOC_OPAL_SET_SID_PW _IOW('p', 241, struct opal_new_pw) +#define IOC_OPAL_REACTIVATE_LSP _IOW('p', 242, struct opal_lr_react) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3 From 8e3d34a7ce7386b01947dd649bd24775544e4d3e Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:18:00 +0100 Subject: sed-opal: add IOC_OPAL_LR_SET_START_LEN ioctl. This ioctl is used to set up locking range start (offset) and locking range length attributes only. In Single User Mode (SUM), if the RangeStartRangeLengthPolicy parameter is set in the 'Reactivate' method, only Admin authority maintains the locking range length and start (offset) attributes of Locking objects set up for SUM. All other attributes from struct opal_user_lr_setup (RLE - read locking enabled, WLE - write locking enabled) shall remain in possession of the User authority associated with the Locking object set for SUM. Therefore, we need a separate function for setting up locking range start and locking range length because it may require two different authorities (and sessions) if the RangeStartRangeLengthPolicy attribute is set. With the IOC_OPAL_LR_SET_START_LEN ioctl, the opal_user_lr_setup members 'RLE' and 'WLE' of the ioctl argument are ignored. Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Signed-off-by: Jens Axboe --- block/sed-opal.c | 28 ++++++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 1 + 3 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/block/sed-opal.c b/block/sed-opal.c index 7be72f621952..55c8a0953d78 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -3091,6 +3091,31 @@ static int opal_setup_locking_range(struct opal_dev *dev, return ret; } +static int opal_setup_locking_range_start_length(struct opal_dev *dev, + struct opal_user_lr_setup *opal_lrs) +{ + const struct opal_step lr_steps[] = { + { start_auth_opal_session, &opal_lrs->session }, + { setup_locking_range_start_length, opal_lrs }, + { end_opal_session, } + }; + int ret; + + /* we can not set global locking range offset or length */ + if (opal_lrs->session.opal_key.lr == 0) + return -EINVAL; + + ret = opal_get_key(dev, &opal_lrs->session.opal_key); + if (ret) + return ret; + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + static int opal_locking_range_status(struct opal_dev *dev, struct opal_lr_status *opal_lrst, void __user *data) @@ -3431,6 +3456,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_REACTIVATE_LSP: ret = opal_reactivate_lsp(dev, p); break; + case IOC_OPAL_LR_SET_START_LEN: + ret = opal_setup_locking_range_start_length(dev, p); + break; default: break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 2ae5e6b0ac21..a0df6819b0a9 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -54,6 +54,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_REVERT_LSP: case IOC_OPAL_SET_SID_PW: case IOC_OPAL_REACTIVATE_LSP: + case IOC_OPAL_LR_SET_START_LEN: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index d03e590b6501..82de38f3fbeb 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -230,5 +230,6 @@ struct opal_revert_lsp { #define IOC_OPAL_REVERT_LSP _IOW('p', 240, struct opal_revert_lsp) #define IOC_OPAL_SET_SID_PW _IOW('p', 241, struct opal_new_pw) #define IOC_OPAL_REACTIVATE_LSP _IOW('p', 242, struct opal_lr_react) +#define IOC_OPAL_LR_SET_START_LEN _IOW('p', 243, struct opal_user_lr_setup) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3 From a441a9d22433fea561de131e27fff41715c2d186 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:18:01 +0100 Subject: sed-opal: add IOC_OPAL_ENABLE_DISABLE_LR. This ioctl is used to set up RLE (read lock enabled) and WLE (write lock enabled) parameters of the Locking object. In Single User Mode (SUM), if the RangeStartRangeLengthPolicy parameter is set in the 'Reactivate' method, only Admin authority maintains the locking range length and start (offset) attributes of Locking objects set up for SUM. All other attributes from struct opal_user_lr_setup (RLE - read locking enabled, WLE - write locking enabled) shall remain in possession of the User authority associated with the Locking object set for SUM. With the IOC_OPAL_ENABLE_DISABLE_LR ioctl, the opal_user_lr_setup members 'range_start' and 'range_length' of the ioctl argument are ignored. Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Signed-off-by: Jens Axboe --- block/sed-opal.c | 24 ++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 1 + 3 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/block/sed-opal.c b/block/sed-opal.c index 55c8a0953d78..53a73422911e 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -3116,6 +3116,27 @@ static int opal_setup_locking_range_start_length(struct opal_dev *dev, return ret; } +static int opal_enable_disable_range(struct opal_dev *dev, + struct opal_user_lr_setup *opal_lrs) +{ + const struct opal_step lr_steps[] = { + { start_auth_opal_session, &opal_lrs->session }, + { setup_enable_range, opal_lrs }, + { end_opal_session, } + }; + int ret; + + ret = opal_get_key(dev, &opal_lrs->session.opal_key); + if (ret) + return ret; + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + static int opal_locking_range_status(struct opal_dev *dev, struct opal_lr_status *opal_lrst, void __user *data) @@ -3459,6 +3480,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_LR_SET_START_LEN: ret = opal_setup_locking_range_start_length(dev, p); break; + case IOC_OPAL_ENABLE_DISABLE_LR: + ret = opal_enable_disable_range(dev, p); + break; default: break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index a0df6819b0a9..1d63479838cf 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -55,6 +55,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_SET_SID_PW: case IOC_OPAL_REACTIVATE_LSP: case IOC_OPAL_LR_SET_START_LEN: + case IOC_OPAL_ENABLE_DISABLE_LR: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 82de38f3fbeb..bde023ae2295 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -231,5 +231,6 @@ struct opal_revert_lsp { #define IOC_OPAL_SET_SID_PW _IOW('p', 241, struct opal_new_pw) #define IOC_OPAL_REACTIVATE_LSP _IOW('p', 242, struct opal_lr_react) #define IOC_OPAL_LR_SET_START_LEN _IOW('p', 243, struct opal_user_lr_setup) +#define IOC_OPAL_ENABLE_DISABLE_LR _IOW('p', 244, struct opal_user_lr_setup) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3 From 0cc9293bccb234552b81c3ebc074f5839f019e01 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:18:03 +0100 Subject: sed-opal: add IOC_OPAL_GET_SUM_STATUS ioctl. This adds a function for retrieving the set of Locking objects enabled for Single User Mode (SUM) and the value of the RangeStartRangeLengthPolicy parameter. It retrieves data from the LockingInfo table, specifically the columns SingleUserModeRanges and RangeStartLengthPolicy, which were added according to the TCG Opal Feature Set: Single User Mode, as described in chapters 4.4.3.1 and 4.4.3.2. Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Signed-off-by: Jens Axboe --- block/sed-opal.c | 159 ++++++++++++++++++++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 13 ++++ 3 files changed, 173 insertions(+) (limited to 'include/linux') diff --git a/block/sed-opal.c b/block/sed-opal.c index 6146a1b30421..c34d19e91201 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1757,6 +1757,12 @@ static int start_anybodyASP_opal_session(struct opal_dev *dev, void *data) OPAL_ADMINSP_UID, NULL, 0); } +static int start_anybodyLSP_opal_session(struct opal_dev *dev, void *data) +{ + return start_generic_opal_session(dev, OPAL_ANYBODY_UID, + OPAL_LOCKINGSP_UID, NULL, 0); +} + static int start_SIDASP_opal_session(struct opal_dev *dev, void *data) { int ret; @@ -3389,6 +3395,156 @@ static int opal_get_geometry(struct opal_dev *dev, void __user *data) return 0; } +static int get_sum_ranges(struct opal_dev *dev, void *data) +{ + const char *lr_uid; + size_t lr_uid_len; + u64 val; + const struct opal_resp_tok *tok; + int err, tok_n = 2; + struct opal_sum_ranges *sranges = data; + const __u8 lr_all[OPAL_MAX_LRS] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 }; + + err = generic_get_columns(dev, opaluid[OPAL_LOCKING_INFO_TABLE], OPAL_SUM_SET_LIST, + OPAL_SUM_RANGE_POLICY); + if (err) { + pr_debug("Couldn't get locking info table columns %d to %d.\n", + OPAL_SUM_SET_LIST, OPAL_SUM_RANGE_POLICY); + return err; + } + + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + if (!response_token_matches(tok, OPAL_STARTNAME)) { + pr_debug("Unexpected response token type %d.\n", tok_n); + return OPAL_INVAL_PARAM; + } + tok_n++; + + if (response_get_u64(&dev->parsed, tok_n) != OPAL_SUM_SET_LIST) { + pr_debug("Token %d does not match expected column %u.\n", + tok_n, OPAL_SUM_SET_LIST); + return OPAL_INVAL_PARAM; + } + tok_n++; + + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + /* + * The OPAL_SUM_SET_LIST response contains two distinct values: + * + * - the list of individual locking ranges (UIDs) put in SUM. The list + * may also be empty signaling the SUM is disabled. + * + * - the Locking table UID if the entire Locking table is put in SUM. + */ + if (response_token_matches(tok, OPAL_STARTLIST)) { + sranges->num_lrs = 0; + + tok_n++; + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + while (!response_token_matches(tok, OPAL_ENDLIST)) { + lr_uid_len = response_get_string(&dev->parsed, tok_n, &lr_uid); + if (lr_uid_len != OPAL_UID_LENGTH) { + pr_debug("Unexpected response token type %d.\n", tok_n); + return OPAL_INVAL_PARAM; + } + + if (memcmp(lr_uid, opaluid[OPAL_LOCKINGRANGE_GLOBAL], OPAL_UID_LENGTH)) { + if (lr_uid[5] != LOCKING_RANGE_NON_GLOBAL) { + pr_debug("Unexpected byte %d at LR UUID position 5.\n", + lr_uid[5]); + return OPAL_INVAL_PARAM; + } + sranges->lr[sranges->num_lrs++] = lr_uid[7]; + } else + sranges->lr[sranges->num_lrs++] = 0; + + tok_n++; + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + } + } else { + /* Only OPAL_LOCKING_TABLE UID is an alternative to OPAL_STARTLIST here. */ + lr_uid_len = response_get_string(&dev->parsed, tok_n, &lr_uid); + if (lr_uid_len != OPAL_UID_LENGTH) { + pr_debug("Unexpected response token type %d.\n", tok_n); + return OPAL_INVAL_PARAM; + } + + if (memcmp(lr_uid, opaluid[OPAL_LOCKING_TABLE], OPAL_UID_LENGTH)) { + pr_debug("Unexpected response UID.\n"); + return OPAL_INVAL_PARAM; + } + + /* sed-opal kernel API already provides following limit in Activate command */ + sranges->num_lrs = OPAL_MAX_LRS; + memcpy(sranges->lr, lr_all, OPAL_MAX_LRS); + } + tok_n++; + + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + if (!response_token_matches(tok, OPAL_ENDNAME)) { + pr_debug("Unexpected response token type %d.\n", tok_n); + return OPAL_INVAL_PARAM; + } + tok_n++; + + err = response_get_column(&dev->parsed, &tok_n, OPAL_SUM_RANGE_POLICY, &val); + if (err) + return err; + + sranges->range_policy = val ? 1 : 0; + + return 0; +} + +static int opal_get_sum_ranges(struct opal_dev *dev, struct opal_sum_ranges *opal_sum_rngs, + void __user *data) +{ + const struct opal_step admin_steps[] = { + { start_admin1LSP_opal_session, &opal_sum_rngs->key }, + { get_sum_ranges, opal_sum_rngs }, + { end_opal_session, } + }, anybody_steps[] = { + { start_anybodyLSP_opal_session, NULL }, + { get_sum_ranges, opal_sum_rngs }, + { end_opal_session, } + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + if (opal_sum_rngs->key.key_len) + /* Use Admin1 session (authenticated by PIN) to retrieve LockingInfo columns */ + ret = execute_steps(dev, admin_steps, ARRAY_SIZE(admin_steps)); + else + /* Use Anybody session (no key) to retrieve LockingInfo columns */ + ret = execute_steps(dev, anybody_steps, ARRAY_SIZE(anybody_steps)); + mutex_unlock(&dev->dev_lock); + + /* skip session info when copying back to uspace */ + if (!ret && copy_to_user(data + offsetof(struct opal_sum_ranges, num_lrs), + (void *)opal_sum_rngs + offsetof(struct opal_sum_ranges, num_lrs), + sizeof(*opal_sum_rngs) - offsetof(struct opal_sum_ranges, num_lrs))) { + pr_debug("Error copying SUM ranges info to userspace\n"); + return -EFAULT; + } + + return ret; +} + int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) { void *p; @@ -3483,6 +3639,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_ENABLE_DISABLE_LR: ret = opal_enable_disable_range(dev, p); break; + case IOC_OPAL_GET_SUM_STATUS: + ret = opal_get_sum_ranges(dev, p, arg); + break; default: break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 1d63479838cf..aa006edb612b 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -56,6 +56,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_REACTIVATE_LSP: case IOC_OPAL_LR_SET_START_LEN: case IOC_OPAL_ENABLE_DISABLE_LR: + case IOC_OPAL_GET_SUM_STATUS: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index bde023ae2295..9830298ec51c 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -111,6 +111,18 @@ struct opal_lr_status { __u8 align[4]; }; +struct opal_sum_ranges { + /* + * Initiate Admin1 session if key_len > 0, + * use Anybody session otherwise. + */ + struct opal_key key; + __u8 num_lrs; + __u8 lr[OPAL_MAX_LRS]; + __u8 range_policy; + __u8 align[5]; /* Align to 8 byte boundary */ +}; + struct opal_lock_unlock { struct opal_session_info session; __u32 l_state; @@ -232,5 +244,6 @@ struct opal_revert_lsp { #define IOC_OPAL_REACTIVATE_LSP _IOW('p', 242, struct opal_lr_react) #define IOC_OPAL_LR_SET_START_LEN _IOW('p', 243, struct opal_user_lr_setup) #define IOC_OPAL_ENABLE_DISABLE_LR _IOW('p', 244, struct opal_user_lr_setup) +#define IOC_OPAL_GET_SUM_STATUS _IOW('p', 245, struct opal_sum_ranges) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3 From 0ee8ab5d4dc51704be1157470f3df8090629f9fc Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Wed, 25 Feb 2026 20:51:05 +0000 Subject: block: annotate struct request_queue with __counted_by_ptr The queue_hw_ctx field in struct request_queue is an array of pointers to struct blk_mq_hw_ctx. The number of elements in this array is tracked by the nr_hw_queues field. The array is allocated in __blk_mq_realloc_hw_ctxs() using kcalloc_node() with set->nr_hw_queues elements. q->nr_hw_queues is subsequently updated to set->nr_hw_queues. When growing the array, the new array is assigned to queue_hw_ctx before nr_hw_queues is updated. This is safe because nr_hw_queues (the old smaller count) is used for bounds checking, which is within the new larger allocation. When shrinking the array, nr_hw_queues is updated to the smaller value, while queue_hw_ctx retains the larger allocation. This is also safe as the count is within the allocation bounds. Annotating queue_hw_ctx with __counted_by_ptr(nr_hw_queues) allows the compiler (with kSAN) to verify that accesses to queue_hw_ctx are within the valid range defined by nr_hw_queues. This patch was generated by CodeMender and reviewed by Bill Wendling. Tested by running blktests. Reviewed-by: Daniel Wagner Signed-off-by: Bill Wendling [axboe: massage commit message] Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d463b9b5a0a5..540c2c6c9afd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -502,7 +502,7 @@ struct request_queue { /* hw dispatch queues */ unsigned int nr_hw_queues; - struct blk_mq_hw_ctx * __rcu *queue_hw_ctx; + struct blk_mq_hw_ctx * __rcu *queue_hw_ctx __counted_by_ptr(nr_hw_queues); struct percpu_ref q_usage_counter; struct lock_class_key io_lock_cls_key; -- cgit v1.2.3 From b7cbc30e93e3a64ea058230f6d0c764d6d80276f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 27 Feb 2026 22:19:48 +0900 Subject: block: rename struct gendisk zone_wplugs_lock field Rename struct gendisk zone_wplugs_lock field to zone_wplugs_hash_lock to clearly indicates that this is the spinlock used for manipulating the hash table of zone write plugs. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-zoned.c | 23 ++++++++++++----------- include/linux/blkdev.h | 2 +- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 26c2aa79faf6..78810e726222 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -514,10 +514,11 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, * are racing with other submission context, so we may already have a * zone write plug for the same zone. */ - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) { if (zwplg->zone_no == zwplug->zone_no) { - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, + flags); return false; } } @@ -529,7 +530,7 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, * necessarilly in the active condition. */ zones_cond = rcu_dereference_check(disk->zones_cond, - lockdep_is_held(&disk->zone_wplugs_lock)); + lockdep_is_held(&disk->zone_wplugs_hash_lock)); if (zones_cond) zwplug->cond = zones_cond[zwplug->zone_no]; else @@ -537,7 +538,7 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); atomic_inc(&disk->nr_zone_wplugs); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); return true; } @@ -590,13 +591,13 @@ static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug) WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); blk_zone_set_cond(rcu_dereference_check(disk->zones_cond, - lockdep_is_held(&disk->zone_wplugs_lock)), + lockdep_is_held(&disk->zone_wplugs_hash_lock)), zwplug->zone_no, zwplug->cond); hlist_del_init_rcu(&zwplug->node); atomic_dec(&disk->nr_zone_wplugs); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); } @@ -1739,7 +1740,7 @@ put_zwplug: void disk_init_zone_resources(struct gendisk *disk) { - spin_lock_init(&disk->zone_wplugs_lock); + spin_lock_init(&disk->zone_wplugs_hash_lock); } /* @@ -1829,10 +1830,10 @@ static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond) { unsigned long flags; - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond, - lockdep_is_held(&disk->zone_wplugs_lock)); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + lockdep_is_held(&disk->zone_wplugs_hash_lock)); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); kfree_rcu_mightsleep(zones_cond); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 540c2c6c9afd..a49a1e38c6e7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -200,7 +200,7 @@ struct gendisk { u8 __rcu *zones_cond; unsigned int zone_wplugs_hash_bits; atomic_t nr_zone_wplugs; - spinlock_t zone_wplugs_lock; + spinlock_t zone_wplugs_hash_lock; struct mempool *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; -- cgit v1.2.3 From 1365b6904fd050bf22ab9f3df375a396de5837a1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 27 Feb 2026 22:19:49 +0900 Subject: block: allow submitting all zone writes from a single context In order to maintain sequential write patterns per zone with zoned block devices, zone write plugging issues only a single write BIO per zone at any time. This works well but has the side effect that when large sequential write streams are issued by the user and these streams cross zone boundaries, the device ends up receiving a discontiguous set of write commands for different zones. The same also happens when a user writes simultaneously at high queue depth multiple zones: the device does not see all sequential writes per zone and receives discontiguous writes to different zones. While this does not affect the performance of solid state zoned block devices, when using an SMR HDD, this pattern change from sequential writes to discontiguous writes to different zones significantly increases head seek which results in degraded write throughput. In order to reduce this seek overhead for rotational media devices, introduce a per disk zone write plugs kernel thread to issue all write BIOs to zones. This single zone write issuing context is enabled for any zoned block device that has a request queue flagged with the new QUEUE_ZONED_QD1_WRITES flag. The flag QUEUE_ZONED_QD1_WRITES is visible as the sysfs queue attribute zoned_qd1_writes for zoned devices. For regular block devices, this attribute is not visible. For zoned block devices, a user can override the default value set to force the global write maximum queue depth of 1 for a zoned block device, or clear this attribute to fallback to the default behavior of zone write plugging which limits writes to QD=1 per sequential zone. Writing to a zoned block device flagged with QUEUE_ZONED_QD1_WRITES is implemented using a list of zone write plugs that have a non-empty BIO list. Listed zone write plugs are processed by the disk zone write plugs worker kthread in FIFO order, and all BIOs of a zone write plug are all processed before switching to the next listed zone write plug. A newly submitted BIO for a non-FULL zone write plug that is not yet listed causes the addition of the zone write plug at the end of the disk list of zone write plugs. Since the write BIOs queued in a zone write plug BIO list are necessarilly sequential, for rotational media, using the single zone write plugs kthread to issue all BIOs maintains a sequential write pattern and thus reduces seek overhead and improves write throughput. This processing essentially result in always writing to HDDs at QD=1, which is not an issue for HDDs operating with write caching enabled. Performance with write cache disabled is also not degraded thanks to the efficient write handling of modern SMR HDDs. A disk list of zone write plugs is defined using the new struct gendisk zone_wplugs_list, and accesses to this list is protected using the zone_wplugs_list_lock spinlock. The per disk kthread (zone_wplugs_worker) code is implemented by the function disk_zone_wplugs_worker(). A reference on listed zone write plugs is always held until all BIOs of the zone write plug are processed by the worker kthread. BIO issuing at QD=1 is driven using a completion structure (zone_wplugs_worker_bio_done) and calls to blk_io_wait(). With this change, performance when sequentially writing the zones of a 30 TB SMR SATA HDD connected to an AHCI adapter changes as follows (1MiB direct I/Os, results in MB/s unit): +--------------------+ | Write BW (MB/s) | +------------------+----------+---------+ | Sequential write | Baseline | Patched | | Queue Depth | 6.19-rc8 | | +------------------+----------+---------+ | 1 | 244 | 245 | | 2 | 244 | 245 | | 4 | 245 | 245 | | 8 | 242 | 245 | | 16 | 222 | 246 | | 32 | 211 | 245 | | 64 | 193 | 244 | | 128 | 112 | 246 | +------------------+----------+---------+ With the current code (baseline), as the sequential write stream crosses a zone boundary, higher queue depth creates a gap between the last IO to the previous zone and the first IOs to the following zones, causing head seeks and degrading performance. Using the disk zone write plugs worker thread, this pattern disappears and the maximum throughput of the drive is maintained, leading to over 100% improvements in throughput for high queue depth write. Using 16 fio jobs all writing to randomly chosen zones at QD=32 with 1 MiB direct IOs, write throughput also increases significantly. +--------------------+ | Write BW (MB/s) | +------------------+----------+---------+ | Random write | Baseline | Patched | | Number of zones | 6.19-rc7 | | +------------------+----------+---------+ | 1 | 191 | 192 | | 2 | 101 | 128 | | 4 | 115 | 123 | | 8 | 90 | 120 | | 16 | 64 | 115 | | 32 | 58 | 105 | | 64 | 56 | 101 | | 128 | 55 | 99 | +------------------+----------+---------+ Tests using XFS shows that buffered write speed with 8 jobs writing files increases by 12% to 35% depending on the workload. +--------------------+ | Write BW (MB/s) | +------------------+----------+---------+ | Workload | Baseline | Patched | | | 6.19-rc7 | | +------------------+----------+---------+ | 256MiB file size | 212 | 238 | +------------------+----------+---------+ | 4MiB .. 128 MiB | 213 | 243 | | random file size | | | +------------------+----------+---------+ | 2MiB .. 8 MiB | 179 | 242 | | random file size | | | +------------------+----------+---------+ Performance gains are even more significant when using an HBA that limits the maximum size of commands to a small value, e.g. HBAs controlled with the mpi3mr driver limit commands to a maximum of 1 MiB. In such case, the write throughput gains are over 40%. +--------------------+ | Write BW (MB/s) | +------------------+----------+---------+ | Workload | Baseline | Patched | | | 6.19-rc7 | | +------------------+----------+---------+ | 256MiB file size | 175 | 245 | +------------------+----------+---------+ | 4MiB .. 128 MiB | 174 | 244 | | random file size | | | +------------------+----------+---------+ | 2MiB .. 8 MiB | 171 | 243 | | random file size | | | +------------------+----------+---------+ Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 + block/blk-sysfs.c | 35 ++++++++- block/blk-zoned.c | 190 +++++++++++++++++++++++++++++++++++++++++++------ include/linux/blkdev.h | 8 +++ 4 files changed, 212 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 28167c9baa55..047ec887456b 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -97,6 +97,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(NO_ELV_SWITCH), QUEUE_FLAG_NAME(QOS_ENABLED), QUEUE_FLAG_NAME(BIO_ISSUE_TIME), + QUEUE_FLAG_NAME(ZONED_QD1_WRITES), }; #undef QUEUE_FLAG_NAME diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 55a1bbfef7d4..ca8033e6d699 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -390,6 +390,36 @@ static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page) return queue_var_show(disk_nr_zones(disk), page); } +static ssize_t queue_zoned_qd1_writes_show(struct gendisk *disk, char *page) +{ + return queue_var_show(!!blk_queue_zoned_qd1_writes(disk->queue), + page); +} + +static ssize_t queue_zoned_qd1_writes_store(struct gendisk *disk, + const char *page, size_t count) +{ + struct request_queue *q = disk->queue; + unsigned long qd1_writes; + unsigned int memflags; + ssize_t ret; + + ret = queue_var_store(&qd1_writes, page, count); + if (ret < 0) + return ret; + + memflags = blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + if (qd1_writes) + blk_queue_flag_set(QUEUE_FLAG_ZONED_QD1_WRITES, q); + else + blk_queue_flag_clear(QUEUE_FLAG_ZONED_QD1_WRITES, q); + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q, memflags); + + return count; +} + static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page) { return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page); @@ -617,6 +647,7 @@ QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned"); +QUEUE_RW_ENTRY(queue_zoned_qd1_writes, "zoned_qd1_writes"); QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones"); QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones"); @@ -754,6 +785,7 @@ static struct attribute *queue_attrs[] = { &queue_nomerges_entry.attr, &queue_poll_entry.attr, &queue_poll_delay_entry.attr, + &queue_zoned_qd1_writes_entry.attr, NULL, }; @@ -786,7 +818,8 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, struct request_queue *q = disk->queue; if ((attr == &queue_max_open_zones_entry.attr || - attr == &queue_max_active_zones_entry.attr) && + attr == &queue_max_active_zones_entry.attr || + attr == &queue_zoned_qd1_writes_entry.attr) && !blk_queue_is_zoned(q)) return 0; diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 78810e726222..e1a23c8b676d 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include @@ -40,6 +42,8 @@ static const char *const zone_cond_name[] = { /* * Per-zone write plug. * @node: hlist_node structure for managing the plug using a hash table. + * @entry: list_head structure for listing the plug in the disk list of active + * zone write plugs. * @bio_list: The list of BIOs that are currently plugged. * @bio_work: Work struct to handle issuing of plugged BIOs * @rcu_head: RCU head to free zone write plugs with an RCU grace period. @@ -62,6 +66,7 @@ static const char *const zone_cond_name[] = { */ struct blk_zone_wplug { struct hlist_node node; + struct list_head entry; struct bio_list bio_list; struct work_struct bio_work; struct rcu_head rcu_head; @@ -623,7 +628,19 @@ static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug) } } -static void blk_zone_wplug_bio_work(struct work_struct *work); +static bool disk_zone_wplug_submit_bio(struct gendisk *disk, + struct blk_zone_wplug *zwplug); + +static void blk_zone_wplug_bio_work(struct work_struct *work) +{ + struct blk_zone_wplug *zwplug = + container_of(work, struct blk_zone_wplug, bio_work); + + disk_zone_wplug_submit_bio(zwplug->disk, zwplug); + + /* Drop the reference we took in disk_zone_wplug_schedule_work(). */ + disk_put_zone_wplug(zwplug); +} /* * Get a zone write plug for the zone containing @sector. @@ -658,6 +675,7 @@ again: zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector); bio_list_init(&zwplug->bio_list); INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); + INIT_LIST_HEAD(&zwplug->entry); zwplug->disk = disk; /* @@ -690,6 +708,7 @@ static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug, */ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) { + struct gendisk *disk = zwplug->disk; struct bio *bio; lockdep_assert_held(&zwplug->lock); @@ -703,6 +722,20 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) blk_zone_wplug_bio_io_error(zwplug, bio); zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + + /* + * If we are using the per disk zone write plugs worker thread, remove + * the zone write plug from the work list and drop the reference we + * took when the zone write plug was added to that list. + */ + if (blk_queue_zoned_qd1_writes(disk->queue)) { + spin_lock(&disk->zone_wplugs_list_lock); + if (!list_empty(&zwplug->entry)) { + list_del_init(&zwplug->entry); + disk_put_zone_wplug(zwplug); + } + spin_unlock(&disk->zone_wplugs_list_lock); + } } /* @@ -1137,8 +1170,8 @@ void blk_zone_mgmt_bio_endio(struct bio *bio) } } -static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, - struct blk_zone_wplug *zwplug) +static void disk_zone_wplug_schedule_work(struct gendisk *disk, + struct blk_zone_wplug *zwplug) { lockdep_assert_held(&zwplug->lock); @@ -1151,6 +1184,7 @@ static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, * and we also drop this reference if the work is already scheduled. */ WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); + WARN_ON_ONCE(blk_queue_zoned_qd1_writes(disk->queue)); refcount_inc(&zwplug->ref); if (!queue_work(disk->zone_wplugs_wq, &zwplug->bio_work)) disk_put_zone_wplug(zwplug); @@ -1190,6 +1224,22 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk, bio_list_add(&zwplug->bio_list, bio); trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no, bio->bi_iter.bi_sector, bio_sectors(bio)); + + /* + * If we are using the disk zone write plugs worker instead of the per + * zone write plug BIO work, add the zone write plug to the work list + * if it is not already there. Make sure to also get an extra reference + * on the zone write plug so that it does not go away until it is + * removed from the work list. + */ + if (blk_queue_zoned_qd1_writes(disk->queue)) { + spin_lock(&disk->zone_wplugs_list_lock); + if (list_empty(&zwplug->entry)) { + list_add_tail(&zwplug->entry, &disk->zone_wplugs_list); + refcount_inc(&zwplug->ref); + } + spin_unlock(&disk->zone_wplugs_list_lock); + } } /* @@ -1423,6 +1473,13 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) goto queue_bio; } + /* + * For rotational devices, we will use the gendisk zone write plugs + * work instead of the per zone write plug BIO work, so queue the BIO. + */ + if (blk_queue_zoned_qd1_writes(disk->queue)) + goto queue_bio; + /* If the zone is already plugged, add the BIO to the BIO plug list. */ if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) goto queue_bio; @@ -1445,7 +1502,10 @@ queue_bio: if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) { zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; - disk_zone_wplug_schedule_bio_work(disk, zwplug); + if (blk_queue_zoned_qd1_writes(disk->queue)) + wake_up_process(disk->zone_wplugs_worker); + else + disk_zone_wplug_schedule_work(disk, zwplug); } spin_unlock_irqrestore(&zwplug->lock, flags); @@ -1586,16 +1646,22 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk, spin_lock_irqsave(&zwplug->lock, flags); - /* Schedule submission of the next plugged BIO if we have one. */ - if (!bio_list_empty(&zwplug->bio_list)) { - disk_zone_wplug_schedule_bio_work(disk, zwplug); - spin_unlock_irqrestore(&zwplug->lock, flags); - return; - } + /* + * For rotational devices, signal the BIO completion to the zone write + * plug work. Otherwise, schedule submission of the next plugged BIO + * if we have one. + */ + if (bio_list_empty(&zwplug->bio_list)) + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + + if (blk_queue_zoned_qd1_writes(disk->queue)) + complete(&disk->zone_wplugs_worker_bio_done); + else if (!bio_list_empty(&zwplug->bio_list)) + disk_zone_wplug_schedule_work(disk, zwplug); - zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug)) disk_mark_zone_wplug_dead(zwplug); + spin_unlock_irqrestore(&zwplug->lock, flags); } @@ -1685,10 +1751,9 @@ void blk_zone_write_plug_finish_request(struct request *req) disk_put_zone_wplug(zwplug); } -static void blk_zone_wplug_bio_work(struct work_struct *work) +static bool disk_zone_wplug_submit_bio(struct gendisk *disk, + struct blk_zone_wplug *zwplug) { - struct blk_zone_wplug *zwplug = - container_of(work, struct blk_zone_wplug, bio_work); struct block_device *bdev; unsigned long flags; struct bio *bio; @@ -1704,7 +1769,7 @@ again: if (!bio) { zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; spin_unlock_irqrestore(&zwplug->lock, flags); - goto put_zwplug; + return false; } trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no, @@ -1718,14 +1783,15 @@ again: goto again; } - bdev = bio->bi_bdev; - /* * blk-mq devices will reuse the extra reference on the request queue * usage counter we took when the BIO was plugged, but the submission * path for BIO-based devices will not do that. So drop this extra * reference here. */ + if (blk_queue_zoned_qd1_writes(disk->queue)) + reinit_completion(&disk->zone_wplugs_worker_bio_done); + bdev = bio->bi_bdev; if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) { bdev->bd_disk->fops->submit_bio(bio); blk_queue_exit(bdev->bd_disk->queue); @@ -1733,14 +1799,78 @@ again: blk_mq_submit_bio(bio); } -put_zwplug: - /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */ - disk_put_zone_wplug(zwplug); + return true; +} + +static struct blk_zone_wplug *disk_get_zone_wplugs_work(struct gendisk *disk) +{ + struct blk_zone_wplug *zwplug; + + spin_lock_irq(&disk->zone_wplugs_list_lock); + zwplug = list_first_entry_or_null(&disk->zone_wplugs_list, + struct blk_zone_wplug, entry); + if (zwplug) + list_del_init(&zwplug->entry); + spin_unlock_irq(&disk->zone_wplugs_list_lock); + + return zwplug; +} + +static int disk_zone_wplugs_worker(void *data) +{ + struct gendisk *disk = data; + struct blk_zone_wplug *zwplug; + unsigned int noio_flag; + + noio_flag = memalloc_noio_save(); + set_user_nice(current, MIN_NICE); + set_freezable(); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); + + zwplug = disk_get_zone_wplugs_work(disk); + if (zwplug) { + /* + * Process all BIOs of this zone write plug and then + * drop the reference we took when adding the zone write + * plug to the active list. + */ + set_current_state(TASK_RUNNING); + while (disk_zone_wplug_submit_bio(disk, zwplug)) + blk_wait_io(&disk->zone_wplugs_worker_bio_done); + disk_put_zone_wplug(zwplug); + continue; + } + + /* + * Only sleep if nothing sets the state to running. Else check + * for zone write plugs work again as a newly submitted BIO + * might have added a zone write plug to the work list. + */ + if (get_current_state() == TASK_RUNNING) { + try_to_freeze(); + } else { + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); + break; + } + schedule(); + } + } + + WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list)); + memalloc_noio_restore(noio_flag); + + return 0; } void disk_init_zone_resources(struct gendisk *disk) { spin_lock_init(&disk->zone_wplugs_hash_lock); + spin_lock_init(&disk->zone_wplugs_list_lock); + INIT_LIST_HEAD(&disk->zone_wplugs_list); + init_completion(&disk->zone_wplugs_worker_bio_done); } /* @@ -1756,6 +1886,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk, unsigned int pool_size) { unsigned int i; + int ret = -ENOMEM; atomic_set(&disk->nr_zone_wplugs, 0); disk->zone_wplugs_hash_bits = @@ -1781,8 +1912,21 @@ static int disk_alloc_zone_resources(struct gendisk *disk, if (!disk->zone_wplugs_wq) goto destroy_pool; + disk->zone_wplugs_worker = + kthread_create(disk_zone_wplugs_worker, disk, + "%s_zwplugs_worker", disk->disk_name); + if (IS_ERR(disk->zone_wplugs_worker)) { + ret = PTR_ERR(disk->zone_wplugs_worker); + disk->zone_wplugs_worker = NULL; + goto destroy_wq; + } + wake_up_process(disk->zone_wplugs_worker); + return 0; +destroy_wq: + destroy_workqueue(disk->zone_wplugs_wq); + disk->zone_wplugs_wq = NULL; destroy_pool: mempool_destroy(disk->zone_wplugs_pool); disk->zone_wplugs_pool = NULL; @@ -1790,7 +1934,7 @@ free_hash: kfree(disk->zone_wplugs_hash); disk->zone_wplugs_hash = NULL; disk->zone_wplugs_hash_bits = 0; - return -ENOMEM; + return ret; } static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) @@ -1840,6 +1984,10 @@ static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond) void disk_free_zone_resources(struct gendisk *disk) { + if (disk->zone_wplugs_worker) + kthread_stop(disk->zone_wplugs_worker); + WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list)); + if (disk->zone_wplugs_wq) { destroy_workqueue(disk->zone_wplugs_wq); disk->zone_wplugs_wq = NULL; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a49a1e38c6e7..ef6457487d23 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -204,6 +205,10 @@ struct gendisk { struct mempool *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; + spinlock_t zone_wplugs_list_lock; + struct list_head zone_wplugs_list; + struct task_struct *zone_wplugs_worker; + struct completion zone_wplugs_worker_bio_done; #endif /* CONFIG_BLK_DEV_ZONED */ #if IS_ENABLED(CONFIG_CDROM) @@ -668,6 +673,7 @@ enum { QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */ QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */ QUEUE_FLAG_BIO_ISSUE_TIME, /* record bio->issue_time_ns */ + QUEUE_FLAG_ZONED_QD1_WRITES, /* Limit zoned devices writes to QD=1 */ QUEUE_FLAG_MAX }; @@ -707,6 +713,8 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); test_bit(QUEUE_FLAG_DISABLE_WBT_DEF, &(q)->queue_flags) #define blk_queue_no_elv_switch(q) \ test_bit(QUEUE_FLAG_NO_ELV_SWITCH, &(q)->queue_flags) +#define blk_queue_zoned_qd1_writes(q) \ + test_bit(QUEUE_FLAG_ZONED_QD1_WRITES, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); -- cgit v1.2.3 From ecd92cfec5349876d6a80f8188ea98c5920094b6 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 26 Feb 2026 16:54:48 +0900 Subject: block: remove bdev_nonrot() bdev_nonrot() is simply the negative return value of bdev_rot(). So replace all call sites of bdev_nonrot() with calls to bdev_rot() and remove bdev_nonrot(). Signed-off-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Paul Menzel Signed-off-by: Jens Axboe --- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- drivers/md/raid5.c | 2 +- drivers/target/target_core_file.c | 2 +- drivers/target/target_core_iblock.c | 2 +- fs/btrfs/volumes.c | 4 ++-- fs/ext4/mballoc-test.c | 2 +- fs/ext4/mballoc.c | 2 +- include/linux/blkdev.h | 5 ----- mm/swapfile.c | 2 +- 10 files changed, 10 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 181400e147c0..cda6af0712b9 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1878,7 +1878,7 @@ static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk, if (info->rdev) return false; - if (bdev_nonrot(rdev->bdev)) { + if (!bdev_rot(rdev->bdev)) { set_bit(Nonrot, &rdev->flags); WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0653b5d8545a..cfbd345805ca 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -806,7 +806,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, if (!do_balance) break; - nonrot = bdev_nonrot(rdev->bdev); + nonrot = !bdev_rot(rdev->bdev); has_nonrot_disk |= nonrot; pending = atomic_read(&rdev->nr_pending); if (min_pending > pending && nonrot) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a8e8d431071b..ba9d6d05b089 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7541,7 +7541,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) rdev_for_each(rdev, mddev) { if (test_bit(Journal, &rdev->flags)) continue; - if (bdev_nonrot(rdev->bdev)) { + if (!bdev_rot(rdev->bdev)) { conf->batch_bio_dispatch = false; break; } diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 3ae1f7137d9d..d6e3e5214652 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -173,7 +173,7 @@ static int fd_configure_device(struct se_device *dev) */ dev->dev_attrib.max_write_same_len = 0xFFFF; - if (bdev_nonrot(bdev)) + if (!bdev_rot(bdev)) dev->dev_attrib.is_nonrot = 1; } else { if (!(fd_dev->fbd_flags & FBDF_HAS_SIZE)) { diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 3c92f94497b4..1087d1d17c36 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -148,7 +148,7 @@ static int iblock_configure_device(struct se_device *dev) else dev->dev_attrib.max_write_same_len = 0xFFFF; - if (bdev_nonrot(bd)) + if (!bdev_rot(bd)) dev->dev_attrib.is_nonrot = 1; target_configure_write_atomic_from_bdev(&dev->dev_attrib, bd); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 648bb09fc416..353c9caa8ab9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -694,7 +694,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - if (!bdev_nonrot(file_bdev(bdev_file))) + if (bdev_rot(file_bdev(bdev_file))) fs_devices->rotating = true; if (bdev_max_discard_sectors(file_bdev(bdev_file))) @@ -2919,7 +2919,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path atomic64_add(device->total_bytes, &fs_info->free_chunk_space); - if (!bdev_nonrot(device->bdev)) + if (bdev_rot(device->bdev)) fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index 9fbdf6a09489..b9f22e3a8d5c 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -72,7 +72,7 @@ static int mbt_mb_init(struct super_block *sb) ext4_fsblk_t block; int ret; - /* needed by ext4_mb_init->bdev_nonrot(sb->s_bdev) */ + /* needed by ext4_mb_init->bdev_rot(sb->s_bdev) */ sb->s_bdev = kzalloc_obj(*sb->s_bdev); if (sb->s_bdev == NULL) return -ENOMEM; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 20e9fdaf4301..8a4dfe19878c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3836,7 +3836,7 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&lg->lg_prealloc_lock); } - if (bdev_nonrot(sb->s_bdev)) + if (!bdev_rot(sb->s_bdev)) sbi->s_mb_max_linear_groups = 0; else sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ef6457487d23..8d93d8e356d8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1475,11 +1475,6 @@ static inline bool bdev_rot(struct block_device *bdev) return blk_queue_rot(bdev_get_queue(bdev)); } -static inline bool bdev_nonrot(struct block_device *bdev) -{ - return !bdev_rot(bdev); -} - static inline bool bdev_synchronous(struct block_device *bdev) { return bdev->bd_disk->queue->limits.features & BLK_FEAT_SYNCHRONOUS; diff --git a/mm/swapfile.c b/mm/swapfile.c index 94af29d1de88..60e21414624b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3460,7 +3460,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (si->bdev && bdev_synchronous(si->bdev)) si->flags |= SWP_SYNCHRONOUS_IO; - if (si->bdev && bdev_nonrot(si->bdev)) { + if (si->bdev && !bdev_rot(si->bdev)) { si->flags |= SWP_SOLIDSTATE; } else { atomic_inc(&nr_rotate_swap); -- cgit v1.2.3 From 588e7c048d7d2bfcbe7776ee0888ee248adf01d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:09 -0800 Subject: ext4, fscrypt: merge fscrypt_mergeable_bio_bh into io_submit_need_new_bio ext4 already has the inode and folio and can't have a NULL folio->mapping in this path. Open code fscrypt_mergeable_bio_bh in io_submit_need_new_bio based on these simplifying assumptions. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-5-hch@lst.de Signed-off-by: Eric Biggers --- fs/crypto/inline_crypt.c | 23 ----------------------- fs/ext4/page-io.c | 7 +++++-- include/linux/fscrypt.h | 9 --------- 3 files changed, 5 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index c0852b920dbc..0da53956a9b1 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -406,29 +406,6 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, } EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio); -/** - * fscrypt_mergeable_bio_bh() - test whether data can be added to a bio - * @bio: the bio being built up - * @next_bh: the next buffer_head for which I/O will be submitted - * - * Same as fscrypt_mergeable_bio(), except this takes a buffer_head instead of - * an inode and block number directly. - * - * Return: true iff the I/O is mergeable - */ -bool fscrypt_mergeable_bio_bh(struct bio *bio, - const struct buffer_head *next_bh) -{ - const struct inode *inode; - u64 next_lblk; - - if (!bh_get_inode_and_lblk_num(next_bh, &inode, &next_lblk)) - return !bio->bi_crypt_context; - - return fscrypt_mergeable_bio(bio, inode, next_lblk); -} -EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh); - /** * fscrypt_dio_supported() - check whether DIO (direct I/O) is supported on an * inode, as far as encryption is concerned diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 58cdbd836fd6..293314d7f236 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -440,11 +440,14 @@ static void io_submit_init_bio(struct ext4_io_submit *io, } static bool io_submit_need_new_bio(struct ext4_io_submit *io, + struct inode *inode, + struct folio *folio, struct buffer_head *bh) { if (bh->b_blocknr != io->io_next_block) return true; - if (!fscrypt_mergeable_bio_bh(io->io_bio, bh)) + if (!fscrypt_mergeable_bio(io->io_bio, inode, + (folio_pos(folio) + bh_offset(bh)) >> inode->i_blkbits)) return true; return false; } @@ -455,7 +458,7 @@ static void io_submit_add_bh(struct ext4_io_submit *io, struct folio *io_folio, struct buffer_head *bh) { - if (io->io_bio && io_submit_need_new_bio(io, bh)) { + if (io->io_bio && io_submit_need_new_bio(io, inode, folio, bh)) { submit_and_retry: ext4_io_submit(io); } diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 516aba5b858b..6af3c1907adc 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -876,9 +876,6 @@ void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio, bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk); -bool fscrypt_mergeable_bio_bh(struct bio *bio, - const struct buffer_head *next_bh); - bool fscrypt_dio_supported(struct inode *inode); u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks); @@ -906,12 +903,6 @@ static inline bool fscrypt_mergeable_bio(struct bio *bio, return true; } -static inline bool fscrypt_mergeable_bio_bh(struct bio *bio, - const struct buffer_head *next_bh) -{ - return true; -} - static inline bool fscrypt_dio_supported(struct inode *inode) { return !fscrypt_needs_contents_encryption(inode); -- cgit v1.2.3 From a18b1ab81654b06e7ff402e5d0b85249e9504bcb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:10 -0800 Subject: fscrypt: move fscrypt_set_bio_crypt_ctx_bh to buffer.c fscrypt_set_bio_crypt_ctx_bh is only used by submit_bh_wbc now. Move it there and merge bh_get_inode_and_lblk_num into it. Note that this does not add ifdefs for fscrypt as the compiler will optimize away the dead code if it is not built in. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-6-hch@lst.de Signed-off-by: Eric Biggers --- fs/buffer.c | 21 ++++++++++++++++++++- fs/crypto/inline_crypt.c | 45 --------------------------------------------- include/linux/fscrypt.h | 9 --------- 3 files changed, 20 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index 22b43642ba57..b6504ec7fa4c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2774,6 +2774,24 @@ static void end_bio_bh_io_sync(struct bio *bio) bio_put(bio); } +static void buffer_set_crypto_ctx(struct bio *bio, const struct buffer_head *bh, + gfp_t gfp_mask) +{ + const struct address_space *mapping = folio_mapping(bh->b_folio); + const struct inode *inode; + u64 lblk; + + /* + * The ext4 journal (jbd2) can submit a buffer_head it directly created + * for a non-pagecache page. fscrypt doesn't care about these. + */ + if (!mapping) + return; + inode = mapping->host; + lblk = (folio_pos(bh->b_folio) + bh_offset(bh)) >> inode->i_blkbits; + fscrypt_set_bio_crypt_ctx(bio, inode, lblk, gfp_mask); +} + static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, enum rw_hint write_hint, struct writeback_control *wbc) @@ -2800,7 +2818,8 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO); - fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); + if (IS_ENABLED(CONFIG_FS_ENCRYPTION)) + buffer_set_crypto_ctx(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_write_hint = write_hint; diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 0da53956a9b1..702d13d138aa 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -314,51 +314,6 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, } EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx); -/* Extract the inode and logical block number from a buffer_head. */ -static bool bh_get_inode_and_lblk_num(const struct buffer_head *bh, - const struct inode **inode_ret, - u64 *lblk_num_ret) -{ - struct folio *folio = bh->b_folio; - const struct address_space *mapping; - const struct inode *inode; - - /* - * The ext4 journal (jbd2) can submit a buffer_head it directly created - * for a non-pagecache page. fscrypt doesn't care about these. - */ - mapping = folio_mapping(folio); - if (!mapping) - return false; - inode = mapping->host; - - *inode_ret = inode; - *lblk_num_ret = (folio_pos(folio) + bh_offset(bh)) >> inode->i_blkbits; - return true; -} - -/** - * fscrypt_set_bio_crypt_ctx_bh() - prepare a file contents bio for inline - * crypto - * @bio: a bio which will eventually be submitted to the file - * @first_bh: the first buffer_head for which I/O will be submitted - * @gfp_mask: memory allocation flags - * - * Same as fscrypt_set_bio_crypt_ctx(), except this takes a buffer_head instead - * of an inode and block number directly. - */ -void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio, - const struct buffer_head *first_bh, - gfp_t gfp_mask) -{ - const struct inode *inode; - u64 first_lblk; - - if (bh_get_inode_and_lblk_num(first_bh, &inode, &first_lblk)) - fscrypt_set_bio_crypt_ctx(bio, inode, first_lblk, gfp_mask); -} -EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx_bh); - /** * fscrypt_mergeable_bio() - test whether data can be added to a bio * @bio: the bio being built up diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 6af3c1907adc..26561b7994e0 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -869,10 +869,6 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask); -void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio, - const struct buffer_head *first_bh, - gfp_t gfp_mask); - bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk); @@ -891,11 +887,6 @@ static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask) { } -static inline void fscrypt_set_bio_crypt_ctx_bh( - struct bio *bio, - const struct buffer_head *first_bh, - gfp_t gfp_mask) { } - static inline bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, u64 next_lblk) -- cgit v1.2.3 From 22be86a23c5956254b752e4e98f0ef2799565a41 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:12 -0800 Subject: fscrypt: pass a byte offset to fscrypt_mergeable_bio Logical offsets into an inode are usually expressed as bytes in the VFS. Switch fscrypt_mergeable_bio to that convention. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-8-hch@lst.de Signed-off-by: Eric Biggers --- fs/crypto/bio.c | 3 ++- fs/crypto/inline_crypt.c | 6 +++--- fs/ext4/page-io.c | 2 +- fs/ext4/readpage.c | 3 ++- fs/f2fs/data.c | 3 ++- include/linux/fscrypt.h | 4 ++-- 6 files changed, 12 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 6da683ea69dc..0a701d4a17ef 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -100,7 +100,8 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, len -= blocks_this_page; lblk += blocks_this_page; sector += (bytes_this_page >> SECTOR_SHIFT); - if (!len || !fscrypt_mergeable_bio(bio, inode, lblk)) + if (!len || !fscrypt_mergeable_bio(bio, inode, + (loff_t)lblk << blockbits)) break; } diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 5279565e9846..b0954d17904b 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -316,7 +316,7 @@ EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx); * fscrypt_mergeable_bio() - test whether data can be added to a bio * @bio: the bio being built up * @inode: the inode for the next part of the I/O - * @next_lblk: the next file logical block number in the I/O + * @pos: the next file position (in bytes) in the I/O * * When building a bio which may contain data which should undergo inline * encryption (or decryption) via fscrypt, filesystems should call this function @@ -334,7 +334,7 @@ EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx); * Return: true iff the I/O is mergeable */ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, - u64 next_lblk) + loff_t pos) { const struct bio_crypt_ctx *bc = bio->bi_crypt_context; const struct fscrypt_inode_info *ci; @@ -354,7 +354,7 @@ bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, if (bc->bc_key != ci->ci_enc_key.blk_key) return false; - fscrypt_generate_dun(ci, next_lblk << inode->i_blkbits, next_dun); + fscrypt_generate_dun(ci, pos, next_dun); return bio_crypt_dun_is_contiguous(bc, bio->bi_iter.bi_size, next_dun); } EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 293314d7f236..50f507bab82c 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -447,7 +447,7 @@ static bool io_submit_need_new_bio(struct ext4_io_submit *io, if (bh->b_blocknr != io->io_next_block) return true; if (!fscrypt_mergeable_bio(io->io_bio, inode, - (folio_pos(folio) + bh_offset(bh)) >> inode->i_blkbits)) + folio_pos(folio) + bh_offset(bh))) return true; return false; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 830f3b8a321f..ba7cfddd6038 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -342,7 +342,8 @@ static int ext4_mpage_readpages(struct inode *inode, struct fsverity_info *vi, * BIO off first? */ if (bio && (last_block_in_bio != first_block - 1 || - !fscrypt_mergeable_bio(bio, inode, next_block))) { + !fscrypt_mergeable_bio(bio, inode, + (loff_t)next_block << blkbits))) { submit_and_realloc: blk_crypto_submit_bio(bio); bio = NULL; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 338df7a2aea6..dca273fedfde 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -541,7 +541,8 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode, if (fio && fio->encrypted_page) return !bio_has_crypt_ctx(bio); - return fscrypt_mergeable_bio(bio, inode, next_idx); + return fscrypt_mergeable_bio(bio, inode, + (loff_t)next_idx << inode->i_blkbits); } void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio, diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 26561b7994e0..98fb14660d40 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -870,7 +870,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, gfp_t gfp_mask); bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, - u64 next_lblk); + loff_t pos); bool fscrypt_dio_supported(struct inode *inode); @@ -889,7 +889,7 @@ static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio, static inline bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, - u64 next_lblk) + loff_t pos) { return true; } -- cgit v1.2.3 From 3c7eaa775d8e008135646bd4b7aa7db7c5e40a0e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:13 -0800 Subject: fscrypt: pass a byte offset to fscrypt_set_bio_crypt_ctx Logical offsets into an inode are usually expressed as bytes in the VFS. Switch fscrypt_set_bio_crypt_ctx to that convention. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-9-hch@lst.de Signed-off-by: Eric Biggers --- fs/buffer.c | 7 ++----- fs/crypto/bio.c | 8 ++++---- fs/crypto/inline_crypt.c | 6 +++--- fs/ext4/page-io.c | 5 ++--- fs/ext4/readpage.c | 4 ++-- fs/f2fs/data.c | 4 +++- fs/iomap/direct-io.c | 6 ++---- include/linux/fscrypt.h | 7 +++---- 8 files changed, 21 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index b6504ec7fa4c..1c8ee5a59f88 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2778,8 +2778,6 @@ static void buffer_set_crypto_ctx(struct bio *bio, const struct buffer_head *bh, gfp_t gfp_mask) { const struct address_space *mapping = folio_mapping(bh->b_folio); - const struct inode *inode; - u64 lblk; /* * The ext4 journal (jbd2) can submit a buffer_head it directly created @@ -2787,9 +2785,8 @@ static void buffer_set_crypto_ctx(struct bio *bio, const struct buffer_head *bh, */ if (!mapping) return; - inode = mapping->host; - lblk = (folio_pos(bh->b_folio) + bh_offset(bh)) >> inode->i_blkbits; - fscrypt_set_bio_crypt_ctx(bio, inode, lblk, gfp_mask); + fscrypt_set_bio_crypt_ctx(bio, mapping->host, + folio_pos(bh->b_folio) + bh_offset(bh), gfp_mask); } static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 0a701d4a17ef..e7fb2fdd9728 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -75,6 +75,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, { const unsigned int blockbits = inode->i_blkbits; const unsigned int blocks_per_page = 1 << (PAGE_SHIFT - blockbits); + loff_t pos = (loff_t)lblk << blockbits; struct fscrypt_zero_done done = { .pending = ATOMIC_INIT(1), .done = COMPLETION_INITIALIZER_ONSTACK(done.done), @@ -89,7 +90,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, bio->bi_iter.bi_sector = sector; bio->bi_private = &done; bio->bi_end_io = fscrypt_zeroout_range_end_io; - fscrypt_set_bio_crypt_ctx(bio, inode, lblk, GFP_NOFS); + fscrypt_set_bio_crypt_ctx(bio, inode, pos, GFP_NOFS); for (n = 0; n < BIO_MAX_VECS; n++) { unsigned int blocks_this_page = @@ -98,10 +99,9 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, __bio_add_page(bio, ZERO_PAGE(0), bytes_this_page, 0); len -= blocks_this_page; - lblk += blocks_this_page; + pos += bytes_this_page; sector += (bytes_this_page >> SECTOR_SHIFT); - if (!len || !fscrypt_mergeable_bio(bio, inode, - (loff_t)lblk << blockbits)) + if (!len || !fscrypt_mergeable_bio(bio, inode, pos)) break; } diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index b0954d17904b..37d42d357925 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -285,7 +285,7 @@ static void fscrypt_generate_dun(const struct fscrypt_inode_info *ci, * fscrypt_set_bio_crypt_ctx() - prepare a file contents bio for inline crypto * @bio: a bio which will eventually be submitted to the file * @inode: the file's inode - * @first_lblk: the first file logical block number in the I/O + * @pos: the first file position (in bytes) in the I/O * @gfp_mask: memory allocation flags - these must be a waiting mask so that * bio_crypt_set_ctx can't fail. * @@ -298,7 +298,7 @@ static void fscrypt_generate_dun(const struct fscrypt_inode_info *ci, * The encryption context will be freed automatically when the bio is freed. */ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, - u64 first_lblk, gfp_t gfp_mask) + loff_t pos, gfp_t gfp_mask) { const struct fscrypt_inode_info *ci; u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; @@ -307,7 +307,7 @@ void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, return; ci = fscrypt_get_inode_info_raw(inode); - fscrypt_generate_dun(ci, first_lblk << inode->i_blkbits, dun); + fscrypt_generate_dun(ci, pos, dun); bio_crypt_set_ctx(bio, ci->ci_enc_key.blk_key, dun, gfp_mask); } EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 50f507bab82c..181cda58d387 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -427,9 +427,8 @@ static void io_submit_init_bio(struct ext4_io_submit *io, * __GFP_DIRECT_RECLAIM is set, see comments for bio_alloc_bioset(). */ bio = bio_alloc(bh->b_bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOIO); - fscrypt_set_bio_crypt_ctx(bio, inode, - (folio_pos(folio) + bh_offset(bh)) >> inode->i_blkbits, - GFP_NOIO); + fscrypt_set_bio_crypt_ctx(bio, inode, folio_pos(folio) + bh_offset(bh), + GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index ba7cfddd6038..fbfa4d830d9a 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -355,8 +355,8 @@ static int ext4_mpage_readpages(struct inode *inode, struct fsverity_info *vi, */ bio = bio_alloc(bdev, bio_max_segs(nr_pages), REQ_OP_READ, GFP_KERNEL); - fscrypt_set_bio_crypt_ctx(bio, inode, next_block, - GFP_KERNEL); + fscrypt_set_bio_crypt_ctx(bio, inode, + (loff_t)next_block << blkbits, GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, vi); bio->bi_iter.bi_sector = first_block << (blkbits - 9); bio->bi_end_io = mpage_end_io; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index dca273fedfde..07b4ed6bb0cc 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -527,7 +527,9 @@ static void f2fs_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, * read/write raw data without encryption. */ if (!fio || !fio->encrypted_page) - fscrypt_set_bio_crypt_ctx(bio, inode, first_idx, gfp_mask); + fscrypt_set_bio_crypt_ctx(bio, inode, + (loff_t)first_idx << inode->i_blkbits, + gfp_mask); } static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode, diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index e911daedff65..9da5d862ef9e 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -311,8 +311,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, bio = iomap_dio_alloc_bio(iter, dio, nr_vecs, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); - fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, - GFP_KERNEL); + fscrypt_set_bio_crypt_ctx(bio, inode, pos, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; @@ -342,8 +341,7 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, nr_vecs = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); bio = iomap_dio_alloc_bio(iter, dio, nr_vecs, op); - fscrypt_set_bio_crypt_ctx(bio, iter->inode, - pos >> iter->inode->i_blkbits, GFP_KERNEL); + fscrypt_set_bio_crypt_ctx(bio, iter->inode, pos, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); bio->bi_write_hint = iter->inode->i_write_hint; bio->bi_ioprio = dio->iocb->ki_ioprio; diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 98fb14660d40..90f75fe0e1c9 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -865,9 +865,8 @@ static inline void fscrypt_set_ops(struct super_block *sb, bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode); -void fscrypt_set_bio_crypt_ctx(struct bio *bio, - const struct inode *inode, u64 first_lblk, - gfp_t gfp_mask); +void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, + loff_t pos, gfp_t gfp_mask); bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, loff_t pos); @@ -885,7 +884,7 @@ static inline bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, - u64 first_lblk, gfp_t gfp_mask) { } + loff_t pos, gfp_t gfp_mask) { } static inline bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode, -- cgit v1.2.3 From cd7db2e7dfeef99c901156f58ab4a38256b0c3f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:16 -0800 Subject: fscrypt: pass a byte offset to fscrypt_zeroout_range Logical offsets into an inode are usually expressed as bytes in the VFS. Switch fscrypt_zeroout_range to that convention. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-12-hch@lst.de Signed-off-by: Eric Biggers --- fs/crypto/bio.c | 7 +++---- fs/ext4/inode.c | 3 ++- fs/f2fs/file.c | 4 +++- include/linux/fscrypt.h | 4 ++-- 4 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 303b5acc66a9..a07ac8dcf851 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -113,7 +113,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, /** * fscrypt_zeroout_range() - zero out a range of blocks in an encrypted file * @inode: the file's inode - * @lblk: the first file logical block to zero out + * @pos: the first file position (in bytes) to zero out * @pblk: the first filesystem physical block to zero out * @len: number of blocks to zero out * @@ -127,7 +127,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, * * Return: 0 on success; -errno on failure. */ -int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, +int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, sector_t pblk, unsigned int len) { const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); @@ -135,9 +135,8 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, const unsigned int du_size = 1U << du_bits; const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits; const unsigned int du_per_page = 1U << du_per_page_bits; - u64 du_index = (u64)lblk << (inode->i_blkbits - du_bits); + u64 du_index = pos >> du_bits; u64 du_remaining = (u64)len << (inode->i_blkbits - du_bits); - loff_t pos = (loff_t)lblk << inode->i_blkbits; sector_t sector = pblk << (inode->i_blkbits - SECTOR_SHIFT); struct page *pages[16]; /* write up to 16 pages at a time */ unsigned int nr_pages; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 396dc3a5d16b..945613c95ffa 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -405,7 +405,8 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, KUNIT_STATIC_STUB_REDIRECT(ext4_issue_zeroout, inode, lblk, pblk, len); if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) - return fscrypt_zeroout_range(inode, lblk, pblk, len); + return fscrypt_zeroout_range(inode, + (loff_t)lblk << inode->i_blkbits, pblk, len); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); if (ret > 0) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c8a2f17a8f11..239c2666ceb5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4162,7 +4162,9 @@ static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode, if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT)) { if (IS_ENCRYPTED(inode)) - ret = fscrypt_zeroout_range(inode, off, block, len); + ret = fscrypt_zeroout_range(inode, + (loff_t)off << inode->i_blkbits, block, + len); else ret = blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_NOFS, 0); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 90f75fe0e1c9..9fc15e1fbe57 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -450,7 +450,7 @@ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); /* bio.c */ bool fscrypt_decrypt_bio(struct bio *bio); -int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, +int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, sector_t pblk, unsigned int len); /* hooks.c */ @@ -755,7 +755,7 @@ static inline bool fscrypt_decrypt_bio(struct bio *bio) return true; } -static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, +static inline int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, sector_t pblk, unsigned int len) { return -EOPNOTSUPP; -- cgit v1.2.3 From fb87ab4ad3d0df2397648e5ce2384de26463c183 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:17 -0800 Subject: fscrypt: pass a byte length to fscrypt_zeroout_range Range lengths are usually expressed as bytes in the VFS, switch fscrypt_zeroout_range to this convention. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-13-hch@lst.de Signed-off-by: Eric Biggers --- fs/crypto/bio.c | 11 ++++++----- fs/ext4/inode.c | 3 ++- fs/f2fs/file.c | 2 +- include/linux/fscrypt.h | 6 +++--- 4 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index a07ac8dcf851..9872408f4f52 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -115,12 +115,13 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, * @inode: the file's inode * @pos: the first file position (in bytes) to zero out * @pblk: the first filesystem physical block to zero out - * @len: number of blocks to zero out + * @len: bytes to zero out * * Zero out filesystem blocks in an encrypted regular file on-disk, i.e. write * ciphertext blocks which decrypt to the all-zeroes block. The blocks must be * both logically and physically contiguous. It's also assumed that the - * filesystem only uses a single block device, ->s_bdev. + * filesystem only uses a single block device, ->s_bdev. @len must be a + * multiple of the file system logical block size. * * Note that since each block uses a different IV, this involves writing a * different ciphertext to each block; we can't simply reuse the same one. @@ -128,7 +129,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, * Return: 0 on success; -errno on failure. */ int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, - sector_t pblk, unsigned int len) + sector_t pblk, u64 len) { const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; @@ -136,7 +137,7 @@ int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits; const unsigned int du_per_page = 1U << du_per_page_bits; u64 du_index = pos >> du_bits; - u64 du_remaining = (u64)len << (inode->i_blkbits - du_bits); + u64 du_remaining = len >> du_bits; sector_t sector = pblk << (inode->i_blkbits - SECTOR_SHIFT); struct page *pages[16]; /* write up to 16 pages at a time */ unsigned int nr_pages; @@ -150,7 +151,7 @@ int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, if (fscrypt_inode_uses_inline_crypto(inode)) return fscrypt_zeroout_range_inline_crypt(inode, pos, sector, - (u64)len << inode->i_blkbits); + len); BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS); nr_pages = min_t(u64, ARRAY_SIZE(pages), diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 945613c95ffa..8ef61198e14c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -406,7 +406,8 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, - (loff_t)lblk << inode->i_blkbits, pblk, len); + (loff_t)lblk << inode->i_blkbits, pblk, + (u64)len << inode->i_blkbits); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); if (ret > 0) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 239c2666ceb5..8785f7c13657 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4164,7 +4164,7 @@ static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode, if (IS_ENCRYPTED(inode)) ret = fscrypt_zeroout_range(inode, (loff_t)off << inode->i_blkbits, block, - len); + (u64)len << inode->i_blkbits); else ret = blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_NOFS, 0); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 9fc15e1fbe57..90ac62fda926 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -450,8 +450,8 @@ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); /* bio.c */ bool fscrypt_decrypt_bio(struct bio *bio); -int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, - sector_t pblk, unsigned int len); +int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, sector_t pblk, + u64 len); /* hooks.c */ int fscrypt_file_open(struct inode *inode, struct file *filp); @@ -756,7 +756,7 @@ static inline bool fscrypt_decrypt_bio(struct bio *bio) } static inline int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, - sector_t pblk, unsigned int len) + sector_t pblk, u64 len) { return -EOPNOTSUPP; } -- cgit v1.2.3 From 5ca1a1f017ea0f0e0bcb6ec52064735f2ac1c393 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Mar 2026 06:18:18 -0800 Subject: fscrypt: pass a real sector_t to fscrypt_zeroout_range While the pblk argument to fscrypt_zeroout_range is declared as a sector_t, it actually is interpreted as a logical block size unit, which is highly unusual. Switch to passing the 512 byte units that sector_t is defined for. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20260302141922.370070-14-hch@lst.de Signed-off-by: Eric Biggers --- fs/crypto/bio.c | 5 ++--- fs/ext4/inode.c | 3 ++- fs/f2fs/file.c | 2 +- include/linux/fscrypt.h | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 9872408f4f52..d07740680602 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -114,7 +114,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, * fscrypt_zeroout_range() - zero out a range of blocks in an encrypted file * @inode: the file's inode * @pos: the first file position (in bytes) to zero out - * @pblk: the first filesystem physical block to zero out + * @sector: the first sector to zero out * @len: bytes to zero out * * Zero out filesystem blocks in an encrypted regular file on-disk, i.e. write @@ -129,7 +129,7 @@ static int fscrypt_zeroout_range_inline_crypt(const struct inode *inode, * Return: 0 on success; -errno on failure. */ int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, - sector_t pblk, u64 len) + sector_t sector, u64 len) { const struct fscrypt_inode_info *ci = fscrypt_get_inode_info_raw(inode); const unsigned int du_bits = ci->ci_data_unit_bits; @@ -138,7 +138,6 @@ int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, const unsigned int du_per_page = 1U << du_per_page_bits; u64 du_index = pos >> du_bits; u64 du_remaining = len >> du_bits; - sector_t sector = pblk << (inode->i_blkbits - SECTOR_SHIFT); struct page *pages[16]; /* write up to 16 pages at a time */ unsigned int nr_pages; unsigned int i; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8ef61198e14c..fe258ffd4840 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -406,7 +406,8 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, - (loff_t)lblk << inode->i_blkbits, pblk, + (loff_t)lblk << inode->i_blkbits, + pblk << (inode->i_blkbits - SECTOR_SHIFT), (u64)len << inode->i_blkbits); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8785f7c13657..a264771cfbc2 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4163,7 +4163,7 @@ static int f2fs_secure_erase(struct block_device *bdev, struct inode *inode, if (!ret && (flags & F2FS_TRIM_FILE_ZEROOUT)) { if (IS_ENCRYPTED(inode)) ret = fscrypt_zeroout_range(inode, - (loff_t)off << inode->i_blkbits, block, + (loff_t)off << inode->i_blkbits, sector, (u64)len << inode->i_blkbits); else ret = blkdev_issue_zeroout(bdev, sector, nr_sects, diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 90ac62fda926..54712ec61ffb 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -450,8 +450,8 @@ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); /* bio.c */ bool fscrypt_decrypt_bio(struct bio *bio); -int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, sector_t pblk, - u64 len); +int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, + sector_t sector, u64 len); /* hooks.c */ int fscrypt_file_open(struct inode *inode, struct file *filp); @@ -756,7 +756,7 @@ static inline bool fscrypt_decrypt_bio(struct bio *bio) } static inline int fscrypt_zeroout_range(const struct inode *inode, loff_t pos, - sector_t pblk, u64 len) + sector_t sector, u64 len) { return -EOPNOTSUPP; } -- cgit v1.2.3 From 102c8b26b54e363f85c4c86099ca049a0a76bb58 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 17 Feb 2026 08:08:35 -0800 Subject: PCI: Allow all bus devices to use the same slot A PCIe hotplug slot applies to the entire secondary bus. Thus, pciehp only allocates a single hotplug_slot for the bridge to that bus. The existing PCI slot, though, would only match to functions on device 0, meaning any devices beyond that, e.g., ARI functions, are not matched to any slot even though they share it. A slot reset will break all the missing devices because the handling skips them. For example, ARI devices with more than 8 functions fail because their state is not properly handled, nor is the attached driver notified of the reset. In the best case, the device will appear unresponsive to the driver, resulting in unexpected errors. A worse possibility may panic the kernel if in-flight transactions trigger hardware reported errors like this real observation: vfio-pci 0000:01:00.0: resetting vfio-pci 0000:01:00.0: reset done {1}[Hardware Error]: Error 1, type: fatal {1}[Hardware Error]: section_type: PCIe error {1}[Hardware Error]: port_type: 0, PCIe end point {1}[Hardware Error]: version: 0.2 {1}[Hardware Error]: command: 0x0140, status: 0x0010 {1}[Hardware Error]: device_id: 0000:01:01.0 {1}[Hardware Error]: slot: 0 {1}[Hardware Error]: secondary_bus: 0x00 {1}[Hardware Error]: vendor_id: 0x1d9b, device_id: 0x0207 {1}[Hardware Error]: class_code: 020000 {1}[Hardware Error]: bridge: secondary_status: 0x0000, control: 0x0000 {1}[Hardware Error]: aer_cor_status: 0x00008000, aer_cor_mask: 0x00002000 {1}[Hardware Error]: aer_uncor_status: 0x00010000, aer_uncor_mask: 0x00100000 {1}[Hardware Error]: aer_uncor_severity: 0x006f6030 {1}[Hardware Error]: TLP Header: 0a412800 00192080 60000004 00000004 GHES: Fatal hardware error but panic disabled Kernel panic - not syncing: GHES: Fatal hardware error Allow a slot to be created to claim all devices on a bus, not just a matching device. This is done by introducing a sentinel value, named PCI_SLOT_ALL_DEVICES, which then has the PCI slot match to any device on the bus. This fixes slot resets for pciehp. Since 0xff already has special meaning, the chosen value for this new feature is 0xfe. This will not clash with any actual slot number since they are limited to 5 bits. Signed-off-by: Keith Busch Signed-off-by: Bjorn Helgaas Reviewed-by: Dan Williams Link: https://patch.msgid.link/20260217160836.2709885-3-kbusch@meta.com --- drivers/pci/hotplug/pciehp_core.c | 3 ++- drivers/pci/slot.c | 31 +++++++++++++++++++++++++++---- include/linux/pci.h | 10 +++++++++- 3 files changed, 38 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c index 1e9158d7bac7..2cafd3b26f34 100644 --- a/drivers/pci/hotplug/pciehp_core.c +++ b/drivers/pci/hotplug/pciehp_core.c @@ -79,7 +79,8 @@ static int init_slot(struct controller *ctrl) snprintf(name, SLOT_NAME_SIZE, "%u", PSN(ctrl)); retval = pci_hp_initialize(&ctrl->hotplug_slot, - ctrl->pcie->port->subordinate, 0, name); + ctrl->pcie->port->subordinate, + PCI_SLOT_ALL_DEVICES, name); if (retval) { ctrl_err(ctrl, "pci_hp_initialize failed: error %d\n", retval); kfree(ops); diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c index 787311614e5b..e0b7fb43423c 100644 --- a/drivers/pci/slot.c +++ b/drivers/pci/slot.c @@ -42,6 +42,15 @@ static ssize_t address_read_file(struct pci_slot *slot, char *buf) pci_domain_nr(slot->bus), slot->bus->number); + /* + * Preserve legacy ABI expectations that hotplug drivers that manage + * multiple devices per slot emit 0 for the device number. + */ + if (slot->number == PCI_SLOT_ALL_DEVICES) + return sysfs_emit(buf, "%04x:%02x:00\n", + pci_domain_nr(slot->bus), + slot->bus->number); + return sysfs_emit(buf, "%04x:%02x:%02x\n", pci_domain_nr(slot->bus), slot->bus->number, @@ -73,7 +82,8 @@ static void pci_slot_release(struct kobject *kobj) down_read(&pci_bus_sem); list_for_each_entry(dev, &slot->bus->devices, bus_list) - if (PCI_SLOT(dev->devfn) == slot->number) + if (slot->number == PCI_SLOT_ALL_DEVICES || + PCI_SLOT(dev->devfn) == slot->number) dev->slot = NULL; up_read(&pci_bus_sem); @@ -166,7 +176,8 @@ void pci_dev_assign_slot(struct pci_dev *dev) mutex_lock(&pci_slot_mutex); list_for_each_entry(slot, &dev->bus->slots, list) - if (PCI_SLOT(dev->devfn) == slot->number) + if (slot->number == PCI_SLOT_ALL_DEVICES || + PCI_SLOT(dev->devfn) == slot->number) dev->slot = slot; mutex_unlock(&pci_slot_mutex); } @@ -188,7 +199,8 @@ static struct pci_slot *get_slot(struct pci_bus *parent, int slot_nr) /** * pci_create_slot - create or increment refcount for physical PCI slot * @parent: struct pci_bus of parent bridge - * @slot_nr: PCI_SLOT(pci_dev->devfn) or -1 for placeholder + * @slot_nr: PCI_SLOT(pci_dev->devfn), -1 for placeholder, or + * PCI_SLOT_ALL_DEVICES * @name: user visible string presented in /sys/bus/pci/slots/ * @hotplug: set if caller is hotplug driver, NULL otherwise * @@ -222,6 +234,16 @@ static struct pci_slot *get_slot(struct pci_bus *parent, int slot_nr) * consist solely of a dddd:bb tuple, where dddd is the PCI domain of the * %struct pci_bus and bb is the bus number. In other words, the devfn of * the 'placeholder' slot will not be displayed. + * + * Bus-wide slots: + * For PCIe hotplug, the physical slot encompasses the entire secondary + * bus, not just a single device number. If the device supports ARI and ARI + * Forwarding is enabled in the upstream bridge, a multi-function device + * may include functions that appear to have several different device + * numbers, i.e., PCI_SLOT() values. Pass @slot_nr == PCI_SLOT_ALL_DEVICES + * to create a slot that matches all devices on the bus. Unlike placeholder + * slots, bus-wide slots go through normal slot lookup and reuse existing + * slots if present. */ struct pci_slot *pci_create_slot(struct pci_bus *parent, int slot_nr, const char *name, @@ -285,7 +307,8 @@ placeholder: down_read(&pci_bus_sem); list_for_each_entry(dev, &parent->devices, bus_list) - if (PCI_SLOT(dev->devfn) == slot_nr) + if (slot_nr == PCI_SLOT_ALL_DEVICES || + PCI_SLOT(dev->devfn) == slot_nr) dev->slot = slot; up_read(&pci_bus_sem); diff --git a/include/linux/pci.h b/include/linux/pci.h index 1c270f1d5123..5ae2dfdb2d6f 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -72,12 +72,20 @@ /* return bus from PCI devid = ((u16)bus_number) << 8) | devfn */ #define PCI_BUS_NUM(x) (((x) >> 8) & 0xff) +/* + * PCI_SLOT_ALL_DEVICES indicates a slot that covers all devices on the bus. + * Used for PCIe hotplug where the physical slot is the entire secondary bus, + * and, if ARI Forwarding is enabled, functions may appear to be on multiple + * devices. + */ +#define PCI_SLOT_ALL_DEVICES 0xfe + /* pci_slot represents a physical slot */ struct pci_slot { struct pci_bus *bus; /* Bus this slot is on */ struct list_head list; /* Node in list of slots */ struct hotplug_slot *hotplug; /* Hotplug info (move here) */ - unsigned char number; /* PCI_SLOT(pci_dev->devfn) */ + unsigned char number; /* Device nr, or PCI_SLOT_ALL_DEVICES */ struct kobject kobj; }; -- cgit v1.2.3 From abb0eb0b033a0a8980eb9215e02626e4801ead3f Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Fri, 6 Mar 2026 17:36:49 +0800 Subject: ppp: simplify input error handling Currently, ppp_input_error() indicates an error by allocating a 0-length skb and calling ppp_do_recv(). It takes an error code argument, which is stored in skb->cb, but not used by ppp_receive_frame(). Simplify the error handling by removing the unused parameter and the unnecessary skb allocation. Instead, call ppp_receive_error() directly from ppp_input_error() under the recv lock, and the length check in ppp_receive_frame() can be removed. Signed-off-by: Qingfang Deng Signed-off-by: Jakub Kicinski --- drivers/net/ppp/ppp_async.c | 2 +- drivers/net/ppp/ppp_generic.c | 31 ++++++++++--------------------- drivers/net/ppp/ppp_synctty.c | 2 +- include/linux/ppp_channel.h | 2 +- net/atm/pppoatm.c | 2 +- 5 files changed, 14 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ppp/ppp_async.c b/drivers/net/ppp/ppp_async.c index b4cf2d09c6bd..93a7b0f6c4e7 100644 --- a/drivers/net/ppp/ppp_async.c +++ b/drivers/net/ppp/ppp_async.c @@ -491,7 +491,7 @@ static void ppp_async_process(struct tasklet_struct *t) /* process received packets */ while ((skb = skb_dequeue(&ap->rqueue)) != NULL) { if (skb->cb[0]) - ppp_input_error(&ap->chan, 0); + ppp_input_error(&ap->chan); ppp_input(&ap->chan, skb); } diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 2081da6c2144..6344c5eb0f98 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -2383,12 +2383,10 @@ done: rcu_read_unlock_bh(); } -/* Put a 0-length skb in the receive queue as an error indication */ void -ppp_input_error(struct ppp_channel *chan, int code) +ppp_input_error(struct ppp_channel *chan) { struct channel *pch = chan->ppp; - struct sk_buff *skb; struct ppp *ppp; if (!pch) @@ -2397,12 +2395,9 @@ ppp_input_error(struct ppp_channel *chan, int code) rcu_read_lock_bh(); ppp = rcu_dereference_bh(pch->ppp); if (ppp) { - skb = alloc_skb(0, GFP_ATOMIC); - if (skb) { - skb->len = 0; /* probably unnecessary */ - skb->cb[0] = code; - ppp_do_recv(ppp, skb, pch); - } + ppp_recv_lock(ppp); + ppp_receive_error(ppp); + ppp_recv_unlock(ppp); } rcu_read_unlock_bh(); } @@ -2414,20 +2409,14 @@ ppp_input_error(struct ppp_channel *chan, int code) static void ppp_receive_frame(struct ppp *ppp, struct sk_buff *skb, struct channel *pch) { - /* note: a 0-length skb is used as an error indication */ - if (skb->len > 0) { - skb_checksum_complete_unset(skb); + skb_checksum_complete_unset(skb); #ifdef CONFIG_PPP_MULTILINK - /* XXX do channel-level decompression here */ - if (PPP_PROTO(skb) == PPP_MP) - ppp_receive_mp_frame(ppp, skb, pch); - else + /* XXX do channel-level decompression here */ + if (PPP_PROTO(skb) == PPP_MP) + ppp_receive_mp_frame(ppp, skb, pch); + else #endif /* CONFIG_PPP_MULTILINK */ - ppp_receive_nonmp_frame(ppp, skb); - } else { - kfree_skb(skb); - ppp_receive_error(ppp); - } + ppp_receive_nonmp_frame(ppp, skb); } static void diff --git a/drivers/net/ppp/ppp_synctty.c b/drivers/net/ppp/ppp_synctty.c index c2063961f395..b7f243b416f8 100644 --- a/drivers/net/ppp/ppp_synctty.c +++ b/drivers/net/ppp/ppp_synctty.c @@ -483,7 +483,7 @@ static void ppp_sync_process(struct tasklet_struct *t) while ((skb = skb_dequeue(&ap->rqueue)) != NULL) { if (skb->len == 0) { /* zero length buffers indicate error */ - ppp_input_error(&ap->chan, 0); + ppp_input_error(&ap->chan); kfree_skb(skb); } else diff --git a/include/linux/ppp_channel.h b/include/linux/ppp_channel.h index f73fbea0dbc2..ca8ad03eeef0 100644 --- a/include/linux/ppp_channel.h +++ b/include/linux/ppp_channel.h @@ -55,7 +55,7 @@ extern void ppp_input(struct ppp_channel *, struct sk_buff *); /* Called by the channel when an input error occurs, indicating that we may have missed a packet. */ -extern void ppp_input_error(struct ppp_channel *, int code); +extern void ppp_input_error(struct ppp_channel *); /* Attach a channel to a given PPP unit in specified net. */ extern int ppp_register_net_channel(struct net *, struct ppp_channel *); diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c index 2574aae3e066..e3c422dc533a 100644 --- a/net/atm/pppoatm.c +++ b/net/atm/pppoatm.c @@ -228,7 +228,7 @@ static void pppoatm_push(struct atm_vcc *atmvcc, struct sk_buff *skb) error: kfree_skb(skb); - ppp_input_error(&pvcc->chan, 0); + ppp_input_error(&pvcc->chan); } static int pppoatm_may_send(struct pppoatm_vcc *pvcc, int size) -- cgit v1.2.3 From 4b78c9cbd8f1fbb9517aee48b372646f4cf05442 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Mar 2026 12:23:02 +0000 Subject: tcp: move tp->chrono_type next tp->chrono_stat[] chrono_type is currently in tcp_sock_read_txrx group, which is supposed to hold read-mostly fields. But chrono_type is mostly written in tx path, it should be moved to tcp_sock_write_tx group, close to other chrono fields (chrono_stat[], chrono_start). Note this adds holes, but data locality is far more important. Use a full u8 for the time being, compiler can generate more efficient code. Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20260308122302.2895067-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f72eef31fa23..c44cf9ae8d16 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -228,8 +228,7 @@ struct tcp_sock { u32 sacked_out; /* SACK'd packets */ u16 tcp_header_len; /* Bytes of tcp header to send */ u8 scaling_ratio; /* see tcp_win_from_space() */ - u8 chrono_type : 2, /* current chronograph type */ - repair : 1, + u8 repair : 1, tcp_usec_ts : 1, /* TSval values in usec */ is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ @@ -264,6 +263,7 @@ struct tcp_sock { * total number of data bytes sent. */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ + u8 chrono_type; /* current chronograph type */ u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ -- cgit v1.2.3 From 4d25c7d68896b4002c4ab5cd646775392bb7fbb4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:09 -0800 Subject: iomap: pass the iomap_iter to ->submit_read This provides additional context for file systems. Rename the fuse instance to match the method name while we're at it. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260223132021.292832-10-hch@lst.de Tested-by: Anuj Gupta Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/fuse/file.c | 5 +++-- fs/iomap/bio.c | 3 ++- fs/iomap/buffered-io.c | 4 ++-- fs/ntfs3/inode.c | 3 ++- include/linux/iomap.h | 3 ++- 5 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b1bb7153cb78..a9c836d7f586 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -947,7 +947,8 @@ static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter, return ret; } -static void fuse_iomap_read_submit(struct iomap_read_folio_ctx *ctx) +static void fuse_iomap_submit_read(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx) { struct fuse_fill_read_data *data = ctx->read_ctx; @@ -958,7 +959,7 @@ static void fuse_iomap_read_submit(struct iomap_read_folio_ctx *ctx) static const struct iomap_read_ops fuse_iomap_read_ops = { .read_folio_range = fuse_iomap_read_folio_range_async, - .submit_read = fuse_iomap_read_submit, + .submit_read = fuse_iomap_submit_read, }; static int fuse_read_folio(struct file *file, struct folio *folio) diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c index 578b1202e037..cb60d1facb5a 100644 --- a/fs/iomap/bio.c +++ b/fs/iomap/bio.c @@ -18,7 +18,8 @@ static void iomap_read_end_io(struct bio *bio) bio_put(bio); } -static void iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx) +static void iomap_bio_submit_read(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx) { struct bio *bio = ctx->read_ctx; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 00f0efaf12b2..f4ee2b1cb877 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -597,7 +597,7 @@ void iomap_read_folio(const struct iomap_ops *ops, &bytes_submitted); if (ctx->ops->submit_read) - ctx->ops->submit_read(ctx); + ctx->ops->submit_read(&iter, ctx); if (ctx->cur_folio) iomap_read_end(ctx->cur_folio, bytes_submitted); @@ -664,7 +664,7 @@ void iomap_readahead(const struct iomap_ops *ops, &cur_bytes_submitted); if (ctx->ops->submit_read) - ctx->ops->submit_read(ctx); + ctx->ops->submit_read(&iter, ctx); if (ctx->cur_folio) iomap_read_end(ctx->cur_folio, cur_bytes_submitted); diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 6e65066ebcc1..511967ef7ec9 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -651,7 +651,8 @@ static int ntfs_iomap_bio_read_folio_range(const struct iomap_iter *iter, return 0; } -static void ntfs_iomap_bio_submit_read(struct iomap_read_folio_ctx *ctx) +static void ntfs_iomap_bio_submit_read(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx) { struct bio *bio = ctx->read_ctx; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 99b7209dabd7..6fbe121e2adf 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -512,7 +512,8 @@ struct iomap_read_ops { * * This is optional. */ - void (*submit_read)(struct iomap_read_folio_ctx *ctx); + void (*submit_read)(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx); }; /* -- cgit v1.2.3 From 5f4fe046cb3c84eed719f7becbe822000e1a589e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:11 -0800 Subject: iomap: allow file systems to hook into buffered read bio submission File systems such as btrfs have additional operations with bios such as verifying data checksums. Allow file systems to hook into submission of the bio to allow for this processing by replacing the direct submit_bio call in iomap_read_alloc_bio with a call into ->submit_read and exporting iomap_read_alloc_bio. Also add a new field to struct iomap_read_folio_ctx to track the file logic offset of the current read context. Based on a patch from Goldwyn Rodrigues . Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260223132021.292832-12-hch@lst.de Tested-by: Anuj Gupta Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/bio.c | 15 +++++++++------ include/linux/iomap.h | 4 ++++ 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c index 80bbd328bd3c..903cb9fe759e 100644 --- a/fs/iomap/bio.c +++ b/fs/iomap/bio.c @@ -32,10 +32,11 @@ static void iomap_read_alloc_bio(const struct iomap_iter *iter, struct folio *folio = ctx->cur_folio; gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); gfp_t orig_gfp = gfp; - struct bio *bio = ctx->read_ctx; + struct bio *bio; - if (bio) - submit_bio(bio); + /* Submit the existing range if there was one. */ + if (ctx->read_ctx) + ctx->ops->submit_read(iter, ctx); /* Same as readahead_gfp_mask: */ if (ctx->rac) @@ -56,9 +57,10 @@ static void iomap_read_alloc_bio(const struct iomap_iter *iter, bio_add_folio_nofail(bio, folio, plen, offset_in_folio(folio, iter->pos)); ctx->read_ctx = bio; + ctx->read_ctx_file_offset = iter->pos; } -static int iomap_bio_read_folio_range(const struct iomap_iter *iter, +int iomap_bio_read_folio_range(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx, size_t plen) { struct folio *folio = ctx->cur_folio; @@ -70,10 +72,11 @@ static int iomap_bio_read_folio_range(const struct iomap_iter *iter, iomap_read_alloc_bio(iter, ctx, plen); return 0; } +EXPORT_SYMBOL_GPL(iomap_bio_read_folio_range); const struct iomap_read_ops iomap_bio_read_ops = { - .read_folio_range = iomap_bio_read_folio_range, - .submit_read = iomap_bio_submit_read, + .read_folio_range = iomap_bio_read_folio_range, + .submit_read = iomap_bio_submit_read, }; EXPORT_SYMBOL_GPL(iomap_bio_read_ops); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 6fbe121e2adf..b2b9e649a3b8 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -493,6 +493,7 @@ struct iomap_read_folio_ctx { struct folio *cur_folio; struct readahead_control *rac; void *read_ctx; + loff_t read_ctx_file_offset; }; struct iomap_read_ops { @@ -599,6 +600,9 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, extern struct bio_set iomap_ioend_bioset; #ifdef CONFIG_BLOCK +int iomap_bio_read_folio_range(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, size_t plen); + extern const struct iomap_read_ops iomap_bio_read_ops; static inline void iomap_bio_read_folio(struct folio *folio, -- cgit v1.2.3 From 57287771fa8d77841149bf847b629f29acbad35b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:13 -0800 Subject: iomap: add a bioset pointer to iomap_read_folio_ops Optionally allocate the bio from the bioset provided in iomap_read_folio_ops. If no bioset is provided, fs_bio_set is still used, which is the standard bioset for file systems. Based on a patch from Goldwyn Rodrigues . Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260223132021.292832-14-hch@lst.de Tested-by: Anuj Gupta Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/bio.c | 14 ++++++++++++-- include/linux/iomap.h | 6 ++++++ 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c index 903cb9fe759e..259a2bf95a43 100644 --- a/fs/iomap/bio.c +++ b/fs/iomap/bio.c @@ -24,11 +24,19 @@ static void iomap_bio_submit_read(const struct iomap_iter *iter, submit_bio(ctx->read_ctx); } +static struct bio_set *iomap_read_bio_set(struct iomap_read_folio_ctx *ctx) +{ + if (ctx->ops && ctx->ops->bio_set) + return ctx->ops->bio_set; + return &fs_bio_set; +} + static void iomap_read_alloc_bio(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx, size_t plen) { const struct iomap *iomap = &iter->iomap; unsigned int nr_vecs = DIV_ROUND_UP(iomap_length(iter), PAGE_SIZE); + struct bio_set *bio_set = iomap_read_bio_set(ctx); struct folio *folio = ctx->cur_folio; gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); gfp_t orig_gfp = gfp; @@ -47,9 +55,11 @@ static void iomap_read_alloc_bio(const struct iomap_iter *iter, * having to deal with partial page reads. This emulates what * do_mpage_read_folio does. */ - bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs), REQ_OP_READ, gfp); + bio = bio_alloc_bioset(iomap->bdev, bio_max_segs(nr_vecs), REQ_OP_READ, + gfp, bio_set); if (!bio) - bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ, orig_gfp); + bio = bio_alloc_bioset(iomap->bdev, 1, REQ_OP_READ, orig_gfp, + bio_set); if (ctx->rac) bio->bi_opf |= REQ_RAHEAD; bio->bi_iter.bi_sector = iomap_sector(iomap, iter->pos); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b2b9e649a3b8..387a1174522f 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -515,6 +515,12 @@ struct iomap_read_ops { */ void (*submit_read)(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx); + + /* + * Optional, allows filesystem to specify own bio_set, so new bio's + * can be allocated from the provided bio_set. + */ + struct bio_set *bio_set; }; /* -- cgit v1.2.3 From 0b10a370529cbd7b918c1eef43d409e43d9e0b78 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Feb 2026 05:20:15 -0800 Subject: iomap: support T10 protection information Add support for generating / verifying protection information in iomap. This is done by hooking into the bio submission and then using the generic PI helpers. Compared to just using the block layer auto PI this extends the protection envelope and also prepares for eventually passing through PI from userspace at least for direct I/O. To generate or verify PI, the file system needs to set the IOMAP_F_INTEGRITY flag on the iomap for the request, and ensure the ioends are used for all integrity I/O. Additionally the file system must defer read I/O completions to user context so that the guard tag validation isn't run from interrupt context. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20260223132021.292832-16-hch@lst.de Tested-by: Anuj Gupta Reviewed-by: "Darrick J. Wong" Signed-off-by: Christian Brauner --- fs/iomap/bio.c | 24 +++++++++++++++++++++--- fs/iomap/direct-io.c | 15 ++++++++++++++- fs/iomap/internal.h | 13 +++++++++++++ fs/iomap/ioend.c | 20 ++++++++++++++++++-- include/linux/iomap.h | 7 +++++++ 5 files changed, 73 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/iomap/bio.c b/fs/iomap/bio.c index b4de67bdd513..f989ffcaac96 100644 --- a/fs/iomap/bio.c +++ b/fs/iomap/bio.c @@ -3,6 +3,7 @@ * Copyright (C) 2010 Red Hat, Inc. * Copyright (C) 2016-2023 Christoph Hellwig. */ +#include #include #include #include "internal.h" @@ -17,6 +18,8 @@ static u32 __iomap_read_end_io(struct bio *bio, int error) iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error); folio_count++; } + if (bio_integrity(bio)) + fs_bio_integrity_free(bio); bio_put(bio); return folio_count; } @@ -34,7 +37,11 @@ u32 iomap_finish_ioend_buffered_read(struct iomap_ioend *ioend) static void iomap_bio_submit_read(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx) { - submit_bio(ctx->read_ctx); + struct bio *bio = ctx->read_ctx; + + if (iter->iomap.flags & IOMAP_F_INTEGRITY) + fs_bio_integrity_alloc(bio); + submit_bio(bio); } static struct bio_set *iomap_read_bio_set(struct iomap_read_folio_ctx *ctx) @@ -91,6 +98,7 @@ int iomap_bio_read_folio_range(const struct iomap_iter *iter, if (!bio || bio_end_sector(bio) != iomap_sector(&iter->iomap, iter->pos) || + bio->bi_iter.bi_size > iomap_max_bio_size(&iter->iomap) - plen || !bio_add_folio(bio, folio, plen, offset_in_folio(folio, iter->pos))) iomap_read_alloc_bio(iter, ctx, plen); return 0; @@ -107,11 +115,21 @@ int iomap_bio_read_folio_range_sync(const struct iomap_iter *iter, struct folio *folio, loff_t pos, size_t len) { const struct iomap *srcmap = iomap_iter_srcmap(iter); + sector_t sector = iomap_sector(srcmap, pos); struct bio_vec bvec; struct bio bio; + int error; bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ); - bio.bi_iter.bi_sector = iomap_sector(srcmap, pos); + bio.bi_iter.bi_sector = sector; bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos)); - return submit_bio_wait(&bio); + if (srcmap->flags & IOMAP_F_INTEGRITY) + fs_bio_integrity_alloc(&bio); + error = submit_bio_wait(&bio); + if (srcmap->flags & IOMAP_F_INTEGRITY) { + if (!error) + error = fs_bio_integrity_verify(&bio, sector, len); + fs_bio_integrity_free(&bio); + } + return error; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 2cb0c0f43215..c24d94349ca5 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -3,6 +3,7 @@ * Copyright (C) 2010 Red Hat, Inc. * Copyright (c) 2016-2025 Christoph Hellwig. */ +#include #include #include #include @@ -240,6 +241,9 @@ static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion) { struct iomap_dio *dio = bio->bi_private; + if (bio_integrity(bio)) + fs_bio_integrity_free(bio); + if (dio->flags & IOMAP_DIO_BOUNCE) { bio_iov_iter_unbounce(bio, !!dio->error, dio->flags & IOMAP_DIO_USER_BACKED); @@ -350,8 +354,10 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; + if (dio->flags & IOMAP_DIO_BOUNCE) - ret = bio_iov_iter_bounce(bio, dio->submit.iter, BIO_MAX_SIZE); + ret = bio_iov_iter_bounce(bio, dio->submit.iter, + iomap_max_bio_size(&iter->iomap)); else ret = bio_iov_iter_get_pages(bio, dio->submit.iter, alignment - 1); @@ -368,6 +374,13 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter, goto out_put_bio; } + if (iter->iomap.flags & IOMAP_F_INTEGRITY) { + if (dio->flags & IOMAP_DIO_WRITE) + fs_bio_integrity_generate(bio); + else + fs_bio_integrity_alloc(bio); + } + if (dio->flags & IOMAP_DIO_WRITE) task_io_account_write(ret); else if ((dio->flags & IOMAP_DIO_USER_BACKED) && diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h index b39dbc17e3f0..74e898b196dc 100644 --- a/fs/iomap/internal.h +++ b/fs/iomap/internal.h @@ -4,6 +4,19 @@ #define IOEND_BATCH_SIZE 4096 +/* + * Normally we can build bios as big as the data structure supports. + * + * But for integrity protected I/O we need to respect the maximum size of the + * single contiguous allocation for the integrity buffer. + */ +static inline size_t iomap_max_bio_size(const struct iomap *iomap) +{ + if (iomap->flags & IOMAP_F_INTEGRITY) + return max_integrity_io_size(bdev_limits(iomap->bdev)); + return BIO_MAX_SIZE; +} + u32 iomap_finish_ioend_buffered_read(struct iomap_ioend *ioend); u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend); diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c index 450ab002eb91..7c034b6a583e 100644 --- a/fs/iomap/ioend.c +++ b/fs/iomap/ioend.c @@ -2,6 +2,7 @@ /* * Copyright (c) 2016-2025 Christoph Hellwig. */ +#include #include #include #include @@ -65,6 +66,8 @@ static u32 iomap_finish_ioend_buffered_write(struct iomap_ioend *ioend) folio_count++; } + if (bio_integrity(bio)) + fs_bio_integrity_free(bio); bio_put(bio); /* frees the ioend */ return folio_count; } @@ -144,6 +147,8 @@ int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error) return error; } + if (wpc->iomap.flags & IOMAP_F_INTEGRITY) + fs_bio_integrity_generate(&ioend->io_bio); submit_bio(&ioend->io_bio); return 0; } @@ -165,10 +170,13 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, } static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, - u16 ioend_flags) + unsigned int map_len, u16 ioend_flags) { struct iomap_ioend *ioend = wpc->wb_ctx; + if (ioend->io_bio.bi_iter.bi_size > + iomap_max_bio_size(&wpc->iomap) - map_len) + return false; if (ioend_flags & IOMAP_IOEND_BOUNDARY) return false; if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) != @@ -234,7 +242,7 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) ioend_flags |= IOMAP_IOEND_BOUNDARY; - if (!ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) { + if (!ioend || !iomap_can_add_to_ioend(wpc, pos, map_len, ioend_flags)) { new_ioend: if (ioend) { error = wpc->ops->writeback_submit(wpc, 0); @@ -311,6 +319,14 @@ static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) if (!atomic_dec_and_test(&ioend->io_remaining)) return 0; + + if (!ioend->io_error && + bio_integrity(&ioend->io_bio) && + bio_op(&ioend->io_bio) == REQ_OP_READ) { + ioend->io_error = fs_bio_integrity_verify(&ioend->io_bio, + ioend->io_sector, ioend->io_size); + } + if (ioend->io_flags & IOMAP_IOEND_DIRECT) return iomap_finish_ioend_direct(ioend); if (bio_op(&ioend->io_bio) == REQ_OP_READ) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 387a1174522f..531f9ebdeeae 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -65,6 +65,8 @@ struct vm_fault; * * IOMAP_F_ATOMIC_BIO indicates that (write) I/O will be issued as an atomic * bio, i.e. set REQ_ATOMIC. + * + * IOMAP_F_INTEGRITY indicates that the filesystems handles integrity metadata. */ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) @@ -79,6 +81,11 @@ struct vm_fault; #define IOMAP_F_BOUNDARY (1U << 6) #define IOMAP_F_ANON_WRITE (1U << 7) #define IOMAP_F_ATOMIC_BIO (1U << 8) +#ifdef CONFIG_BLK_DEV_INTEGRITY +#define IOMAP_F_INTEGRITY (1U << 9) +#else +#define IOMAP_F_INTEGRITY 0 +#endif /* CONFIG_BLK_DEV_INTEGRITY */ /* * Flag reserved for file system specific usage -- cgit v1.2.3 From 1b891f4c852817b3afd231712ab7e171932e1eb1 Mon Sep 17 00:00:00 2001 From: "Derek J. Clark" Date: Tue, 10 Mar 2026 07:29:19 +0000 Subject: include: device.h: Add named device attributes Adds DEVICE_ATTR_[RW|RO|WO]_NAMED macros for adding attributes that reuse the same sysfs name in a driver under separate subdirectories. When dealing with some devices it can be useful to be able to reuse the same name for similar attributes under a different subdirectory. For example, a single logical HID endpoint may provide a configuration interface for multiple physical devices. In such a case it is useful to provide symmetrical attribute names under different subdirectories on the configuration device. The Lenovo Legion Go is one such device, providing configuration to a detachable left controller, detachable right controller, the wireless transmission dongle, and the MCU. It is therefore beneficial to treat each of these as individual devices in the driver, providing a subdirectory for each physical device in the sysfs. As some attributes are reused by each physical device, it provides a much cleaner interface if the same driver can reuse the same attribute name in sysfs while uniquely distinguishing the store/show functions in the driver, rather than repeat string portions. Example new WO attrs: ATTRS{left_handle/reset}=="(not readable)" ATTRS{right_handle/reset}=="(not readable)" ATTRS{tx_dongle/reset}=="(not readable)" vs old WO attrs in a subdir: ATTRS{left_handle/left_handle_reset}=="(not readable)" ATTRS{right_handle/right_handle_reset}=="(not readable)" ATTRS{tx_dongle/tx_dongle_reset}=="(not readable)" or old WO attrs with no subdir: ATTRS{left_handle_reset}=="(not readable)" ATTRS{right_handle_reset}=="(not readable)" ATTRS{tx_dongle_reset}=="(not readable)" While the third option is usable, it doesn't logically break up the physical devices and creates a device directory with over 80 attributes once all attrs are defined. Reviewed-by: Mark Pearson Signed-off-by: Derek J. Clark Acked-by: Greg Kroah-Hartman Signed-off-by: Jiri Kosina --- include/linux/device.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 0be95294b6e6..381463baed6d 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -189,6 +189,22 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, #define DEVICE_ATTR_ADMIN_RW(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RW_MODE(_name, 0600) +/** + * DEVICE_ATTR_RW_NAMED - Define a read-write device attribute with a sysfs name + * that differs from the function name. + * @_name: Attribute function preface + * @_attrname: Attribute name as it wil be exposed in the sysfs. + * + * Like DEVICE_ATTR_RW(), but allows for reusing names under separate paths in + * the same driver. + */ +#define DEVICE_ATTR_RW_NAMED(_name, _attrname) \ + struct device_attribute dev_attr_##_name = { \ + .attr = { .name = _attrname, .mode = 0644 }, \ + .show = _name##_show, \ + .store = _name##_store, \ + } + /** * DEVICE_ATTR_RO - Define a readable device attribute. * @_name: Attribute name. @@ -207,6 +223,21 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, #define DEVICE_ATTR_ADMIN_RO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RO_MODE(_name, 0400) +/** + * DEVICE_ATTR_RO_NAMED - Define a read-only device attribute with a sysfs name + * that differs from the function name. + * @_name: Attribute function preface + * @_attrname: Attribute name as it wil be exposed in the sysfs. + * + * Like DEVICE_ATTR_RO(), but allows for reusing names under separate paths in + * the same driver. + */ +#define DEVICE_ATTR_RO_NAMED(_name, _attrname) \ + struct device_attribute dev_attr_##_name = { \ + .attr = { .name = _attrname, .mode = 0444 }, \ + .show = _name##_show, \ + } + /** * DEVICE_ATTR_WO - Define an admin-only writable device attribute. * @_name: Attribute name. @@ -216,6 +247,21 @@ ssize_t device_show_string(struct device *dev, struct device_attribute *attr, #define DEVICE_ATTR_WO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_WO(_name) +/** + * DEVICE_ATTR_WO_NAMED - Define a read-only device attribute with a sysfs name + * that differs from the function name. + * @_name: Attribute function preface + * @_attrname: Attribute name as it wil be exposed in the sysfs. + * + * Like DEVICE_ATTR_WO(), but allows for reusing names under separate paths in + * the same driver. + */ +#define DEVICE_ATTR_WO_NAMED(_name, _attrname) \ + struct device_attribute dev_attr_##_name = { \ + .attr = { .name = _attrname, .mode = 0200 }, \ + .store = _name##_store, \ + } + /** * DEVICE_ULONG_ATTR - Define a device attribute backed by an unsigned long. * @_name: Attribute name. -- cgit v1.2.3 From 6ca9029c823b7853e980585e757343e0e84227cd Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 10 Mar 2026 07:29:27 +0000 Subject: HID: Include firmware version in the uevent Userspace software fwupd probes some HID devices when the daemon starts up to determine the current firmware version in order to be able to offer updated firmware if the manufacturer has made it available. In order to do this fwupd will detach the existing kernel driver if one is present, send a HID command and then reattach the kernel driver. This can be problematic if the user is using the HID device at the time that fwupd probes the hardware and can cause a few frames of input to be dropped. In some cases HID drivers already have a command to look up the firmware version, and so if that is exported to userspace fwupd can discover it and avoid needing to detach the kernel driver until it's time to update the device. Introduce a new member in the struct hid_device for the version and export a new uevent variable HID_FIRMWARE_VERSION that will display the version that HID drivers obtained. Reviewed-by: Derek J. Clark Reviewed-by: Mark Pearson Cc: Richard Hughes Signed-off-by: Mario Limonciello Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 5 +++++ include/linux/hid.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 840a60113868..da57cbf0af26 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -2887,6 +2887,11 @@ static int hid_uevent(const struct device *dev, struct kobj_uevent_env *env) if (add_uevent_var(env, "MODALIAS=hid:b%04Xg%04Xv%08Xp%08X", hdev->bus, hdev->group, hdev->vendor, hdev->product)) return -ENOMEM; + if (hdev->firmware_version) { + if (add_uevent_var(env, "HID_FIRMWARE_VERSION=0x%04llX", + hdev->firmware_version)) + return -ENOMEM; + } return 0; } diff --git a/include/linux/hid.h b/include/linux/hid.h index 2990b9f94cb5..b0b70c05049d 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -698,6 +698,7 @@ struct hid_device { char name[128]; /* Device name */ char phys[64]; /* Device physical location */ char uniq[64]; /* Device unique identifier (serial #) */ + u64 firmware_version; /* Firmware version */ void *driver_data; -- cgit v1.2.3 From 1dfc9d60a69ec148e1cb709256617d86e5f0e8f8 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 5 Mar 2026 22:45:40 +0100 Subject: workqueue: devres: Add device-managed allocate workqueue Add a Resource-managed version of alloc_workqueue() to fix common problem of drivers mixing devm() calls with destroy_workqueue. Such naive and discouraged driver approach leads to difficult to debug bugs when the driver: 1. Allocates workqueue in standard way and destroys it in driver remove() callback, 2. Sets work struct with devm_work_autocancel(), 3. Registers interrupt handler with devm_request_threaded_irq(). Which leads to following unbind/removal path: 1. destroy_workqueue() via driver remove(), Any interrupt coming now would still execute the interrupt handler, which queues work on destroyed workqueue. 2. devm_irq_release(), 3. devm_work_drop() -> cancel_work_sync() on destroyed workqueue. devm_alloc_workqueue() has two benefits: 1. Solves above problem of mix-and-match devres and non-devres code in driver, 2. Simplify any sane drivers which were correctly using alloc_workqueue() + devm_add_action_or_reset(). Signed-off-by: Krzysztof Kozlowski Acked-by: Tejun Heo Reviewed-by: Andy Shevchenko Signed-off-by: Tejun Heo --- Documentation/driver-api/driver-model/devres.rst | 4 ++++ include/linux/workqueue.h | 22 +++++++++++++++++++ kernel/workqueue.c | 28 ++++++++++++++++++++++++ 3 files changed, 54 insertions(+) (limited to 'include/linux') diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst index 7d2b897d66fa..017fb155a5bc 100644 --- a/Documentation/driver-api/driver-model/devres.rst +++ b/Documentation/driver-api/driver-model/devres.rst @@ -464,3 +464,7 @@ SPI WATCHDOG devm_watchdog_register_device() + +WORKQUEUE + devm_alloc_workqueue() + devm_alloc_ordered_workqueue() diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a4749f56398f..f8d235aef10d 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -512,6 +512,26 @@ __printf(1, 4) struct workqueue_struct * alloc_workqueue_noprof(const char *fmt, unsigned int flags, int max_active, ...); #define alloc_workqueue(...) alloc_hooks(alloc_workqueue_noprof(__VA_ARGS__)) +/** + * devm_alloc_workqueue - Resource-managed allocate a workqueue + * @dev: Device to allocate workqueue for + * @fmt: printf format for the name of the workqueue + * @flags: WQ_* flags + * @max_active: max in-flight work items, 0 for default + * @...: args for @fmt + * + * Resource managed workqueue, see alloc_workqueue() for details. + * + * The workqueue will be automatically destroyed on driver detach. Typically + * this should be used in drivers already relying on devm interafaces. + * + * RETURNS: + * Pointer to the allocated workqueue on success, %NULL on failure. + */ +__printf(2, 5) struct workqueue_struct * +devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, + int max_active, ...); + #ifdef CONFIG_LOCKDEP /** * alloc_workqueue_lockdep_map - allocate a workqueue with user-defined lockdep_map @@ -568,6 +588,8 @@ alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, */ #define alloc_ordered_workqueue(fmt, flags, args...) \ alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) +#define devm_alloc_ordered_workqueue(dev, fmt, flags, args...) \ + devm_alloc_workqueue(dev, fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM | WQ_PERCPU, 1, (name)) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index aeaec79bc09c..19d20f3039d9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -5891,6 +5892,33 @@ struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, } EXPORT_SYMBOL_GPL(alloc_workqueue_noprof); +static void devm_workqueue_release(void *res) +{ + destroy_workqueue(res); +} + +__printf(2, 5) struct workqueue_struct * +devm_alloc_workqueue(struct device *dev, const char *fmt, unsigned int flags, + int max_active, ...) +{ + struct workqueue_struct *wq; + va_list args; + int ret; + + va_start(args, max_active); + wq = alloc_workqueue(fmt, flags, max_active, args); + va_end(args); + if (!wq) + return NULL; + + ret = devm_add_action_or_reset(dev, devm_workqueue_release, wq); + if (ret) + return NULL; + + return wq; +} +EXPORT_SYMBOL_GPL(devm_alloc_workqueue); + #ifdef CONFIG_LOCKDEP __printf(1, 5) struct workqueue_struct * -- cgit v1.2.3 From 5a8103a6fb0ae9cf99c0271b17474468d6bae2b2 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 4 Mar 2026 18:21:57 +0100 Subject: genirq: Document interaction between and DT binding defines Document that the DT binding definitions in shadow the first six IRQ_TYPE_* definitions in . The values must be the same anyway, so this is harmless (as long as the latter is included first when both are included), but it is good to document this explicitly. Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/fbcc65dcee6c5437fab5ef18d21766bb4effb7cb.1772644406.git.geert+renesas@glider.be --- include/linux/irq.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 951acbdb9f84..efa514ee562f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -35,6 +35,10 @@ enum irqchip_irq_state; * * Bits 0-7 are the same as the IRQF_* bits in linux/interrupt.h * + * Note that the first 6 definitions are shadowed by C preprocessor definitions + * in include/dt-bindings/interrupt-controller/irq.h. This is not an issue, as + * the actual values must be the same, due to being part of the stable DT ABI. + * * IRQ_TYPE_NONE - default, unspecified type * IRQ_TYPE_EDGE_RISING - rising edge triggered * IRQ_TYPE_EDGE_FALLING - falling edge triggered -- cgit v1.2.3 From 360160f75592bdc85edba8fe78fb20d90924c7e8 Mon Sep 17 00:00:00 2001 From: Ricardo Robaina Date: Mon, 9 Mar 2026 10:05:33 -0300 Subject: audit: handle unknown status requests in audit_receive_msg() Currently, audit_receive_msg() ignores unknown status bits in AUDIT_SET requests, incorrectly returning success to newer user space tools querying unsupported features. This breaks forward compatibility. Fix this by defining AUDIT_STATUS_ALL and returning -EINVAL if any unrecognized bits are set (s.mask & ~AUDIT_STATUS_ALL). This ensures invalid requests are safely rejected, allowing user space to reliably test for and gracefully handle feature detection on older kernels. Suggested-by: Steve Grubb Signed-off-by: Ricardo Robaina [PM: subject line tweak] Signed-off-by: Paul Moore --- include/linux/audit.h | 9 +++++++++ kernel/audit.c | 2 ++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index b642b5faca65..d79218bf075a 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -15,6 +15,15 @@ #include #include +#define AUDIT_STATUS_ALL (AUDIT_STATUS_ENABLED | \ + AUDIT_STATUS_FAILURE | \ + AUDIT_STATUS_PID | \ + AUDIT_STATUS_RATE_LIMIT | \ + AUDIT_STATUS_BACKLOG_LIMIT | \ + AUDIT_STATUS_BACKLOG_WAIT_TIME | \ + AUDIT_STATUS_LOST | \ + AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) + #define AUDIT_INO_UNSET ((unsigned long)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) diff --git a/kernel/audit.c b/kernel/audit.c index 08793e71b975..e1d489bc2dff 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1295,6 +1295,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh, memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ memcpy(&s, data, min_t(size_t, sizeof(s), data_len)); + if (s.mask & ~AUDIT_STATUS_ALL) + return -EINVAL; if (s.mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(s.enabled); if (err < 0) -- cgit v1.2.3 From 7a6387dec8cee5a237dc5092269e97028f5a983b Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:18 +0000 Subject: net: stmmac: provide plat_dat->dma_cfg in stmmac_plat_dat_alloc() plat_dat->dma_cfg is unconditionally required for the operation of the driver, so it would make sense to allocate it along with the plat_dat. On Arm64, sizeof(*plat_dat) has recently shrunk from 880 to 816 bytes and sizeof(*plat_dat->dma_cfg) has shrunk from 32 to 20 bytes. Given that dma_cfg is required, and it is now less than a cache line, It doesn't make sense to allocate this separateny, so place it at the end of struct plat_stmmacenet_data, and set plat_dat->dma_cfg to point at that to avoid mass changes. Signed-off-by: Russell King (Oracle) Reviewed-by: Mohd Ayaan Anwar Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX54-0000000CVrw-2jfu@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 5 ----- drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c | 4 ---- drivers/net/ethernet/stmicro/stmmac/dwmac-motorcomm.c | 4 ---- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 ++ drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c | 5 ----- drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 8 +------- include/linux/stmmac.h | 1 + 7 files changed, 4 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index fc13bfb47783..0b32560cd059 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -1251,11 +1251,6 @@ static int intel_eth_pci_probe(struct pci_dev *pdev, if (!plat->mdio_bus_data) return -ENOMEM; - plat->dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*plat->dma_cfg), - GFP_KERNEL); - if (!plat->dma_cfg) - return -ENOMEM; - plat->safety_feat_cfg = devm_kzalloc(&pdev->dev, sizeof(*plat->safety_feat_cfg), GFP_KERNEL); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index ada6c6ef1f5c..51b1562f84d1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -513,10 +513,6 @@ static int loongson_dwmac_probe(struct pci_dev *pdev, const struct pci_device_id if (!plat->mdio_bus_data) return -ENOMEM; - plat->dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*plat->dma_cfg), GFP_KERNEL); - if (!plat->dma_cfg) - return -ENOMEM; - ld = devm_kzalloc(&pdev->dev, sizeof(*ld), GFP_KERNEL); if (!ld) return -ENOMEM; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-motorcomm.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-motorcomm.c index 8b45b9cf7202..d245546b90db 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-motorcomm.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-motorcomm.c @@ -218,10 +218,6 @@ motorcomm_default_plat_data(struct pci_dev *pdev) if (!plat->mdio_bus_data) return NULL; - plat->dma_cfg = devm_kzalloc(dev, sizeof(*plat->dma_cfg), GFP_KERNEL); - if (!plat->dma_cfg) - return NULL; - plat->axi = devm_kzalloc(dev, sizeof(*plat->axi), GFP_KERNEL); if (!plat->axi) return NULL; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index f0160ff54a59..87f43811faa0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7730,6 +7730,8 @@ struct plat_stmmacenet_data *stmmac_plat_dat_alloc(struct device *dev) if (!plat_dat) return NULL; + plat_dat->dma_cfg = &plat_dat->__dma_cfg; + /* Set the defaults: * - phy autodetection * - determine GMII_Address CR field from CSR clock diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index 270ad066ced3..836fed7d60ab 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ -134,11 +134,6 @@ static int stmmac_pci_probe(struct pci_dev *pdev, if (!plat->mdio_bus_data) return -ENOMEM; - plat->dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*plat->dma_cfg), - GFP_KERNEL); - if (!plat->dma_cfg) - return -ENOMEM; - plat->safety_feat_cfg = devm_kzalloc(&pdev->dev, sizeof(*plat->safety_feat_cfg), GFP_KERNEL); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index c34998486293..1aed48fe0db6 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -548,13 +548,7 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) &plat->multicast_filter_bins); } - dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*dma_cfg), - GFP_KERNEL); - if (!dma_cfg) { - ret = ERR_PTR(-ENOMEM); - goto error_put_mdio; - } - plat->dma_cfg = dma_cfg; + dma_cfg = plat->dma_cfg; of_property_read_u32(np, "snps,pbl", &dma_cfg->pbl); if (!dma_cfg->pbl) diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 965ada809fdf..919196713c05 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -306,5 +306,6 @@ struct plat_stmmacenet_data { int msi_tx_base_vec; const struct dwmac4_addrs *dwmac4_addrs; unsigned int flags; + struct stmmac_dma_cfg __dma_cfg; }; #endif -- cgit v1.2.3 From c3d08424e025aaac8fb54134f76e611ef919cd08 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:23 +0000 Subject: net: stmmac: convert plat_stmmacenet_data booleans to type bool Convert members of struct plat_stmmacenet_data that are booleans to type 'bool' and ensure their initialisers are true/false. Move the has_xxx for the GMAC cores together, and move the COE members to the end of the list of bool to avoid unused holes in the struct. Signed-off-by: Russell King (Oracle) Reviewed-by: Mohd Ayaan Anwar Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX59-0000000CVs2-3MHc@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- .../net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c | 2 +- .../net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c | 6 +++--- .../net/ethernet/stmicro/stmmac/stmmac_platform.c | 20 ++++++++++---------- include/linux/stmmac.h | 14 +++++++------- 14 files changed, 31 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c index 0495437d3a6e..b0c5d1ecabce 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.c @@ -88,7 +88,7 @@ static int dwc_eth_dwmac_config_dt(struct platform_device *pdev, plat_dat->core_type = DWMAC_CORE_GMAC4; plat_dat->dma_cfg->aal = 1; plat_dat->flags |= STMMAC_FLAG_TSO_EN; - plat_dat->pmt = 1; + plat_dat->pmt = true; return 0; } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 0b32560cd059..421c6c81ca5e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -566,7 +566,7 @@ static void common_default_data(struct plat_stmmacenet_data *plat) /* clk_csr_i = 20-35MHz & MDC = clk_csr_i/16 */ plat->clk_csr = STMMAC_CSR_20_35M; plat->core_type = DWMAC_CORE_GMAC; - plat->force_sf_dma_mode = 1; + plat->force_sf_dma_mode = true; plat->mdio_bus_data->needs_reset = true; } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index 51b1562f84d1..eb14c197d6ae 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -94,7 +94,7 @@ static void loongson_default_data(struct pci_dev *pdev, /* clk_csr_i = 100-150MHz & MDC = clk_csr_i/62 */ plat->clk_csr = STMMAC_CSR_100_150M; plat->core_type = DWMAC_CORE_GMAC; - plat->force_sf_dma_mode = 1; + plat->force_sf_dma_mode = true; /* Increase the default value for multicast hash bins */ plat->multicast_filter_bins = 256; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c index 1f2d7d19ca56..a139db6a8cbb 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c @@ -564,7 +564,7 @@ static int mediatek_dwmac_common_data(struct platform_device *pdev, plat->flags &= ~STMMAC_FLAG_USE_PHY_WOL; else plat->flags |= STMMAC_FLAG_USE_PHY_WOL; - plat->riwt_off = 1; + plat->riwt_off = true; plat->maxmtu = ETH_DATA_LEN; plat->host_dma_width = priv_plat->variant->dma_bit_mask; plat->bsp_priv = priv_plat; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index cb1c074c2053..388e9fdeb86c 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -817,7 +817,7 @@ static int qcom_ethqos_probe(struct platform_device *pdev) plat_dat->core_type = DWMAC_CORE_GMAC4; if (ethqos->has_emac_ge_3) plat_dat->dwmac4_addrs = &data->dwmac4_addrs; - plat_dat->pmt = 1; + plat_dat->pmt = true; if (of_property_read_bool(np, "snps,tso")) plat_dat->flags |= STMMAC_FLAG_TSO_EN; if (of_device_is_compatible(np, "qcom,qcs404-ethqos")) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c index af594a096676..48fceadc55b1 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-s32.c @@ -163,7 +163,7 @@ static int s32_dwmac_probe(struct platform_device *pdev) /* S32CC core feature set */ plat->core_type = DWMAC_CORE_GMAC4; - plat->pmt = 1; + plat->pmt = true; plat->flags |= STMMAC_FLAG_SPH_DISABLE; plat->rx_fifo_size = 20480; plat->tx_fifo_size = 20480; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index c6b99814d391..5f89fd968ae9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -565,7 +565,7 @@ static void socfpga_gen5_setup_plat_dat(struct socfpga_dwmac *dwmac) plat_dat->core_type = DWMAC_CORE_GMAC; /* Rx watchdog timer in dwmac is buggy in this hw */ - plat_dat->riwt_off = 1; + plat_dat->riwt_off = true; } static void socfpga_agilex5_setup_plat_dat(struct socfpga_dwmac *dwmac) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index 3ce03b059277..6dbe5d5a3224 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -1179,7 +1179,7 @@ static int sun8i_dwmac_probe(struct platform_device *pdev) * hardware features were copied from Allwinner drivers. */ plat_dat->rx_coe = STMMAC_RX_COE_TYPE2; - plat_dat->tx_coe = 1; + plat_dat->tx_coe = true; plat_dat->flags |= STMMAC_FLAG_HAS_SUN8I; plat_dat->bsp_priv = gmac; plat_dat->init = sun8i_dwmac_init; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c index 52593ba3a3a3..74bd996d93c9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sunxi.c @@ -135,7 +135,7 @@ static int sun7i_gmac_probe(struct platform_device *pdev) /* platform data specifying hardware features and callbacks. * hardware features were copied from Allwinner drivers. */ - plat_dat->tx_coe = 1; + plat_dat->tx_coe = true; plat_dat->core_type = DWMAC_CORE_GMAC; plat_dat->bsp_priv = gmac; plat_dat->init = sun7i_gmac_init; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c index d765acbe3754..b4b39e6a169e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-tegra.c @@ -310,7 +310,7 @@ static int tegra_mgbe_probe(struct platform_device *pdev) plat->core_type = DWMAC_CORE_XGMAC; plat->flags |= STMMAC_FLAG_TSO_EN; - plat->pmt = 1; + plat->pmt = true; plat->bsp_priv = mgbe; if (!plat->mdio_node) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 87f43811faa0..939431255fa5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -7401,7 +7401,7 @@ static int stmmac_hw_init(struct stmmac_priv *priv) /* TXCOE doesn't work in thresh DMA mode */ if (priv->plat->force_thresh_dma_mode) - priv->plat->tx_coe = 0; + priv->plat->tx_coe = false; else priv->plat->tx_coe = priv->dma_cap.tx_coe; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c index 836fed7d60ab..d584fd2daa6f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c @@ -25,7 +25,7 @@ static void common_default_data(struct plat_stmmacenet_data *plat) /* clk_csr_i = 20-35MHz & MDC = clk_csr_i/16 */ plat->clk_csr = STMMAC_CSR_20_35M; plat->core_type = DWMAC_CORE_GMAC; - plat->force_sf_dma_mode = 1; + plat->force_sf_dma_mode = true; plat->mdio_bus_data->needs_reset = true; } @@ -58,9 +58,9 @@ static int snps_gmac5_default_data(struct pci_dev *pdev, plat->clk_csr = STMMAC_CSR_250_300M; plat->core_type = DWMAC_CORE_GMAC4; - plat->force_sf_dma_mode = 1; + plat->force_sf_dma_mode = true; plat->flags |= STMMAC_FLAG_TSO_EN; - plat->pmt = 1; + plat->pmt = true; /* Set default number of RX and TX queues to use */ plat->tx_queues_to_use = 4; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 1aed48fe0db6..0d3bad0f8915 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -514,34 +514,34 @@ stmmac_probe_config_dt(struct platform_device *pdev, u8 *mac) plat->multicast_filter_bins = dwmac1000_validate_mcast_bins( &pdev->dev, plat->multicast_filter_bins); plat->core_type = DWMAC_CORE_GMAC; - plat->pmt = 1; + plat->pmt = true; } if (of_device_is_compatible(np, "snps,dwmac-3.40a")) { plat->core_type = DWMAC_CORE_GMAC; - plat->enh_desc = 1; - plat->tx_coe = 1; - plat->bugged_jumbo = 1; - plat->pmt = 1; + plat->enh_desc = true; + plat->tx_coe = true; + plat->bugged_jumbo = true; + plat->pmt = true; } if (of_device_compatible_match(np, stmmac_gmac4_compats)) { plat->core_type = DWMAC_CORE_GMAC4; - plat->pmt = 1; + plat->pmt = true; if (of_property_read_bool(np, "snps,tso")) plat->flags |= STMMAC_FLAG_TSO_EN; } if (of_device_is_compatible(np, "snps,dwmac-3.610") || of_device_is_compatible(np, "snps,dwmac-3.710")) { - plat->enh_desc = 1; - plat->bugged_jumbo = 1; - plat->force_sf_dma_mode = 1; + plat->enh_desc = true; + plat->bugged_jumbo = true; + plat->force_sf_dma_mode = true; } if (of_device_is_compatible(np, "snps,dwxgmac")) { plat->core_type = DWMAC_CORE_XGMAC; - plat->pmt = 1; + plat->pmt = true; if (of_property_read_bool(np, "snps,tso")) plat->flags |= STMMAC_FLAG_TSO_EN; of_property_read_u32(np, "snps,multicast-filter-bins", diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 919196713c05..9420da96a4ff 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -229,14 +229,14 @@ struct plat_stmmacenet_data { struct stmmac_dma_cfg *dma_cfg; struct stmmac_safety_feature_cfg *safety_feat_cfg; int clk_csr; - int enh_desc; - int tx_coe; + bool enh_desc; + bool tx_coe; + bool bugged_jumbo; + bool pmt; + bool force_sf_dma_mode; + bool force_thresh_dma_mode; + bool riwt_off; int rx_coe; - int bugged_jumbo; - int pmt; - int force_sf_dma_mode; - int force_thresh_dma_mode; - int riwt_off; int max_speed; int maxmtu; int multicast_filter_bins; -- cgit v1.2.3 From 3357642e65e9454c3da64b62c0ed987ee4010008 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:28 +0000 Subject: net: stmmac: reorder structs to reduce memory consumption Reorder some of the stmmac structures to allow them to pack better, thereby using less memory. On aarch64, sizeof(struct stmmac_priv) was 880, and with this change becomes 816, saving 64 bytes, which is an 8% saving. Signed-off-by: Russell King (Oracle) Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX5E-0000000CVs8-40w4@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 9420da96a4ff..411cdd3ea034 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -108,37 +108,37 @@ struct stmmac_dma_cfg { #define AXI_BLEN 7 struct stmmac_axi { - bool axi_lpi_en; - bool axi_xit_frm; u32 axi_wr_osr_lmt; u32 axi_rd_osr_lmt; - bool axi_kbbe; u32 axi_blen_regval; + bool axi_lpi_en; + bool axi_xit_frm; + bool axi_kbbe; bool axi_fb; bool axi_mb; bool axi_rb; }; struct stmmac_rxq_cfg { - u8 mode_to_use; u32 chan; + u32 prio; + u8 mode_to_use; u8 pkt_route; bool use_prio; - u32 prio; }; struct stmmac_txq_cfg { u32 weight; - bool coe_unsupported; - u8 mode_to_use; /* Credit Base Shaper parameters */ u32 send_slope; u32 idle_slope; u32 high_credit; u32 low_credit; - bool use_prio; u32 prio; int tbs_en; + bool use_prio; + bool coe_unsupported; + u8 mode_to_use; }; struct stmmac_safety_feature_cfg { -- cgit v1.2.3 From 94808793fed71ee47741df0923d353024b6904ff Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:34 +0000 Subject: net: stmmac: use u8 for ?x_queues_to_use and number_?x_queues The maximum number of queues is a compile time constant of only eight. This makes using a 32-bit quantity wastefulf. Instead, use u8 for these and their associated variables. When reading the DT properties, saturdate at U8_MAX. Provided the core provides DMA capabilities to describe the number of queues, this will be capped by stmmac_hw_init() with a warning. Signed-off-by: Russell King (Oracle) Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX5K-0000000CVsE-0J0Y@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/common.h | 4 +- drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 2 +- .../net/ethernet/stmicro/stmmac/dwmac1000_core.c | 2 +- .../net/ethernet/stmicro/stmmac/dwmac100_core.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 4 +- .../net/ethernet/stmicro/stmmac/dwxgmac2_core.c | 4 +- drivers/net/ethernet/stmicro/stmmac/hwif.h | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac.h | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 223 +++++++++++---------- .../net/ethernet/stmicro/stmmac/stmmac_platform.c | 15 +- include/linux/stmmac.h | 4 +- 11 files changed, 136 insertions(+), 128 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index 46454e2886ce..f1628de8ed18 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -446,8 +446,8 @@ struct dma_features { unsigned int number_rx_channel; unsigned int number_tx_channel; /* TX and RX number of queues */ - unsigned int number_rx_queues; - unsigned int number_tx_queues; + u8 number_rx_queues; + u8 number_tx_queues; /* PPS output */ unsigned int pps_out_num; /* Number of Traffic Classes */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index 6dbe5d5a3224..48c52eb96233 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -718,7 +718,7 @@ static void sun8i_dwmac_set_filter(struct mac_device_info *hw, static void sun8i_dwmac_flow_ctrl(struct mac_device_info *hw, unsigned int duplex, unsigned int fc, - unsigned int pause_time, u32 tx_cnt) + unsigned int pause_time, u8 tx_cnt) { void __iomem *ioaddr = hw->pcsr; u32 v; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c index c7cb30672604..01f8353eb6ef 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c @@ -222,7 +222,7 @@ static void dwmac1000_set_filter(struct mac_device_info *hw, static void dwmac1000_flow_ctrl(struct mac_device_info *hw, unsigned int duplex, unsigned int fc, unsigned int pause_time, - u32 tx_cnt) + u8 tx_cnt) { void __iomem *ioaddr = hw->pcsr; /* Set flow such that DZPQ in Mac Register 6 is 0, diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c index 6b5cf3a0866a..94d24d355d95 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_core.c @@ -126,7 +126,7 @@ static void dwmac100_set_filter(struct mac_device_info *hw, static void dwmac100_flow_ctrl(struct mac_device_info *hw, unsigned int duplex, unsigned int fc, unsigned int pause_time, - u32 tx_cnt) + u8 tx_cnt) { void __iomem *ioaddr = hw->pcsr; unsigned int flow = MAC_FLOW_CTRL_ENABLE; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c index e6bcb77b22a2..4c6fed3ecbcf 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c @@ -547,11 +547,11 @@ static void dwmac4_set_filter(struct mac_device_info *hw, static void dwmac4_flow_ctrl(struct mac_device_info *hw, unsigned int duplex, unsigned int fc, unsigned int pause_time, - u32 tx_cnt) + u8 tx_cnt) { void __iomem *ioaddr = hw->pcsr; unsigned int flow = 0; - u32 queue = 0; + u8 queue; pr_debug("GMAC Flow-Control:\n"); if (fc & FLOW_RX) { diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index efa76b147f9e..f02b434bbd50 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -355,10 +355,10 @@ static int dwxgmac2_host_mtl_irq_status(struct stmmac_priv *priv, static void dwxgmac2_flow_ctrl(struct mac_device_info *hw, unsigned int duplex, unsigned int fc, unsigned int pause_time, - u32 tx_cnt) + u8 tx_cnt) { void __iomem *ioaddr = hw->pcsr; - u32 i; + u8 i; if (fc & FLOW_RX) writel(XGMAC_RFE, ioaddr + XGMAC_RX_FLOW_CTRL); diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index 374f326efa01..010b4d32484a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -352,7 +352,7 @@ struct stmmac_ops { void (*set_filter)(struct mac_device_info *hw, struct net_device *dev); /* Flow control setting */ void (*flow_ctrl)(struct mac_device_info *hw, unsigned int duplex, - unsigned int fc, unsigned int pause_time, u32 tx_cnt); + unsigned int fc, unsigned int pause_time, u8 tx_cnt); /* Set power management mode (e.g. magic frame) */ void (*pmt)(struct mac_device_info *hw, unsigned long mode); /* Set/Get Unicast MAC addresses */ diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index 335e60439b42..bba9bb9c95bf 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -407,7 +407,7 @@ void stmmac_dvr_remove(struct device *dev); int stmmac_dvr_probe(struct device *device, struct plat_stmmacenet_data *plat_dat, struct stmmac_resources *res); -int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt); +int stmmac_reinit_queues(struct net_device *dev, u8 rx_cnt, u8 tx_cnt); int stmmac_reinit_ringparam(struct net_device *dev, u32 rx_size, u32 tx_size); int stmmac_set_clk_tx_rate(void *bsp_priv, struct clk *clk_tx_i, phy_interface_t interface, int speed); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 939431255fa5..11150bddd872 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -264,10 +264,10 @@ static void stmmac_verify_args(void) static void __stmmac_disable_all_queues(struct stmmac_priv *priv) { - u32 rx_queues_cnt = priv->plat->rx_queues_to_use; - u32 tx_queues_cnt = priv->plat->tx_queues_to_use; - u32 maxq = max(rx_queues_cnt, tx_queues_cnt); - u32 queue; + u8 rx_queues_cnt = priv->plat->rx_queues_to_use; + u8 tx_queues_cnt = priv->plat->tx_queues_to_use; + u8 maxq = max(rx_queues_cnt, tx_queues_cnt); + u8 queue; for (queue = 0; queue < maxq; queue++) { struct stmmac_channel *ch = &priv->channel[queue]; @@ -291,9 +291,9 @@ static void __stmmac_disable_all_queues(struct stmmac_priv *priv) */ static void stmmac_disable_all_queues(struct stmmac_priv *priv) { - u32 rx_queues_cnt = priv->plat->rx_queues_to_use; + u8 rx_queues_cnt = priv->plat->rx_queues_to_use; struct stmmac_rx_queue *rx_q; - u32 queue; + u8 queue; /* synchronize_rcu() needed for pending XDP buffers to drain */ for (queue = 0; queue < rx_queues_cnt; queue++) { @@ -313,10 +313,10 @@ static void stmmac_disable_all_queues(struct stmmac_priv *priv) */ static void stmmac_enable_all_queues(struct stmmac_priv *priv) { - u32 rx_queues_cnt = priv->plat->rx_queues_to_use; - u32 tx_queues_cnt = priv->plat->tx_queues_to_use; - u32 maxq = max(rx_queues_cnt, tx_queues_cnt); - u32 queue; + u8 rx_queues_cnt = priv->plat->rx_queues_to_use; + u8 tx_queues_cnt = priv->plat->tx_queues_to_use; + u8 maxq = max(rx_queues_cnt, tx_queues_cnt); + u8 queue; for (queue = 0; queue < maxq; queue++) { struct stmmac_channel *ch = &priv->channel[queue]; @@ -377,8 +377,8 @@ static inline u32 stmmac_rx_dirty(struct stmmac_priv *priv, u32 queue) static bool stmmac_eee_tx_busy(struct stmmac_priv *priv) { - u32 tx_cnt = priv->plat->tx_queues_to_use; - u32 queue; + u8 tx_cnt = priv->plat->tx_queues_to_use; + u8 queue; /* check if all TX queues have the work finished */ for (queue = 0; queue < tx_cnt; queue++) { @@ -909,7 +909,7 @@ static int stmmac_legacy_serdes_power_up(struct stmmac_priv *priv) static void stmmac_mac_flow_ctrl(struct stmmac_priv *priv, u32 duplex, unsigned int flow_ctrl) { - u32 tx_cnt = priv->plat->tx_queues_to_use; + u8 tx_cnt = priv->plat->tx_queues_to_use; stmmac_flow_ctrl(priv, priv->hw, duplex, flow_ctrl, priv->pause_time, tx_cnt); @@ -1410,10 +1410,10 @@ static int stmmac_phylink_setup(struct stmmac_priv *priv) static void stmmac_display_rx_rings(struct stmmac_priv *priv, struct stmmac_dma_conf *dma_conf) { - u32 rx_cnt = priv->plat->rx_queues_to_use; + u8 rx_cnt = priv->plat->rx_queues_to_use; unsigned int desc_size; void *head_rx; - u32 queue; + u8 queue; /* Display RX rings */ for (queue = 0; queue < rx_cnt; queue++) { @@ -1438,10 +1438,10 @@ static void stmmac_display_rx_rings(struct stmmac_priv *priv, static void stmmac_display_tx_rings(struct stmmac_priv *priv, struct stmmac_dma_conf *dma_conf) { - u32 tx_cnt = priv->plat->tx_queues_to_use; + u8 tx_cnt = priv->plat->tx_queues_to_use; unsigned int desc_size; void *head_tx; - u32 queue; + u8 queue; /* Display TX rings */ for (queue = 0; queue < tx_cnt; queue++) { @@ -1571,9 +1571,9 @@ static void stmmac_clear_tx_descriptors(struct stmmac_priv *priv, static void stmmac_clear_descriptors(struct stmmac_priv *priv, struct stmmac_dma_conf *dma_conf) { - u32 rx_queue_cnt = priv->plat->rx_queues_to_use; - u32 tx_queue_cnt = priv->plat->tx_queues_to_use; - u32 queue; + u8 rx_queue_cnt = priv->plat->rx_queues_to_use; + u8 tx_queue_cnt = priv->plat->tx_queues_to_use; + u8 queue; /* Clear the RX descriptors */ for (queue = 0; queue < rx_queue_cnt; queue++) @@ -1891,7 +1891,7 @@ static int init_dma_rx_desc_rings(struct net_device *dev, gfp_t flags) { struct stmmac_priv *priv = netdev_priv(dev); - u32 rx_count = priv->plat->rx_queues_to_use; + u8 rx_count = priv->plat->rx_queues_to_use; int queue; int ret; @@ -1985,8 +1985,8 @@ static int init_dma_tx_desc_rings(struct net_device *dev, struct stmmac_dma_conf *dma_conf) { struct stmmac_priv *priv = netdev_priv(dev); - u32 tx_queue_cnt; - u32 queue; + u8 tx_queue_cnt; + u8 queue; tx_queue_cnt = priv->plat->tx_queues_to_use; @@ -2057,8 +2057,8 @@ static void dma_free_tx_skbufs(struct stmmac_priv *priv, */ static void stmmac_free_tx_skbufs(struct stmmac_priv *priv) { - u32 tx_queue_cnt = priv->plat->tx_queues_to_use; - u32 queue; + u8 tx_queue_cnt = priv->plat->tx_queues_to_use; + u8 queue; for (queue = 0; queue < tx_queue_cnt; queue++) dma_free_tx_skbufs(priv, &priv->dma_conf, queue); @@ -2106,8 +2106,8 @@ static void __free_dma_rx_desc_resources(struct stmmac_priv *priv, static void free_dma_rx_desc_resources(struct stmmac_priv *priv, struct stmmac_dma_conf *dma_conf) { - u32 rx_count = priv->plat->rx_queues_to_use; - u32 queue; + u8 rx_count = priv->plat->rx_queues_to_use; + u8 queue; /* Free RX queue resources */ for (queue = 0; queue < rx_count; queue++) @@ -2153,8 +2153,8 @@ static void __free_dma_tx_desc_resources(struct stmmac_priv *priv, static void free_dma_tx_desc_resources(struct stmmac_priv *priv, struct stmmac_dma_conf *dma_conf) { - u32 tx_count = priv->plat->tx_queues_to_use; - u32 queue; + u8 tx_count = priv->plat->tx_queues_to_use; + u8 queue; /* Free TX queue resources */ for (queue = 0; queue < tx_count; queue++) @@ -2255,8 +2255,8 @@ static int __alloc_dma_rx_desc_resources(struct stmmac_priv *priv, static int alloc_dma_rx_desc_resources(struct stmmac_priv *priv, struct stmmac_dma_conf *dma_conf) { - u32 rx_count = priv->plat->rx_queues_to_use; - u32 queue; + u8 rx_count = priv->plat->rx_queues_to_use; + u8 queue; int ret; /* RX queues buffers and DMA */ @@ -2331,8 +2331,8 @@ static int __alloc_dma_tx_desc_resources(struct stmmac_priv *priv, static int alloc_dma_tx_desc_resources(struct stmmac_priv *priv, struct stmmac_dma_conf *dma_conf) { - u32 tx_count = priv->plat->tx_queues_to_use; - u32 queue; + u8 tx_count = priv->plat->tx_queues_to_use; + u8 queue; int ret; /* TX queues buffers and DMA */ @@ -2396,8 +2396,8 @@ static void free_dma_desc_resources(struct stmmac_priv *priv, */ static void stmmac_mac_enable_rx_queues(struct stmmac_priv *priv) { - u32 rx_queues_count = priv->plat->rx_queues_to_use; - int queue; + u8 rx_queues_count = priv->plat->rx_queues_to_use; + u8 queue; u8 mode; for (queue = 0; queue < rx_queues_count; queue++) { @@ -2460,10 +2460,10 @@ static void stmmac_stop_tx_dma(struct stmmac_priv *priv, u32 chan) static void stmmac_enable_all_dma_irq(struct stmmac_priv *priv) { - u32 rx_channels_count = priv->plat->rx_queues_to_use; - u32 tx_channels_count = priv->plat->tx_queues_to_use; - u32 dma_csr_ch = max(rx_channels_count, tx_channels_count); - u32 chan; + u8 rx_channels_count = priv->plat->rx_queues_to_use; + u8 tx_channels_count = priv->plat->tx_queues_to_use; + u8 dma_csr_ch = max(rx_channels_count, tx_channels_count); + u8 chan; for (chan = 0; chan < dma_csr_ch; chan++) { struct stmmac_channel *ch = &priv->channel[chan]; @@ -2483,9 +2483,9 @@ static void stmmac_enable_all_dma_irq(struct stmmac_priv *priv) */ static void stmmac_start_all_dma(struct stmmac_priv *priv) { - u32 rx_channels_count = priv->plat->rx_queues_to_use; - u32 tx_channels_count = priv->plat->tx_queues_to_use; - u32 chan = 0; + u8 rx_channels_count = priv->plat->rx_queues_to_use; + u8 tx_channels_count = priv->plat->tx_queues_to_use; + u8 chan; for (chan = 0; chan < rx_channels_count; chan++) stmmac_start_rx_dma(priv, chan); @@ -2502,9 +2502,9 @@ static void stmmac_start_all_dma(struct stmmac_priv *priv) */ static void stmmac_stop_all_dma(struct stmmac_priv *priv) { - u32 rx_channels_count = priv->plat->rx_queues_to_use; - u32 tx_channels_count = priv->plat->tx_queues_to_use; - u32 chan = 0; + u8 rx_channels_count = priv->plat->rx_queues_to_use; + u8 tx_channels_count = priv->plat->tx_queues_to_use; + u8 chan; for (chan = 0; chan < rx_channels_count; chan++) stmmac_stop_rx_dma(priv, chan); @@ -2521,14 +2521,14 @@ static void stmmac_stop_all_dma(struct stmmac_priv *priv) */ static void stmmac_dma_operation_mode(struct stmmac_priv *priv) { - u32 rx_channels_count = priv->plat->rx_queues_to_use; - u32 tx_channels_count = priv->plat->tx_queues_to_use; + u8 rx_channels_count = priv->plat->rx_queues_to_use; + u8 tx_channels_count = priv->plat->tx_queues_to_use; int rxfifosz = priv->plat->rx_fifo_size; int txfifosz = priv->plat->tx_fifo_size; u32 txmode = 0; u32 rxmode = 0; - u32 chan = 0; u8 qmode = 0; + u8 chan; if (rxfifosz == 0) rxfifosz = priv->dma_cap.rx_fifo_size; @@ -3012,8 +3012,8 @@ static void stmmac_set_dma_operation_mode(struct stmmac_priv *priv, u32 txmode, { u8 rxqmode = priv->plat->rx_queues_cfg[chan].mode_to_use; u8 txqmode = priv->plat->tx_queues_cfg[chan].mode_to_use; - u32 rx_channels_count = priv->plat->rx_queues_to_use; - u32 tx_channels_count = priv->plat->tx_queues_to_use; + u8 rx_channels_count = priv->plat->rx_queues_to_use; + u8 tx_channels_count = priv->plat->tx_queues_to_use; int rxfifosz = priv->plat->rx_fifo_size; int txfifosz = priv->plat->tx_fifo_size; @@ -3088,12 +3088,12 @@ static int stmmac_napi_check(struct stmmac_priv *priv, u32 chan, u32 dir) */ static void stmmac_dma_interrupt(struct stmmac_priv *priv) { - u32 tx_channel_count = priv->plat->tx_queues_to_use; - u32 rx_channel_count = priv->plat->rx_queues_to_use; - u32 channels_to_check = tx_channel_count > rx_channel_count ? - tx_channel_count : rx_channel_count; - u32 chan; + u8 tx_channel_count = priv->plat->tx_queues_to_use; + u8 rx_channel_count = priv->plat->rx_queues_to_use; + u8 channels_to_check = tx_channel_count > rx_channel_count ? + tx_channel_count : rx_channel_count; int status[MAX_T(u32, MTL_MAX_TX_QUEUES, MTL_MAX_RX_QUEUES)]; + u8 chan; /* Make sure we never check beyond our status buffer. */ if (WARN_ON_ONCE(channels_to_check > ARRAY_SIZE(status))) @@ -3237,13 +3237,13 @@ static int stmmac_prereset_configure(struct stmmac_priv *priv) */ static int stmmac_init_dma_engine(struct stmmac_priv *priv) { - u32 rx_channels_count = priv->plat->rx_queues_to_use; - u32 tx_channels_count = priv->plat->tx_queues_to_use; - u32 dma_csr_ch = max(rx_channels_count, tx_channels_count); + u8 rx_channels_count = priv->plat->rx_queues_to_use; + u8 tx_channels_count = priv->plat->tx_queues_to_use; + u8 dma_csr_ch = max(rx_channels_count, tx_channels_count); struct stmmac_rx_queue *rx_q; struct stmmac_tx_queue *tx_q; - u32 chan = 0; int ret = 0; + u8 chan; ret = stmmac_prereset_configure(priv); if (ret) @@ -3359,9 +3359,9 @@ static enum hrtimer_restart stmmac_tx_timer(struct hrtimer *t) */ static void stmmac_init_coalesce(struct stmmac_priv *priv) { - u32 tx_channel_count = priv->plat->tx_queues_to_use; - u32 rx_channel_count = priv->plat->rx_queues_to_use; - u32 chan; + u8 tx_channel_count = priv->plat->tx_queues_to_use; + u8 rx_channel_count = priv->plat->rx_queues_to_use; + u8 chan; for (chan = 0; chan < tx_channel_count; chan++) { struct stmmac_tx_queue *tx_q = &priv->dma_conf.tx_queue[chan]; @@ -3378,9 +3378,9 @@ static void stmmac_init_coalesce(struct stmmac_priv *priv) static void stmmac_set_rings_length(struct stmmac_priv *priv) { - u32 rx_channels_count = priv->plat->rx_queues_to_use; - u32 tx_channels_count = priv->plat->tx_queues_to_use; - u32 chan; + u8 rx_channels_count = priv->plat->rx_queues_to_use; + u8 tx_channels_count = priv->plat->tx_queues_to_use; + u8 chan; /* set TX ring length */ for (chan = 0; chan < tx_channels_count; chan++) @@ -3400,9 +3400,9 @@ static void stmmac_set_rings_length(struct stmmac_priv *priv) */ static void stmmac_set_tx_queue_weight(struct stmmac_priv *priv) { - u32 tx_queues_count = priv->plat->tx_queues_to_use; + u8 tx_queues_count = priv->plat->tx_queues_to_use; u32 weight; - u32 queue; + u8 queue; for (queue = 0; queue < tx_queues_count; queue++) { weight = priv->plat->tx_queues_cfg[queue].weight; @@ -3417,9 +3417,9 @@ static void stmmac_set_tx_queue_weight(struct stmmac_priv *priv) */ static void stmmac_configure_cbs(struct stmmac_priv *priv) { - u32 tx_queues_count = priv->plat->tx_queues_to_use; + u8 tx_queues_count = priv->plat->tx_queues_to_use; u32 mode_to_use; - u32 queue; + u8 queue; /* queue 0 is reserved for legacy traffic */ for (queue = 1; queue < tx_queues_count; queue++) { @@ -3443,8 +3443,8 @@ static void stmmac_configure_cbs(struct stmmac_priv *priv) */ static void stmmac_rx_queue_dma_chan_map(struct stmmac_priv *priv) { - u32 rx_queues_count = priv->plat->rx_queues_to_use; - u32 queue; + u8 rx_queues_count = priv->plat->rx_queues_to_use; + u8 queue; u32 chan; for (queue = 0; queue < rx_queues_count; queue++) { @@ -3460,8 +3460,8 @@ static void stmmac_rx_queue_dma_chan_map(struct stmmac_priv *priv) */ static void stmmac_mac_config_rx_queues_prio(struct stmmac_priv *priv) { - u32 rx_queues_count = priv->plat->rx_queues_to_use; - u32 queue; + u8 rx_queues_count = priv->plat->rx_queues_to_use; + u8 queue; u32 prio; for (queue = 0; queue < rx_queues_count; queue++) { @@ -3480,8 +3480,8 @@ static void stmmac_mac_config_rx_queues_prio(struct stmmac_priv *priv) */ static void stmmac_mac_config_tx_queues_prio(struct stmmac_priv *priv) { - u32 tx_queues_count = priv->plat->tx_queues_to_use; - u32 queue; + u8 tx_queues_count = priv->plat->tx_queues_to_use; + u8 queue; u32 prio; for (queue = 0; queue < tx_queues_count; queue++) { @@ -3500,9 +3500,9 @@ static void stmmac_mac_config_tx_queues_prio(struct stmmac_priv *priv) */ static void stmmac_mac_config_rx_queues_routing(struct stmmac_priv *priv) { - u32 rx_queues_count = priv->plat->rx_queues_to_use; - u32 queue; + u8 rx_queues_count = priv->plat->rx_queues_to_use; u8 packet; + u8 queue; for (queue = 0; queue < rx_queues_count; queue++) { /* no specific packet type routing specified for the queue */ @@ -3537,8 +3537,8 @@ static void stmmac_mac_config_rss(struct stmmac_priv *priv) */ static void stmmac_mtl_configuration(struct stmmac_priv *priv) { - u32 rx_queues_count = priv->plat->rx_queues_to_use; - u32 tx_queues_count = priv->plat->tx_queues_to_use; + u8 rx_queues_count = priv->plat->rx_queues_to_use; + u8 tx_queues_count = priv->plat->tx_queues_to_use; if (tx_queues_count > 1) stmmac_set_tx_queue_weight(priv); @@ -3606,10 +3606,10 @@ static void stmmac_safety_feat_configuration(struct stmmac_priv *priv) static int stmmac_hw_setup(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); - u32 rx_cnt = priv->plat->rx_queues_to_use; - u32 tx_cnt = priv->plat->tx_queues_to_use; + u8 rx_cnt = priv->plat->rx_queues_to_use; + u8 tx_cnt = priv->plat->tx_queues_to_use; bool sph_en; - u32 chan; + u8 chan; int ret; /* Make sure RX clock is enabled */ @@ -4001,7 +4001,8 @@ static struct stmmac_dma_conf * stmmac_setup_dma_desc(struct stmmac_priv *priv, unsigned int mtu) { struct stmmac_dma_conf *dma_conf; - int chan, bfsize, ret; + int bfsize, ret; + u8 chan; dma_conf = kzalloc_obj(*dma_conf); if (!dma_conf) { @@ -4076,7 +4077,7 @@ static int __stmmac_open(struct net_device *dev, struct stmmac_dma_conf *dma_conf) { struct stmmac_priv *priv = netdev_priv(dev); - u32 chan; + u8 chan; int ret; for (int i = 0; i < MTL_MAX_TX_QUEUES; i++) @@ -4175,7 +4176,7 @@ err_dma_resources: static void __stmmac_release(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); - u32 chan; + u8 chan; /* Stop and disconnect the PHY */ phylink_stop(priv->phylink); @@ -6123,7 +6124,7 @@ static int stmmac_set_features(struct net_device *netdev, if (priv->sph_capable) { bool sph_en = (priv->hw->rx_csum > 0) && priv->sph_active; - u32 chan; + u8 chan; for (chan = 0; chan < priv->plat->rx_queues_to_use; chan++) stmmac_enable_sph(priv, priv->ioaddr, sph_en, chan); @@ -6143,11 +6144,11 @@ static int stmmac_set_features(struct net_device *netdev, static void stmmac_common_interrupt(struct stmmac_priv *priv) { - u32 rx_cnt = priv->plat->rx_queues_to_use; - u32 tx_cnt = priv->plat->tx_queues_to_use; - u32 queues_count; - u32 queue; + u8 rx_cnt = priv->plat->rx_queues_to_use; + u8 tx_cnt = priv->plat->tx_queues_to_use; + u8 queues_count; bool xmac; + u8 queue; xmac = dwmac_is_xmac(priv->plat->core_type); queues_count = (rx_cnt > tx_cnt) ? rx_cnt : tx_cnt; @@ -6445,9 +6446,9 @@ static int stmmac_rings_status_show(struct seq_file *seq, void *v) { struct net_device *dev = seq->private; struct stmmac_priv *priv = netdev_priv(dev); - u32 rx_count = priv->plat->rx_queues_to_use; - u32 tx_count = priv->plat->tx_queues_to_use; - u32 queue; + u8 rx_count = priv->plat->rx_queues_to_use; + u8 tx_count = priv->plat->tx_queues_to_use; + u8 queue; if ((dev->flags & IFF_UP) == 0) return 0; @@ -6572,9 +6573,9 @@ static int stmmac_dma_cap_show(struct seq_file *seq, void *v) priv->dma_cap.number_rx_channel); seq_printf(seq, "\tNumber of Additional TX channel: %d\n", priv->dma_cap.number_tx_channel); - seq_printf(seq, "\tNumber of Additional RX queues: %d\n", + seq_printf(seq, "\tNumber of Additional RX queues: %u\n", priv->dma_cap.number_rx_queues); - seq_printf(seq, "\tNumber of Additional TX queues: %d\n", + seq_printf(seq, "\tNumber of Additional TX queues: %u\n", priv->dma_cap.number_tx_queues); seq_printf(seq, "\tEnhanced descriptors: %s\n", (priv->dma_cap.enh_desc) ? "Y" : "N"); @@ -7043,7 +7044,7 @@ void stmmac_enable_tx_queue(struct stmmac_priv *priv, u32 queue) void stmmac_xdp_release(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); - u32 chan; + u8 chan; /* Ensure tx function is not running */ netif_tx_disable(dev); @@ -7076,14 +7077,14 @@ void stmmac_xdp_release(struct net_device *dev) int stmmac_xdp_open(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); - u32 rx_cnt = priv->plat->rx_queues_to_use; - u32 tx_cnt = priv->plat->tx_queues_to_use; - u32 dma_csr_ch = max(rx_cnt, tx_cnt); + u8 rx_cnt = priv->plat->rx_queues_to_use; + u8 tx_cnt = priv->plat->tx_queues_to_use; + u8 dma_csr_ch = max(rx_cnt, tx_cnt); struct stmmac_rx_queue *rx_q; struct stmmac_tx_queue *tx_q; u32 buf_size; bool sph_en; - u32 chan; + u8 chan; int ret; ret = alloc_dma_desc_resources(priv, &priv->dma_conf); @@ -7219,10 +7220,10 @@ int stmmac_xsk_wakeup(struct net_device *dev, u32 queue, u32 flags) static void stmmac_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct stmmac_priv *priv = netdev_priv(dev); - u32 tx_cnt = priv->plat->tx_queues_to_use; - u32 rx_cnt = priv->plat->rx_queues_to_use; + u8 tx_cnt = priv->plat->tx_queues_to_use; + u8 rx_cnt = priv->plat->rx_queues_to_use; unsigned int start; - int q; + u8 q; for (q = 0; q < tx_cnt; q++) { struct stmmac_txq_stats *txq_stats = &priv->xstats.txq_stats[q]; @@ -7511,7 +7512,7 @@ static int stmmac_hw_init(struct stmmac_priv *priv) static void stmmac_napi_add(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); - u32 queue, maxq; + u8 queue, maxq; maxq = max(priv->plat->rx_queues_to_use, priv->plat->tx_queues_to_use); @@ -7540,7 +7541,7 @@ static void stmmac_napi_add(struct net_device *dev) static void stmmac_napi_del(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); - u32 queue, maxq; + u8 queue, maxq; maxq = max(priv->plat->rx_queues_to_use, priv->plat->tx_queues_to_use); @@ -7558,7 +7559,7 @@ static void stmmac_napi_del(struct net_device *dev) } } -int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) +int stmmac_reinit_queues(struct net_device *dev, u8 rx_cnt, u8 tx_cnt) { struct stmmac_priv *priv = netdev_priv(dev); int ret = 0, i; @@ -7763,8 +7764,8 @@ static int __stmmac_dvr_probe(struct device *device, { struct net_device *ndev = NULL; struct stmmac_priv *priv; - u32 rxq; int i, ret = 0; + u8 rxq; if (!plat_dat->dma_cfg || !plat_dat->dma_cfg->pbl) { dev_err(device, "invalid DMA configuration\n"); @@ -8147,7 +8148,7 @@ int stmmac_suspend(struct device *dev) { struct net_device *ndev = dev_get_drvdata(dev); struct stmmac_priv *priv = netdev_priv(ndev); - u32 chan; + u8 chan; if (!ndev || !netif_running(ndev)) goto suspend_bsp; @@ -8222,9 +8223,9 @@ static void stmmac_reset_tx_queue(struct stmmac_priv *priv, u32 queue) */ static void stmmac_reset_queues_param(struct stmmac_priv *priv) { - u32 rx_cnt = priv->plat->rx_queues_to_use; - u32 tx_cnt = priv->plat->tx_queues_to_use; - u32 queue; + u8 rx_cnt = priv->plat->rx_queues_to_use; + u8 tx_cnt = priv->plat->tx_queues_to_use; + u8 queue; for (queue = 0; queue < rx_cnt; queue++) stmmac_reset_rx_queue(priv, queue); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c index 0d3bad0f8915..3b514a702612 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c @@ -138,6 +138,7 @@ static int stmmac_mtl_setup(struct platform_device *pdev, struct device_node *tx_node; u8 queue = 0; int ret = 0; + u32 value; /* First Queue must always be in DCB mode. As MTL_QUEUE_DCB = 1 we need * to always set this, otherwise Queue will be classified as AVB @@ -157,8 +158,11 @@ static int stmmac_mtl_setup(struct platform_device *pdev, } /* Processing RX queues common config */ - of_property_read_u32(rx_node, "snps,rx-queues-to-use", - &plat->rx_queues_to_use); + if (!of_property_read_u32(rx_node, "snps,rx-queues-to-use", &value)) { + if (value > U8_MAX) + value = U8_MAX; + plat->rx_queues_to_use = value; + } if (of_property_read_bool(rx_node, "snps,rx-sched-sp")) plat->rx_sched_algorithm = MTL_RX_ALGORITHM_SP; @@ -208,8 +212,11 @@ static int stmmac_mtl_setup(struct platform_device *pdev, } /* Processing TX queues common config */ - of_property_read_u32(tx_node, "snps,tx-queues-to-use", - &plat->tx_queues_to_use); + if (!of_property_read_u32(tx_node, "snps,tx-queues-to-use", &value)) { + if (value > U8_MAX) + value = U8_MAX; + plat->tx_queues_to_use = value; + } if (of_property_read_bool(tx_node, "snps,tx-sched-wrr")) plat->tx_sched_algorithm = MTL_TX_ALGORITHM_WRR; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 411cdd3ea034..03fd85060a73 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -244,8 +244,8 @@ struct plat_stmmacenet_data { int tx_fifo_size; int rx_fifo_size; u32 host_dma_width; - u32 rx_queues_to_use; - u32 tx_queues_to_use; + u8 rx_queues_to_use; + u8 tx_queues_to_use; u8 rx_sched_algorithm; u8 tx_sched_algorithm; struct stmmac_rxq_cfg rx_queues_cfg[MTL_MAX_RX_QUEUES]; -- cgit v1.2.3 From 758ed85aadd0668c66cb359c63f384992b10938c Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:39 +0000 Subject: net: stmmac: use u8 for host_dma_width and similar struct members We aren't going to see >= 256-bit address busses soon, so reduce host_dma_width and associated other struct members that initialise this from u32 to u8. Signed-off-by: Russell King (Oracle) Acked-by: Mohd Ayaan Anwar # qcom-ethqos Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX5P-0000000CVsK-0iwX@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c | 6 +++--- drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 2 +- include/linux/stmmac.h | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c index 9f5a15b81f8a..9d1bd72ffb73 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-imx.c @@ -42,8 +42,8 @@ struct imx_priv_data; struct imx_dwmac_ops { - u32 addr_width; u32 flags; + u8 addr_width; bool mac_rgmii_txclk_auto_adj; int (*fix_soc_reset)(struct stmmac_priv *priv); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c index a139db6a8cbb..30ae0dba7fff 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c @@ -93,9 +93,9 @@ struct mediatek_dwmac_variant { const char * const *clk_list; int num_clks; - u32 dma_bit_mask; u32 rx_delay_max; u32 tx_delay_max; + u8 dma_bit_mask; }; /* list of clocks required for mac */ @@ -268,9 +268,9 @@ static const struct mediatek_dwmac_variant mt2712_gmac_variant = { .dwmac_set_delay = mt2712_set_delay, .clk_list = mt2712_dwmac_clk_l, .num_clks = ARRAY_SIZE(mt2712_dwmac_clk_l), - .dma_bit_mask = 33, .rx_delay_max = 17600, .tx_delay_max = 17600, + .dma_bit_mask = 33, }; static int mt8195_set_interface(struct mediatek_dwmac_plat_data *plat, @@ -418,9 +418,9 @@ static const struct mediatek_dwmac_variant mt8195_gmac_variant = { .dwmac_set_delay = mt8195_set_delay, .clk_list = mt8195_dwmac_clk_l, .num_clks = ARRAY_SIZE(mt8195_dwmac_clk_l), - .dma_bit_mask = 35, .rx_delay_max = 9280, .tx_delay_max = 9280, + .dma_bit_mask = 35, }; static int mediatek_dwmac_config_dt(struct mediatek_dwmac_plat_data *plat) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index 388e9fdeb86c..3ccf20fdf52a 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -91,8 +91,8 @@ struct ethqos_emac_driver_data { unsigned int num_rgmii_por; bool rgmii_config_loopback_en; bool has_emac_ge_3; + u8 dma_addr_width; const char *link_clk_name; - u32 dma_addr_width; struct dwmac4_addrs dwmac4_addrs; bool needs_sgmii_loopback; }; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 03fd85060a73..11886189bf51 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -243,7 +243,7 @@ struct plat_stmmacenet_data { int unicast_filter_entries; int tx_fifo_size; int rx_fifo_size; - u32 host_dma_width; + u8 host_dma_width; u8 rx_queues_to_use; u8 tx_queues_to_use; u8 rx_sched_algorithm; -- cgit v1.2.3 From 9fe167ab790b10c9eb9ef82f46a03c83f9953b61 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:44 +0000 Subject: net: stmmac: add documentation for stmmac_dma_cfg members Add documentation of each of the struct stmmac_dma_cfg members. dche remains undocumented as I don't have documentation that covers this. Signed-off-by: Russell King (Oracle) Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX5U-0000000CVsQ-162V@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 11886189bf51..1af3a5e197c9 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -93,16 +93,37 @@ struct stmmac_mdio_bus_data { }; struct stmmac_dma_cfg { + /* pbl: programmable burst limit + * txpbl: transmit programmable burst limit + * rxpbl: receive programmable burst limit + * If txpbl or rxpbl are zero, the value of pbl will be substituted. + * Range 0 - 63. + */ int pbl; int txpbl; int rxpbl; + /* pblx8: multiplies pbl, txpbl, rxpbl by a factor of 8 for dwmac >= + * 3.50a, or a factor of 4 for previous versions. + */ bool pblx8; + /* fixed_burst: + * when set, AXI bursts defined by axi_blen_regval are permitted. + * AHB uses SINGLE, INCR4, INCR8 or INCR16 during burst transfers. + * when clear, AXI and AHB use SINGLE or INCR bursts. + */ bool fixed_burst; + /* mixed_burst: + * when set and fixed_burst is clear, AHB uses INCR for bursts > 16 + * and SINGLE or INCRx for bursts <= 16. + */ bool mixed_burst; + /* aal: address aligned bursts for AHB and AXI master interface */ bool aal; + bool dche; bool eame; + /* multi_msi_en: stmmac core internal */ bool multi_msi_en; - bool dche; + /* atds: stmmac core internal */ bool atds; }; -- cgit v1.2.3 From 315bab9411f3bd3465a47a64a3e44323bfab60be Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 9 Mar 2026 09:39:49 +0000 Subject: net: stmmac: add documentation for clocks Add documentation covering stmmac_clk, pclk, clk_ptp_ref and clk_tx_i in the hope that this will help understand what each of these clocks are for. There is confusion around stmmac_clk and pclk which can't be easily resolved today as the Imagination Technologies Pistachio board that pclk was introduced for has no public documentation and is likely now obsolete. So the origins of pclk are lost to the winds of time. Signed-off-by: Russell King (Oracle) Tested-by: Mohd Ayaan Anwar Link: https://patch.msgid.link/E1vzX5Z-0000000CVsb-1XTm@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 1af3a5e197c9..937985276e6b 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -300,10 +300,41 @@ struct plat_stmmacenet_data { struct phylink_pcs *(*select_pcs)(struct stmmac_priv *priv, phy_interface_t interface); void *bsp_priv; + + /* stmmac clocks: + * stmmac_clk: CSR clock (which can be hclk_i, clk_csr_i, aclk_i, + * or clk_app_i depending on GMAC configuration). This clock + * generates the MDC clock. + * + * pclk: introduced for Imagination Technologies Pistachio board - + * see 5f9755d26fbf ("stmmac: Add an optional register interface + * clock"). This is probably used for cases where separate clocks + * are provided for the host interface and register interface. In + * this case, as the MDC clock is derived from stmmac_clk, pclk + * can only really be the "application clock" for the "host + * interface" and not the "register interface" aka CSR clock as + * it is never used when determining the divider for the MDC + * clock. + * + * clk_ptp_ref: optional PTP reference clock (clk_ptp_ref_i). When + * present, this clock increments the timestamp value. Otherwise, + * the rate of stmmac_clk will be used. + * + * clk_tx_i: MAC transmit clock, which will be 2.5MHz for 10M, + * 25MHz for 100M, or 125MHz for 1G irrespective of the interface + * mode. For the DWMAC PHY interface modes: + * + * GMII/MII PHY's transmit clock for 10M (2.5MHz) or 100M (25MHz), + * or 125MHz local clock for 1G mode + * RMII 50MHz RMII clock divided by 2 or 20. + * RGMII 125MHz local clock divided by 1, 5, or 50. + * SGMII 125MHz SerDes clock divided by 1, 5, or 50. + * TBI/RTBI 125MHz SerDes clock + */ struct clk *stmmac_clk; struct clk *pclk; struct clk *clk_ptp_ref; - struct clk *clk_tx_i; /* clk_tx_i to MAC core */ + struct clk *clk_tx_i; unsigned long clk_ptp_rate; unsigned long clk_ref_rate; struct clk_bulk_data *clks; -- cgit v1.2.3 From 7aba71dbc41641e43a79fb9f6fac91719094b4fb Mon Sep 17 00:00:00 2001 From: Thomas Hellström Date: Thu, 5 Mar 2026 10:39:06 +0100 Subject: mm/mmu_notifier: Allow two-pass struct mmu_interval_notifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GPU use-cases for mmu_interval_notifiers with hmm often involve starting a gpu operation and then waiting for it to complete. These operations are typically context preemption or TLB flushing. With single-pass notifiers per GPU this doesn't scale in multi-gpu scenarios. In those scenarios we'd want to first start preemption- or TLB flushing on all GPUs and as a second pass wait for them to complete. One can do this on per-driver basis multiplexing per-driver notifiers but that would mean sharing the notifier "user" lock across all GPUs and that doesn't scale well either, so adding support for multi-pass in the core appears to be the right choice. Implement two-pass capability in the mmu_interval_notifier. Use a linked list for the final passes to minimize the impact for use-cases that don't need the multi-pass functionality by avoiding a second interval tree walk, and to be able to easily pass data between the two passes. v1: - Restrict to two passes (Jason Gunthorpe) - Improve on documentation (Jason Gunthorpe) - Improve on function naming (Alistair Popple) v2: - Include the invalidate_finish() callback in the struct mmu_interval_notifier_ops. - Update documentation (GitHub Copilot:claude-sonnet-4.6) - Use lockless list for list management. v3: - Update kerneldoc for the struct mmu_interval_notifier_finish::list member (Matthew Brost) - Add a WARN_ON_ONCE() checking for NULL invalidate_finish() op if if invalidate_start() is non-NULL. (Matthew Brost) v4: - Addressed documentation review comments by David Hildenbrand. Cc: Matthew Brost Cc: Christian König Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Liam R. Howlett Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Simona Vetter Cc: Dave Airlie Cc: Alistair Popple Cc: Cc: Cc: Assisted-by: GitHub Copilot:claude-sonnet-4.6 # Documentation only. Signed-off-by: Thomas Hellström Acked-by: David Hildenbrand (Arm) Reviewed-by: Maarten Lankhorst Link: https://patch.msgid.link/20260305093909.43623-2-thomas.hellstrom@linux.intel.com --- include/linux/mmu_notifier.h | 42 ++++++++++++++++++++++++++++ mm/mmu_notifier.c | 65 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 98 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index d1094c2d5fb6..b60673a8e0bb 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -233,16 +233,58 @@ struct mmu_notifier { unsigned int users; }; +/** + * struct mmu_interval_notifier_finish - mmu_interval_notifier two-pass abstraction + * @link: Lockless list link for the notifiers pending pass list + * @notifier: The mmu_interval_notifier for which the finish pass is called. + * + * Allocate, typically using GFP_NOWAIT in the interval notifier's start pass. + * Note that with a large number of notifiers implementing two passes, + * allocation with GFP_NOWAIT will become increasingly likely to fail, so consider + * implementing a small pool instead of using kmalloc() allocations. + * + * If the implementation needs to pass data between the start and the finish passes, + * the recommended way is to embed struct mmu_interval_notifier_finish into a larger + * structure that also contains the data needed to be shared. Keep in mind that + * a notifier callback can be invoked in parallel, and each invocation needs its + * own struct mmu_interval_notifier_finish. + * + * If allocation fails, then the &mmu_interval_notifier_ops->invalidate_start op + * needs to implements the full notifier functionality. Please refer to its + * documentation. + */ +struct mmu_interval_notifier_finish { + struct llist_node link; + struct mmu_interval_notifier *notifier; +}; + /** * struct mmu_interval_notifier_ops * @invalidate: Upon return the caller must stop using any SPTEs within this * range. This function can sleep. Return false only if sleeping * was required but mmu_notifier_range_blockable(range) is false. + * @invalidate_start: Similar to @invalidate, but intended for two-pass notifier + * callbacks where the call to @invalidate_start is the first + * pass and any struct mmu_interval_notifier_finish pointer + * returned in the @finish parameter describes the finish pass. + * If *@finish is %NULL on return, then no final pass will be + * called, and @invalidate_start needs to implement the full + * notifier, behaving like @invalidate. The value of *@finish + * is guaranteed to be %NULL at function entry. + * @invalidate_finish: Called as the second pass for any notifier that returned + * a non-NULL *@finish from @invalidate_start. The @finish + * pointer passed here is the same one returned by + * @invalidate_start. */ struct mmu_interval_notifier_ops { bool (*invalidate)(struct mmu_interval_notifier *interval_sub, const struct mmu_notifier_range *range, unsigned long cur_seq); + bool (*invalidate_start)(struct mmu_interval_notifier *interval_sub, + const struct mmu_notifier_range *range, + unsigned long cur_seq, + struct mmu_interval_notifier_finish **finish); + void (*invalidate_finish)(struct mmu_interval_notifier_finish *finish); }; struct mmu_interval_notifier { diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 8e0125dc0522..33023dbbd76d 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -260,6 +260,15 @@ mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub) } EXPORT_SYMBOL_GPL(mmu_interval_read_begin); +static void mn_itree_finish_pass(struct llist_head *finish_passes) +{ + struct llist_node *first = llist_reverse_order(__llist_del_all(finish_passes)); + struct mmu_interval_notifier_finish *f, *next; + + llist_for_each_entry_safe(f, next, first, link) + f->notifier->ops->invalidate_finish(f); +} + static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions, struct mm_struct *mm) { @@ -271,6 +280,7 @@ static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions, .end = ULONG_MAX, }; struct mmu_interval_notifier *interval_sub; + LLIST_HEAD(finish_passes); unsigned long cur_seq; bool ret; @@ -278,11 +288,27 @@ static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions, mn_itree_inv_start_range(subscriptions, &range, &cur_seq); interval_sub; interval_sub = mn_itree_inv_next(interval_sub, &range)) { - ret = interval_sub->ops->invalidate(interval_sub, &range, - cur_seq); + if (interval_sub->ops->invalidate_start) { + struct mmu_interval_notifier_finish *finish = NULL; + + ret = interval_sub->ops->invalidate_start(interval_sub, + &range, + cur_seq, + &finish); + if (ret && finish) { + finish->notifier = interval_sub; + __llist_add(&finish->link, &finish_passes); + } + + } else { + ret = interval_sub->ops->invalidate(interval_sub, + &range, + cur_seq); + } WARN_ON(!ret); } + mn_itree_finish_pass(&finish_passes); mn_itree_inv_end(subscriptions); } @@ -430,7 +456,9 @@ static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions, const struct mmu_notifier_range *range) { struct mmu_interval_notifier *interval_sub; + LLIST_HEAD(finish_passes); unsigned long cur_seq; + int err = 0; for (interval_sub = mn_itree_inv_start_range(subscriptions, range, &cur_seq); @@ -438,23 +466,41 @@ static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions, interval_sub = mn_itree_inv_next(interval_sub, range)) { bool ret; - ret = interval_sub->ops->invalidate(interval_sub, range, - cur_seq); + if (interval_sub->ops->invalidate_start) { + struct mmu_interval_notifier_finish *finish = NULL; + + ret = interval_sub->ops->invalidate_start(interval_sub, + range, + cur_seq, + &finish); + if (ret && finish) { + finish->notifier = interval_sub; + __llist_add(&finish->link, &finish_passes); + } + + } else { + ret = interval_sub->ops->invalidate(interval_sub, + range, + cur_seq); + } if (!ret) { if (WARN_ON(mmu_notifier_range_blockable(range))) continue; - goto out_would_block; + err = -EAGAIN; + break; } } - return 0; -out_would_block: + mn_itree_finish_pass(&finish_passes); + /* * On -EAGAIN the non-blocking caller is not allowed to call * invalidate_range_end() */ - mn_itree_inv_end(subscriptions); - return -EAGAIN; + if (err) + mn_itree_inv_end(subscriptions); + + return err; } static int mn_hlist_invalidate_range_start( @@ -977,6 +1023,7 @@ int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, struct mmu_notifier_subscriptions *subscriptions; int ret; + WARN_ON_ONCE(ops->invalidate_start && !ops->invalidate_finish); might_lock(&mm->mmap_lock); subscriptions = smp_load_acquire(&mm->notifier_subscriptions); -- cgit v1.2.3 From 05988dba11791ccbb458254484826b32f17f4ad2 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Wed, 4 Mar 2026 08:49:00 +0100 Subject: vdso/datastore: Allocate data pages dynamically MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allocating the data pages as part of the kernel image does not work on SPARC. The MMU will raise a fault when userspace tries to access them. Allocate the data pages through the page allocator instead. Unused pages in the vDSO VMA are still allocated to keep the virtual addresses aligned. Switch the mapping from PFNs to 'struct page' as that is required for dynamically allocated pages. This also aligns the allocation of the datapages with the code pages and is a prerequisite for mlockall() support. VM_MIXEDMAP is necessary for the call to vmf_insert_page() in the timens prefault path to work. The data pages need to be order-0, non-compound pages so that the mapping to userspace and the different orderings work. These pages are also used by the timekeeping, random pool and architecture initialization code. Some of these are running before the page allocator is available. To keep these subsytems working without changes, introduce early, statically data storage which will then replaced by the real one as soon as that is available. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Reviewed-by: Christophe Leroy (CS GROUP) Link: https://patch.msgid.link/20260304-vdso-sparc64-generic-2-v6-3-d8eb3b0e1410@linutronix.de --- include/linux/vdso_datastore.h | 6 +++ init/main.c | 2 + lib/vdso/datastore.c | 92 +++++++++++++++++++++++++++--------------- 3 files changed, 68 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vdso_datastore.h b/include/linux/vdso_datastore.h index a91fa24b06e0..0b530428db71 100644 --- a/include/linux/vdso_datastore.h +++ b/include/linux/vdso_datastore.h @@ -2,9 +2,15 @@ #ifndef _LINUX_VDSO_DATASTORE_H #define _LINUX_VDSO_DATASTORE_H +#ifdef CONFIG_HAVE_GENERIC_VDSO #include extern const struct vm_special_mapping vdso_vvar_mapping; struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, unsigned long addr); +void __init vdso_setup_data_pages(void); +#else /* !CONFIG_HAVE_GENERIC_VDSO */ +static inline void vdso_setup_data_pages(void) { } +#endif /* CONFIG_HAVE_GENERIC_VDSO */ + #endif /* _LINUX_VDSO_DATASTORE_H */ diff --git a/init/main.c b/init/main.c index 1cb395dd94e4..de867b2693d2 100644 --- a/init/main.c +++ b/init/main.c @@ -105,6 +105,7 @@ #include #include #include +#include #include #include @@ -1119,6 +1120,7 @@ void start_kernel(void) srcu_init(); hrtimers_init(); softirq_init(); + vdso_setup_data_pages(); timekeeping_init(); time_init(); diff --git a/lib/vdso/datastore.c b/lib/vdso/datastore.c index 7377fcb6e1df..faebf5b7cd6e 100644 --- a/lib/vdso/datastore.c +++ b/lib/vdso/datastore.c @@ -1,52 +1,79 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include +#include #include #include #include #include #include -/* - * The vDSO data page. - */ +static u8 vdso_initdata[VDSO_NR_PAGES * PAGE_SIZE] __aligned(PAGE_SIZE) __initdata = {}; + #ifdef CONFIG_GENERIC_GETTIMEOFDAY -static union { - struct vdso_time_data data; - u8 page[PAGE_SIZE]; -} vdso_time_data_store __page_aligned_data; -struct vdso_time_data *vdso_k_time_data = &vdso_time_data_store.data; -static_assert(sizeof(vdso_time_data_store) == PAGE_SIZE); +struct vdso_time_data *vdso_k_time_data __refdata = + (void *)&vdso_initdata[VDSO_TIME_PAGE_OFFSET * PAGE_SIZE]; + +static_assert(sizeof(struct vdso_time_data) <= PAGE_SIZE); #endif /* CONFIG_GENERIC_GETTIMEOFDAY */ #ifdef CONFIG_VDSO_GETRANDOM -static union { - struct vdso_rng_data data; - u8 page[PAGE_SIZE]; -} vdso_rng_data_store __page_aligned_data; -struct vdso_rng_data *vdso_k_rng_data = &vdso_rng_data_store.data; -static_assert(sizeof(vdso_rng_data_store) == PAGE_SIZE); +struct vdso_rng_data *vdso_k_rng_data __refdata = + (void *)&vdso_initdata[VDSO_RNG_PAGE_OFFSET * PAGE_SIZE]; + +static_assert(sizeof(struct vdso_rng_data) <= PAGE_SIZE); #endif /* CONFIG_VDSO_GETRANDOM */ #ifdef CONFIG_ARCH_HAS_VDSO_ARCH_DATA -static union { - struct vdso_arch_data data; - u8 page[VDSO_ARCH_DATA_SIZE]; -} vdso_arch_data_store __page_aligned_data; -struct vdso_arch_data *vdso_k_arch_data = &vdso_arch_data_store.data; +struct vdso_arch_data *vdso_k_arch_data __refdata = + (void *)&vdso_initdata[VDSO_ARCH_PAGES_START * PAGE_SIZE]; #endif /* CONFIG_ARCH_HAS_VDSO_ARCH_DATA */ +void __init vdso_setup_data_pages(void) +{ + unsigned int order = get_order(VDSO_NR_PAGES * PAGE_SIZE); + struct page *pages; + + /* + * Allocate the data pages dynamically. SPARC does not support mapping + * static pages to be mapped into userspace. + * It is also a requirement for mlockall() support. + * + * Do not use folios. In time namespaces the pages are mapped in a different order + * to userspace, which is not handled by the folio optimizations in finish_fault(). + */ + pages = alloc_pages(GFP_KERNEL, order); + if (!pages) + panic("Unable to allocate VDSO storage pages"); + + /* The pages are mapped one-by-one into userspace and each one needs to be refcounted. */ + split_page(pages, order); + + /* Move the data already written by other subsystems to the new pages */ + memcpy(page_address(pages), vdso_initdata, VDSO_NR_PAGES * PAGE_SIZE); + + if (IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)) + vdso_k_time_data = page_address(pages + VDSO_TIME_PAGE_OFFSET); + + if (IS_ENABLED(CONFIG_VDSO_GETRANDOM)) + vdso_k_rng_data = page_address(pages + VDSO_RNG_PAGE_OFFSET); + + if (IS_ENABLED(CONFIG_ARCH_HAS_VDSO_ARCH_DATA)) + vdso_k_arch_data = page_address(pages + VDSO_ARCH_PAGES_START); +} + static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { - struct page *timens_page = find_timens_vvar_page(vma); - unsigned long pfn; + struct page *page, *timens_page; + + timens_page = find_timens_vvar_page(vma); switch (vmf->pgoff) { case VDSO_TIME_PAGE_OFFSET: if (!IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)) return VM_FAULT_SIGBUS; - pfn = __phys_to_pfn(__pa_symbol(vdso_k_time_data)); + page = virt_to_page(vdso_k_time_data); if (timens_page) { /* * Fault in VVAR page too, since it will be accessed @@ -56,10 +83,10 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, vm_fault_t err; addr = vmf->address + VDSO_TIMENS_PAGE_OFFSET * PAGE_SIZE; - err = vmf_insert_pfn(vma, addr, pfn); + err = vmf_insert_page(vma, addr, page); if (unlikely(err & VM_FAULT_ERROR)) return err; - pfn = page_to_pfn(timens_page); + page = timens_page; } break; case VDSO_TIMENS_PAGE_OFFSET: @@ -72,24 +99,25 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, */ if (!IS_ENABLED(CONFIG_TIME_NS) || !timens_page) return VM_FAULT_SIGBUS; - pfn = __phys_to_pfn(__pa_symbol(vdso_k_time_data)); + page = virt_to_page(vdso_k_time_data); break; case VDSO_RNG_PAGE_OFFSET: if (!IS_ENABLED(CONFIG_VDSO_GETRANDOM)) return VM_FAULT_SIGBUS; - pfn = __phys_to_pfn(__pa_symbol(vdso_k_rng_data)); + page = virt_to_page(vdso_k_rng_data); break; case VDSO_ARCH_PAGES_START ... VDSO_ARCH_PAGES_END: if (!IS_ENABLED(CONFIG_ARCH_HAS_VDSO_ARCH_DATA)) return VM_FAULT_SIGBUS; - pfn = __phys_to_pfn(__pa_symbol(vdso_k_arch_data)) + - vmf->pgoff - VDSO_ARCH_PAGES_START; + page = virt_to_page(vdso_k_arch_data) + vmf->pgoff - VDSO_ARCH_PAGES_START; break; default: return VM_FAULT_SIGBUS; } - return vmf_insert_pfn(vma, vmf->address, pfn); + get_page(page); + vmf->page = page; + return 0; } const struct vm_special_mapping vdso_vvar_mapping = { @@ -101,7 +129,7 @@ struct vm_area_struct *vdso_install_vvar_mapping(struct mm_struct *mm, unsigned { return _install_special_mapping(mm, addr, VDSO_NR_PAGES * PAGE_SIZE, VM_READ | VM_MAYREAD | VM_IO | VM_DONTDUMP | - VM_PFNMAP | VM_SEALED_SYSMAP, + VM_MIXEDMAP | VM_SEALED_SYSMAP, &vdso_vvar_mapping); } -- cgit v1.2.3 From c453b9abb4f422461c1493ef74d63af0961a2d30 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 4 Mar 2026 08:49:11 +0100 Subject: clocksource: Remove ARCH_CLOCKSOURCE_DATA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After sparc64, there are no remaining users of ARCH_CLOCKSOURCE_DATA and it can just be removed. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Tested-by: Andreas Larsson Reviewed-by: Andreas Larsson Acked-by: John Stultz Link: https://patch.msgid.link/20260304-vdso-sparc64-generic-2-v6-14-d8eb3b0e1410@linutronix.de [Thomas: drop sparc64 bits from the patch] --- include/linux/clocksource.h | 6 +----- kernel/time/Kconfig | 4 ---- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 65b7c41471c3..12d853b18832 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -25,8 +25,7 @@ struct clocksource_base; struct clocksource; struct module; -#if defined(CONFIG_ARCH_CLOCKSOURCE_DATA) || \ - defined(CONFIG_GENERIC_GETTIMEOFDAY) +#if defined(CONFIG_GENERIC_GETTIMEOFDAY) #include #endif @@ -106,9 +105,6 @@ struct clocksource { u64 max_idle_ns; u32 maxadj; u32 uncertainty_margin; -#ifdef CONFIG_ARCH_CLOCKSOURCE_DATA - struct arch_clocksource_data archdata; -#endif u64 max_cycles; u64 max_raw_delta; const char *name; diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 7c6a52f7836c..fe3311877097 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -9,10 +9,6 @@ config CLOCKSOURCE_WATCHDOG bool -# Architecture has extra clocksource data -config ARCH_CLOCKSOURCE_DATA - bool - # Architecture has extra clocksource init called from registration config ARCH_CLOCKSOURCE_INIT bool -- cgit v1.2.3 From 282b8eec8a4eab9a3ff3addf6dad2ce699594fe8 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Tue, 10 Feb 2026 13:22:08 +0100 Subject: net: cdc-ncm: cleanup device descriptor Flags are boolean values, hence they should be typed as bool, not as u8. Signed-off-by: Oliver Neukum Link: https://patch.msgid.link/20260210122208.29244-1-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/cdc_ncm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h index 4ac082a63173..97ef37a1ff4a 100644 --- a/include/linux/usb/cdc_ncm.h +++ b/include/linux/usb/cdc_ncm.h @@ -118,8 +118,8 @@ struct cdc_ncm_ctx { u32 timer_interval; u32 max_ndp_size; - u8 is_ndp16; - u8 filtering_supported; + bool is_ndp16; + bool filtering_supported; union { struct usb_cdc_ncm_ndp16 *delayed_ndp16; struct usb_cdc_ncm_ndp32 *delayed_ndp32; -- cgit v1.2.3 From b558a9cc107287bd49bd9256e5d965afa80acfd6 Mon Sep 17 00:00:00 2001 From: Amit Sunil Dhamne Date: Mon, 23 Feb 2026 20:05:38 +0000 Subject: usb: typec: tcpm: add support for Sink Cap Extended msg response Add support for responding to Sink Cap Extended msg request. To achieve this, include parsing support for DT properties related to Sink Cap Extended. The request for Sink Cap Ext is a control message while the response is an extended message (chunked). As the Sink Caps Extended Data Block size (24 Byte) is less than MaxExtendedMsgChunkLen (26 Byte), a single chunk is sufficient to complete this AMS. Supporting sink cap extended messages while responding to a Get_Sink_Caps_Extended request when port is in Sink role is required in order to be compliant with at least USB PD Rev3.1 Ver1.8. Signed-off-by: Amit Sunil Dhamne Reviewed-by: Badhri Jagan Sridharan Reviewed-by: Heikki Krogerus Link: https://patch.msgid.link/20260223-skedb-v2-2-60675765bc7e@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 253 +++++++++++++++++++++++++++++++++++++++++- include/linux/usb/pd.h | 82 +++++++++++++- 2 files changed, 332 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 3295d804cf87..5ea0b0e99e4d 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -188,7 +189,8 @@ S(STRUCTURED_VDMS), \ S(COUNTRY_INFO), \ S(COUNTRY_CODES), \ - S(REVISION_INFORMATION) + S(REVISION_INFORMATION), \ + S(GETTING_SINK_EXTENDED_CAPABILITIES) #define GENERATE_ENUM(e) e #define GENERATE_STRING(s) #s @@ -229,6 +231,7 @@ enum pd_msg_request { PD_MSG_DATA_SINK_CAP, PD_MSG_DATA_SOURCE_CAP, PD_MSG_DATA_REV, + PD_MSG_EXT_SINK_CAP_EXT }; enum adev_actions { @@ -337,6 +340,42 @@ struct pd_timings { u32 snk_bc12_cmpletion_time; }; +/* Convert microwatt to watt */ +#define UW_TO_W(pow) ((pow) / 1000000) + +/* + * struct pd_identifier - Contains info about PD identifiers + * @vid: Vendor ID (assigned by USB-IF) + * @pid: Product ID (assigned by manufacturer) + * @xid: Value assigned by USB-IF for product + */ +struct pd_identifier { + u16 vid; + u16 pid; + u32 xid; +}; + +/* + * struct sink_caps_ext_data - Sink extended capability data + * @load_step: Indicates the load step slew rate. Value of 0 indicates 150mA/us + * & 1 indicates 500 mA/us + * @load_char: Snk overload characteristics + * @compliance: Types of sources the sink has been tested & certified on + * @modes: Charging caps & power sources supported + * @spr_min_pdp: Sink Minimum PDP for SPR mode (in Watts) + * @spr_op_pdp: Sink Operational PDP for SPR mode (in Watts) + * @spr_max_pdp: Sink Maximum PDP for SPR mode (in Watts) + */ +struct sink_caps_ext_data { + u8 load_step; + u16 load_char; + u8 compliance; + u8 modes; + u8 spr_min_pdp; + u8 spr_op_pdp; + u8 spr_max_pdp; +}; + struct tcpm_port { struct device *dev; @@ -585,6 +624,9 @@ struct tcpm_port { /* Indicates maximum (revision, version) supported */ struct pd_revision_info pd_rev; + + struct pd_identifier pd_ident; + struct sink_caps_ext_data sink_caps_ext; #ifdef CONFIG_DEBUG_FS struct dentry *dentry; struct mutex logbuffer_lock; /* log buffer access lock */ @@ -1367,6 +1409,64 @@ static int tcpm_pd_send_sink_caps(struct tcpm_port *port) return tcpm_pd_transmit(port, TCPC_TX_SOP, &msg); } +static int tcpm_pd_send_sink_cap_ext(struct tcpm_port *port) +{ + u16 operating_snk_watt = port->operating_snk_mw / 1000; + struct sink_caps_ext_data *data = &port->sink_caps_ext; + struct pd_identifier *pd_ident = &port->pd_ident; + struct sink_caps_ext_msg skedb = {0}; + struct pd_message msg; + u8 data_obj_cnt; + + if (!port->self_powered) + data->spr_op_pdp = operating_snk_watt; + + /* + * SPR Sink Minimum PDP indicates the minimum power required to operate + * a sink device in its lowest level of functionality without requiring + * power from the battery. We can use the operating_snk_watt value to + * populate it, as operating_snk_watt indicates device's min operating + * power. + */ + data->spr_min_pdp = operating_snk_watt; + + if (data->spr_op_pdp < data->spr_min_pdp || + data->spr_max_pdp < data->spr_op_pdp) { + tcpm_log(port, + "Invalid PDP values, Min PDP:%u, Op PDP:%u, Max PDP:%u", + data->spr_min_pdp, data->spr_op_pdp, data->spr_max_pdp); + return -EOPNOTSUPP; + } + + memset(&msg, 0, sizeof(msg)); + skedb.vid = cpu_to_le16(pd_ident->vid); + skedb.pid = cpu_to_le16(pd_ident->pid); + skedb.xid = cpu_to_le32(pd_ident->xid); + skedb.skedb_ver = SKEDB_VER_1_0; + skedb.load_step = data->load_step; + skedb.load_char = cpu_to_le16(data->load_char); + skedb.compliance = data->compliance; + skedb.modes = data->modes; + skedb.spr_min_pdp = data->spr_min_pdp; + skedb.spr_op_pdp = data->spr_op_pdp; + skedb.spr_max_pdp = data->spr_max_pdp; + memcpy(msg.ext_msg.data, &skedb, sizeof(skedb)); + msg.ext_msg.header = PD_EXT_HDR_LE(sizeof(skedb), + 0, /* Denotes if request chunk */ + 0, /* Chunk Number */ + 1 /* Chunked */); + + data_obj_cnt = count_chunked_data_objs(sizeof(skedb)); + msg.header = cpu_to_le16(PD_HEADER(PD_EXT_SINK_CAP_EXT, + port->pwr_role, + port->data_role, + port->negotiated_rev, + port->message_id, + data_obj_cnt, + 1 /* Denotes if ext header */)); + return tcpm_pd_transmit(port, TCPC_TX_SOP, &msg); +} + static void mod_tcpm_delayed_work(struct tcpm_port *port, unsigned int delay_ms) { if (delay_ms) { @@ -3646,6 +3746,19 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port, PD_MSG_CTRL_NOT_SUPP, NONE_AMS); break; + case PD_CTRL_GET_SINK_CAP_EXT: + /* This is an unsupported message if port type is SRC */ + if (port->negotiated_rev >= PD_REV30 && + port->port_type != TYPEC_PORT_SRC) + tcpm_pd_handle_msg(port, PD_MSG_EXT_SINK_CAP_EXT, + GETTING_SINK_EXTENDED_CAPABILITIES); + else + tcpm_pd_handle_msg(port, + port->negotiated_rev < PD_REV30 ? + PD_MSG_CTRL_REJECT : + PD_MSG_CTRL_NOT_SUPP, + NONE_AMS); + break; default: tcpm_pd_handle_msg(port, port->negotiated_rev < PD_REV30 ? @@ -3898,6 +4011,16 @@ static bool tcpm_send_queued_message(struct tcpm_port *port) ret); tcpm_ams_finish(port); break; + case PD_MSG_EXT_SINK_CAP_EXT: + ret = tcpm_pd_send_sink_cap_ext(port); + if (ret == -EOPNOTSUPP) + tcpm_pd_send_control(port, PD_CTRL_NOT_SUPP, TCPC_TX_SOP); + else if (ret < 0) + tcpm_log(port, + "Unable to transmit sink cap extended, ret=%d", + ret); + tcpm_ams_finish(port); + break; default: break; } @@ -7282,6 +7405,129 @@ static void tcpm_fw_get_timings(struct tcpm_port *port, struct fwnode_handle *fw port->timings.snk_bc12_cmpletion_time = val; } +static void tcpm_fw_get_pd_ident(struct tcpm_port *port) +{ + struct pd_identifier *pd_ident = &port->pd_ident; + u32 *vdo; + + /* First 3 vdo values contain info regarding USB PID, VID & XID */ + if (port->nr_snk_vdo >= 3) + vdo = port->snk_vdo; + else if (port->nr_snk_vdo_v1 >= 3) + vdo = port->snk_vdo_v1; + else + return; + + pd_ident->vid = PD_IDH_VID(vdo[0]); + pd_ident->pid = PD_PRODUCT_PID(vdo[2]); + pd_ident->xid = PD_CSTAT_XID(vdo[1]); + tcpm_log(port, "vid:%#x pid:%#x xid:%#x", + pd_ident->vid, pd_ident->pid, pd_ident->xid); +} + +static void tcpm_parse_snk_pdos(struct tcpm_port *port) +{ + struct sink_caps_ext_data *caps = &port->sink_caps_ext; + u32 max_mv, max_ma; + u8 avs_tier1_pdp, avs_tier2_pdp; + int i, pdo_itr; + u32 *snk_pdos; + + for (i = 0; i < port->pd_count; ++i) { + snk_pdos = port->pd_list[i]->sink_desc.pdo; + for (pdo_itr = 0; pdo_itr < PDO_MAX_OBJECTS && snk_pdos[pdo_itr]; + ++pdo_itr) { + u32 pdo = snk_pdos[pdo_itr]; + u8 curr_snk_pdp = 0; + + switch (pdo_type(pdo)) { + case PDO_TYPE_FIXED: + max_mv = pdo_fixed_voltage(pdo); + max_ma = pdo_fixed_current(pdo); + curr_snk_pdp = UW_TO_W(max_mv * max_ma); + break; + case PDO_TYPE_BATT: + curr_snk_pdp = UW_TO_W(pdo_max_power(pdo)); + break; + case PDO_TYPE_VAR: + max_mv = pdo_max_voltage(pdo); + max_ma = pdo_max_current(pdo); + curr_snk_pdp = UW_TO_W(max_mv * max_ma); + break; + case PDO_TYPE_APDO: + if (pdo_apdo_type(pdo) == APDO_TYPE_PPS) { + max_mv = pdo_pps_apdo_max_voltage(pdo); + max_ma = pdo_pps_apdo_max_current(pdo); + curr_snk_pdp = UW_TO_W(max_mv * max_ma); + caps->modes |= SINK_MODE_PPS; + } else if (pdo_apdo_type(pdo) == + APDO_TYPE_SPR_AVS) { + avs_tier1_pdp = UW_TO_W(SPR_AVS_TIER1_MAX_VOLT_MV + * pdo_spr_avs_apdo_9v_to_15v_max_current_ma(pdo)); + avs_tier2_pdp = UW_TO_W(SPR_AVS_TIER2_MAX_VOLT_MV + * pdo_spr_avs_apdo_15v_to_20v_max_current_ma(pdo)); + curr_snk_pdp = max(avs_tier1_pdp, avs_tier2_pdp); + caps->modes |= SINK_MODE_AVS; + } + break; + default: + tcpm_log(port, "Invalid source PDO type, ignoring"); + continue; + } + + caps->spr_max_pdp = max(caps->spr_max_pdp, + curr_snk_pdp); + } + } +} + +static void tcpm_fw_get_sink_caps_ext(struct tcpm_port *port, + struct fwnode_handle *fwnode) +{ + struct sink_caps_ext_data *caps = &port->sink_caps_ext; + int ret; + u32 val; + + /* + * Load step represents the change in current per usec that a given + * source can tolerate while maintaining Vbus within the vSrcValid + * range. For a sink this represents the "preferred" load-step value. It + * can only have 2 values (150 mA/usec or 500 mA/usec) with 150 mA/usec + * being the default. + */ + ret = fwnode_property_read_u32(fwnode, "sink-load-step", &val); + if (!ret) + caps->load_step = val == 500 ? 1 : 0; + + fwnode_property_read_u16(fwnode, "sink-load-characteristics", + &caps->load_char); + fwnode_property_read_u8(fwnode, "sink-compliance", &caps->compliance); + caps->modes = SINK_MODE_VBUS; + + /* + * As per "6.5.13.14" SPR Sink Operational PDP definition, for battery + * powered devices, this value will correspond to the PDP of the + * charging adapter either shipped or recommended for use with it. For + * batteryless sink devices SPR Operational PDP indicates the power + * required to operate all the device's functional modes. Hence, this + * value may be considered equal to port's operating_snk_mw. As + * operating_sink_mw can change as per the pd set used thus, OP PDP + * is determined when populating Sink Caps Extended Data Block. + */ + if (port->self_powered) { + fwnode_property_read_u32(fwnode, "charging-adapter-pdp-milliwatt", + &val); + caps->spr_op_pdp = (u8)(val / 1000); + caps->modes |= SINK_MODE_BATT; + } + + tcpm_parse_snk_pdos(port); + tcpm_log(port, + "load-step:%#x load-char:%#x compl:%#x op-pdp:%#x max-pdp:%#x", + caps->load_step, caps->load_char, caps->compliance, + caps->spr_op_pdp, caps->spr_max_pdp); +} + static int tcpm_fw_get_caps(struct tcpm_port *port, struct fwnode_handle *fwnode) { struct fwnode_handle *capabilities, *caps = NULL; @@ -7455,6 +7701,9 @@ static int tcpm_fw_get_caps(struct tcpm_port *port, struct fwnode_handle *fwnode } } + if (port->port_type != TYPEC_PORT_SRC) + tcpm_fw_get_sink_caps_ext(port, fwnode); + put_caps: if (caps != fwnode) fwnode_handle_put(caps); @@ -7497,6 +7746,8 @@ static int tcpm_fw_get_snk_vdos(struct tcpm_port *port, struct fwnode_handle *fw return ret; } + tcpm_fw_get_pd_ident(port); + return 0; } diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h index 6ccd1b2af993..5a98983195cb 100644 --- a/include/linux/usb/pd.h +++ b/include/linux/usb/pd.h @@ -34,7 +34,8 @@ enum pd_ctrl_msg_type { PD_CTRL_FR_SWAP = 19, PD_CTRL_GET_PPS_STATUS = 20, PD_CTRL_GET_COUNTRY_CODES = 21, - /* 22-23 Reserved */ + PD_CTRL_GET_SINK_CAP_EXT = 22, + /* 23 Reserved */ PD_CTRL_GET_REVISION = 24, /* 25-31 Reserved */ }; @@ -72,7 +73,8 @@ enum pd_ext_msg_type { PD_EXT_PPS_STATUS = 12, PD_EXT_COUNTRY_INFO = 13, PD_EXT_COUNTRY_CODES = 14, - /* 15-31 Reserved */ + PD_EXT_SINK_CAP_EXT = 15, + /* 16-31 Reserved */ }; #define PD_REV10 0x0 @@ -205,6 +207,72 @@ struct pd_message { }; } __packed; +/* + * count_chunked_data_objs - Helper to calculate number of Data Objects on a 4 + * byte boundary. + * @size: Size of data block for extended message. Should *not* include extended + * header size. + */ +static inline u8 count_chunked_data_objs(u32 size) +{ + size += offsetof(struct pd_chunked_ext_message_data, data); + return ((size / 4) + (size % 4 ? 1 : 0)); +} + +/* Sink Caps Extended Data Block Version */ +#define SKEDB_VER_1_0 1 + +/* Sink Caps Extended Sink Modes */ +#define SINK_MODE_PPS BIT(0) +#define SINK_MODE_VBUS BIT(1) +#define SINK_MODE_AC_SUPPLY BIT(2) +#define SINK_MODE_BATT BIT(3) +#define SINK_MODE_BATT_UL BIT(4) /* Unlimited battery power supply */ +#define SINK_MODE_AVS BIT(5) + +/** + * struct sink_caps_ext_msg - Sink extended capability PD message + * @vid: Vendor ID + * @pid: Product ID + * @xid: Value assigned by USB-IF for product + * @fw: Firmware version + * @hw: Hardware version + * @skedb_ver: Sink Caps Extended Data Block (SKEDB) Version + * @load_step: Indicates the load step slew rate. + * @load_char: Sink overload characteristics + * @compliance: Types of sources the sink has been tested & certified on + * @touch_temp: Indicates the IEC standard to which the touch temperature + * conforms to (if applicable). + * @batt_info: Indicates number batteries and hot swappable ports + * @modes: Charging caps & power sources supported + * @spr_min_pdp: Sink Minimum PDP for SPR mode + * @spr_op_pdp: Sink Operational PDP for SPR mode + * @spr_max_pdp: Sink Maximum PDP for SPR mode + * @epr_min_pdp: Sink Minimum PDP for EPR mode + * @epr_op_pdp: Sink Operational PDP for EPR mode + * @epr_max_pdp: Sink Maximum PDP for EPR mode + */ +struct sink_caps_ext_msg { + __le16 vid; + __le16 pid; + __le32 xid; + u8 fw; + u8 hw; + u8 skedb_ver; + u8 load_step; + __le16 load_char; + u8 compliance; + u8 touch_temp; + u8 batt_info; + u8 modes; + u8 spr_min_pdp; + u8 spr_op_pdp; + u8 spr_max_pdp; + u8 epr_min_pdp; + u8 epr_op_pdp; + u8 epr_max_pdp; +} __packed; + /* PDO: Power Data Object */ #define PDO_MAX_OBJECTS 7 @@ -329,6 +397,11 @@ enum pd_apdo_type { #define PDO_SPR_AVS_APDO_9V_TO_15V_MAX_CURR GENMASK(19, 10) /* 10mA unit */ #define PDO_SPR_AVS_APDO_15V_TO_20V_MAX_CURR GENMASK(9, 0) /* 10mA unit */ +/* SPR AVS has two different current ranges 9V - 15V, 15V - 20V */ +#define SPR_AVS_TIER1_MIN_VOLT_MV 9000 +#define SPR_AVS_TIER1_MAX_VOLT_MV 15000 +#define SPR_AVS_TIER2_MAX_VOLT_MV 20000 + static inline enum pd_pdo_type pdo_type(u32 pdo) { return (pdo >> PDO_TYPE_SHIFT) & PDO_TYPE_MASK; @@ -339,6 +412,11 @@ static inline unsigned int pdo_fixed_voltage(u32 pdo) return ((pdo >> PDO_FIXED_VOLT_SHIFT) & PDO_VOLT_MASK) * 50; } +static inline unsigned int pdo_fixed_current(u32 pdo) +{ + return ((pdo >> PDO_FIXED_CURR_SHIFT) & PDO_CURR_MASK) * 10; +} + static inline unsigned int pdo_min_voltage(u32 pdo) { return ((pdo >> PDO_VAR_MIN_VOLT_SHIFT) & PDO_VOLT_MASK) * 50; -- cgit v1.2.3 From e19eaffc5213fdd6179e849d3032929fae0d8c2c Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Thu, 5 Mar 2026 14:44:10 -0800 Subject: mtd: concat: replace alloc + calloc with 1 alloc A flex array can be used to reduce the allocation to 1. And actually mtdconcat was using the pointer + 1 trick to point to the overallocated area. Better alternatives exist. Signed-off-by: Rosen Penev Signed-off-by: Miquel Raynal --- drivers/mtd/mtd_virt_concat.c | 8 +------- drivers/mtd/mtdconcat.c | 5 +---- include/linux/mtd/concat.h | 2 +- 3 files changed, 3 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/mtd_virt_concat.c b/drivers/mtd/mtd_virt_concat.c index 72689545e48e..37075ead0f33 100644 --- a/drivers/mtd/mtd_virt_concat.c +++ b/drivers/mtd/mtd_virt_concat.c @@ -182,18 +182,12 @@ static int mtd_virt_concat_create_item(struct device_node *parts, for (i = 1; i < count; i++) item->nodes[i] = of_parse_phandle(parts, CONCAT_PROP, (i - 1)); - concat = kzalloc(sizeof(*concat), GFP_KERNEL); + concat = kzalloc_flex(*concat, subdev, count, GFP_KERNEL); if (!concat) { kfree(item); return -ENOMEM; } - concat->subdev = kcalloc(count, sizeof(*concat->subdev), GFP_KERNEL); - if (!concat->subdev) { - kfree(item); - kfree(concat); - return -ENOMEM; - } item->concat = concat; list_add_tail(&item->head, &concat_node_list); diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index 241d15235d01..c97167d51fe2 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -627,7 +627,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c const char *name) { /* name for the new device */ int i; - size_t size; struct mtd_concat *concat; struct mtd_info *subdev_master = NULL; uint32_t max_erasesize, curr_erasesize; @@ -640,15 +639,13 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c printk(KERN_NOTICE "into device \"%s\"\n", name); /* allocate the device structure */ - size = SIZEOF_STRUCT_MTD_CONCAT(num_devs); - concat = kzalloc(size, GFP_KERNEL); + concat = kzalloc_flex(*concat, subdev, num_devs, GFP_KERNEL); if (!concat) { printk ("memory allocation error while creating concatenated device \"%s\"\n", name); return NULL; } - concat->subdev = (struct mtd_info **) (concat + 1); /* * Set up the new "super" device's MTD object structure, check for diff --git a/include/linux/mtd/concat.h b/include/linux/mtd/concat.h index 2cd9d48958a8..f8d4d6ac1fc1 100644 --- a/include/linux/mtd/concat.h +++ b/include/linux/mtd/concat.h @@ -18,7 +18,7 @@ struct mtd_concat { struct mtd_info mtd; int num_subdev; - struct mtd_info **subdev; + struct mtd_info *subdev[]; }; struct mtd_info *mtd_concat_create( -- cgit v1.2.3 From 77dd8adabbc8ff845177b460de48b9d2cd579966 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 9 Mar 2026 13:52:22 +0100 Subject: efi: Drop unused efi_range_is_wc() function efi_range_is_wc() has no callers, so remove it. Reviewed-by: Ilias Apalodimas Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 664898d09ff5..72e76ec54641 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -832,27 +832,6 @@ extern int __init parse_efi_signature_list( const void *data, size_t size, efi_element_handler_t (*get_handler_for_guid)(const efi_guid_t *)); -/** - * efi_range_is_wc - check the WC bit on an address range - * @start: starting kvirt address - * @len: length of range - * - * Consult the EFI memory map and make sure it's ok to set this range WC. - * Returns true or false. - */ -static inline int efi_range_is_wc(unsigned long start, unsigned long len) -{ - unsigned long i; - - for (i = 0; i < len; i += (1UL << EFI_PAGE_SHIFT)) { - unsigned long paddr = __pa(start + i); - if (!(efi_mem_attributes(paddr) & EFI_MEMORY_WC)) - return 0; - } - /* The range checked out */ - return 1; -} - /* * We play games with efi_enabled so that the compiler will, if * possible, remove EFI-related code altogether. -- cgit v1.2.3 From 12ae2c81b21cfaa193db2faf035d495807edc3a7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 26 Feb 2026 14:50:59 +0100 Subject: clone: add CLONE_AUTOREAP Add a new clone3() flag CLONE_AUTOREAP that makes a child process auto-reap on exit without ever becoming a zombie. This is a per-process property in contrast to the existing auto-reap mechanism via SA_NOCLDWAIT or SIG_IGN for SIGCHLD which applies to all children of a given parent. Currently the only way to automatically reap children is to set SA_NOCLDWAIT or SIG_IGN on SIGCHLD. This is a parent-scoped property affecting all children which makes it unsuitable for libraries or applications that need selective auto-reaping of specific children while still being able to wait() on others. CLONE_AUTOREAP stores an autoreap flag in the child's signal_struct. When the child exits do_notify_parent() checks this flag and causes exit_notify() to transition the task directly to EXIT_DEAD. Since the flag lives on the child it survives reparenting: if the original parent exits and the child is reparented to a subreaper or init the child still auto-reaps when it eventually exits. CLONE_AUTOREAP can be combined with CLONE_PIDFD to allow the parent to monitor the child's exit via poll() and retrieve exit status via PIDFD_GET_INFO. Without CLONE_PIDFD it provides a fire-and-forget pattern where the parent simply doesn't care about the child's exit status. No exit signal is delivered so exit_signal must be zero. CLONE_AUTOREAP is rejected in combination with CLONE_PARENT. If a CLONE_AUTOREAP child were to clone(CLONE_PARENT) the new grandchild would inherit exit_signal == 0 from the autoreap parent's group leader but without signal->autoreap. This grandchild would become a zombie that never sends a signal and is never autoreaped - confusing and arguably broken behavior. The flag is not inherited by the autoreap process's own children. Each child that should be autoreaped must be explicitly created with CLONE_AUTOREAP. Link: https://github.com/uapi-group/kernel-features/issues/45 Link: https://patch.msgid.link/20260226-work-pidfs-autoreap-v5-1-d148b984a989@kernel.org Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- include/linux/sched/signal.h | 1 + include/uapi/linux/sched.h | 5 +++-- kernel/fork.c | 17 ++++++++++++++++- kernel/ptrace.c | 3 ++- kernel/signal.c | 4 ++++ 5 files changed, 26 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index a22248aebcf9..f842c86b806f 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -132,6 +132,7 @@ struct signal_struct { */ unsigned int is_child_subreaper:1; unsigned int has_child_subreaper:1; + unsigned int autoreap:1; #ifdef CONFIG_POSIX_TIMERS diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 359a14cc76a4..69f7b4f9eb0c 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -34,8 +34,9 @@ #define CLONE_IO 0x80000000 /* Clone io context */ /* Flags for the clone3() syscall. */ -#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ -#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */ +#define CLONE_CLEAR_SIGHAND (1ULL << 32) /* Clear any signal handler and reset to SIG_DFL. */ +#define CLONE_INTO_CGROUP (1ULL << 33) /* Clone into a specific cgroup given the right permissions. */ +#define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. */ /* * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 diff --git a/kernel/fork.c b/kernel/fork.c index e832da9d15a4..10549574fda6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2028,6 +2028,18 @@ __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } + if (clone_flags & CLONE_AUTOREAP) { + if (clone_flags & CLONE_THREAD) + return ERR_PTR(-EINVAL); + if (clone_flags & CLONE_PARENT) + return ERR_PTR(-EINVAL); + if (args->exit_signal) + return ERR_PTR(-EINVAL); + } + + if ((clone_flags & CLONE_PARENT) && current->signal->autoreap) + return ERR_PTR(-EINVAL); + /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple @@ -2435,6 +2447,8 @@ __latent_entropy struct task_struct *copy_process( */ p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || p->real_parent->signal->is_child_subreaper; + if (clone_flags & CLONE_AUTOREAP) + p->signal->autoreap = 1; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_TGID); @@ -2897,7 +2911,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* Verify that no unknown flags are passed along. */ if (kargs->flags & - ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP | + CLONE_AUTOREAP)) return false; /* diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 392ec2f75f01..68c17daef8d4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -549,7 +549,8 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) if (!dead && thread_group_empty(p)) { if (!same_thread_group(p->real_parent, tracer)) dead = do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) { + else if (ignoring_children(tracer->sighand) || + p->signal->autoreap) { __wake_up_parent(p, tracer); dead = true; } diff --git a/kernel/signal.c b/kernel/signal.c index d65d0fe24bfb..e61f39fa8c8a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2251,6 +2251,10 @@ bool do_notify_parent(struct task_struct *tsk, int sig) if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) sig = 0; } + if (!tsk->ptrace && tsk->signal->autoreap) { + autoreap = true; + sig = 0; + } /* * Send with __send_signal as si_pid and si_uid are in the * parent's namespaces. -- cgit v1.2.3 From 4ef420b3450026b56807e5d53001f80eb495403c Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 10 Mar 2026 18:01:01 -0700 Subject: cgroup: replace global cgroup_file_kn_lock with per-cgroup_file lock Replace the global cgroup_file_kn_lock with a per-cgroup_file spinlock to eliminate cross-cgroup contention as it is not really protecting data shared between different cgroups. The lock is initialized in cgroup_add_file() alongside timer_setup(). No lock acquisition is needed during initialization since the cgroup directory is being populated under cgroup_mutex and no concurrent accessors exist at that point. Reported-by: Jakub Kicinski Signed-off-by: Shakeel Butt Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 1 + kernel/cgroup/cgroup.c | 24 ++++++++---------------- 2 files changed, 9 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index bb92f5c169ca..ba26b5d05ce3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -167,6 +167,7 @@ struct cgroup_file { struct kernfs_node *kn; unsigned long notified_at; struct timer_list notify_timer; + spinlock_t lock; }; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d161bcaa68f1..6f58efeb9016 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -99,12 +99,6 @@ static bool cgroup_debug __read_mostly; */ static DEFINE_SPINLOCK(cgroup_idr_lock); -/* - * Protects cgroup_file->kn for !self csses. It synchronizes notifications - * against file removal/re-creation across css hiding. - */ -static DEFINE_SPINLOCK(cgroup_file_kn_lock); - DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); #define cgroup_assert_mutex_or_rcu_locked() \ @@ -1693,9 +1687,9 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); struct cgroup_file *cfile = (void *)css + cft->file_offset; - spin_lock_irq(&cgroup_file_kn_lock); + spin_lock_irq(&cfile->lock); WRITE_ONCE(cfile->kn, NULL); - spin_unlock_irq(&cgroup_file_kn_lock); + spin_unlock_irq(&cfile->lock); timer_delete_sync(&cfile->notify_timer); } @@ -4373,10 +4367,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cgroup_file *cfile = (void *)css + cft->file_offset; timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); - - spin_lock_irq(&cgroup_file_kn_lock); - WRITE_ONCE(cfile->kn, kn); - spin_unlock_irq(&cgroup_file_kn_lock); + spin_lock_init(&cfile->lock); + cfile->kn = kn; } return 0; @@ -4645,13 +4637,13 @@ void cgroup_file_notify(struct cgroup_file *cfile) return; } - spin_lock_irqsave(&cgroup_file_kn_lock, flags); + spin_lock_irqsave(&cfile->lock, flags); if (cfile->kn) { kn = cfile->kn; kernfs_get(kn); WRITE_ONCE(cfile->notified_at, jiffies); } - spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); + spin_unlock_irqrestore(&cfile->lock, flags); if (kn) { kernfs_notify(kn); @@ -4669,10 +4661,10 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show) { struct kernfs_node *kn; - spin_lock_irq(&cgroup_file_kn_lock); + spin_lock_irq(&cfile->lock); kn = cfile->kn; kernfs_get(kn); - spin_unlock_irq(&cgroup_file_kn_lock); + spin_unlock_irq(&cfile->lock); if (kn) kernfs_show(kn, show); -- cgit v1.2.3 From 49b76317592ecbaefd0969d51d02019966cc994b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Mar 2026 16:52:37 -0800 Subject: sched/wait: correct kernel-doc descriptions Use the correct function name and function parameter name to avoid these kernel-doc warnings: Warning: include/linux/wait_bit.h:424 expecting prototype for wait_var_event_killable(). Prototype was for wait_var_event_interruptible() instead Warning: include/linux/wait_bit.h:508 function parameter 'lock' not described in 'wait_var_event_mutex' Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260302005237.3473095-1-rdunlap@infradead.org --- include/linux/wait_bit.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 9e29d79fc790..ace7379d627d 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -406,7 +406,7 @@ do { \ schedule()) /** - * wait_var_event_killable - wait for a variable to be updated and notified + * wait_var_event_interruptible - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * @@ -492,7 +492,7 @@ do { \ * wait_var_event_mutex - wait for a variable to be updated under a mutex * @var: the address of the variable being waited on * @condition: condition to wait for - * @mutex: the mutex which protects updates to the variable + * @lock: the mutex which protects updates to the variable * * Wait for a condition which can only be reliably tested while holding * a mutex. The variables assessed in the condition will normal be -- cgit v1.2.3 From 754e38d2d1aeeadddac5220f34e07cf263502a46 Mon Sep 17 00:00:00 2001 From: "Thomas Weißschuh (Schneider Electric)" Date: Wed, 11 Mar 2026 11:15:11 +0100 Subject: tracing: Use explicit array size instead of sentinel elements in symbol printing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sentinel value added by the wrapper macros __print_symbolic() et al prevents the callers from adding their own trailing comma. This makes constructing symbol list dynamically based on kconfig values tedious. Drop the sentinel elements, so callers can either specify the trailing comma or not, just like in regular array initializers. Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-2-095357392669@linutronix.de --- include/linux/trace_events.h | 13 ++++++---- include/trace/stages/stage3_trace_output.h | 40 +++++++++++++++--------------- kernel/trace/trace_events_synth.c | 4 +-- kernel/trace/trace_output.c | 20 +++++++++------ kernel/trace/trace_syscalls.c | 3 +-- 5 files changed, 43 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 37eb2f0f3dd8..40a43a4c7caf 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -22,20 +22,23 @@ union bpf_attr; const char *trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, - const struct trace_print_flags *flag_array); + const struct trace_print_flags *flag_array, + size_t flag_array_size); const char *trace_print_symbols_seq(struct trace_seq *p, unsigned long val, - const struct trace_print_flags *symbol_array); + const struct trace_print_flags *symbol_array, + size_t symbol_array_size); #if BITS_PER_LONG == 32 const char *trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, unsigned long long flags, - const struct trace_print_flags_u64 *flag_array); + const struct trace_print_flags_u64 *flag_array, + size_t flag_array_size); const char *trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, - const struct trace_print_flags_u64 - *symbol_array); + const struct trace_print_flags_u64 *symbol_array, + size_t symbol_array_size); #endif struct trace_iterator; diff --git a/include/trace/stages/stage3_trace_output.h b/include/trace/stages/stage3_trace_output.h index fce85ea2df1c..b7d8ef4b9fe1 100644 --- a/include/trace/stages/stage3_trace_output.h +++ b/include/trace/stages/stage3_trace_output.h @@ -64,36 +64,36 @@ #define __get_rel_sockaddr(field) ((struct sockaddr *)__get_rel_dynamic_array(field)) #undef __print_flags -#define __print_flags(flag, delim, flag_array...) \ - ({ \ - static const struct trace_print_flags __flags[] = \ - { flag_array, { -1, NULL }}; \ - trace_print_flags_seq(p, delim, flag, __flags); \ +#define __print_flags(flag, delim, flag_array...) \ + ({ \ + static const struct trace_print_flags __flags[] = \ + { flag_array }; \ + trace_print_flags_seq(p, delim, flag, __flags, ARRAY_SIZE(__flags)); \ }) #undef __print_symbolic -#define __print_symbolic(value, symbol_array...) \ - ({ \ - static const struct trace_print_flags symbols[] = \ - { symbol_array, { -1, NULL }}; \ - trace_print_symbols_seq(p, value, symbols); \ +#define __print_symbolic(value, symbol_array...) \ + ({ \ + static const struct trace_print_flags symbols[] = \ + { symbol_array }; \ + trace_print_symbols_seq(p, value, symbols, ARRAY_SIZE(symbols)); \ }) #undef __print_flags_u64 #undef __print_symbolic_u64 #if BITS_PER_LONG == 32 -#define __print_flags_u64(flag, delim, flag_array...) \ - ({ \ - static const struct trace_print_flags_u64 __flags[] = \ - { flag_array, { -1, NULL } }; \ - trace_print_flags_seq_u64(p, delim, flag, __flags); \ +#define __print_flags_u64(flag, delim, flag_array...) \ + ({ \ + static const struct trace_print_flags_u64 __flags[] = \ + { flag_array }; \ + trace_print_flags_seq_u64(p, delim, flag, __flags, ARRAY_SIZE(__flags)); \ }) -#define __print_symbolic_u64(value, symbol_array...) \ - ({ \ - static const struct trace_print_flags_u64 symbols[] = \ - { symbol_array, { -1, NULL } }; \ - trace_print_symbols_seq_u64(p, value, symbols); \ +#define __print_symbolic_u64(value, symbol_array...) \ + ({ \ + static const struct trace_print_flags_u64 symbols[] = \ + { symbol_array }; \ + trace_print_symbols_seq_u64(p, value, symbols, ARRAY_SIZE(symbols)); \ }) #else #define __print_flags_u64(flag, delim, flag_array...) \ diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 8bb95b2a6fcf..39ac4eba0702 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -395,7 +395,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, n_u64++; } else { struct trace_print_flags __flags[] = { - __def_gfpflag_names, {-1, NULL} }; + __def_gfpflag_names }; char *space = (i == se->n_fields - 1 ? "" : " "); print_synth_event_num_val(s, print_fmt, @@ -408,7 +408,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, trace_seq_puts(s, " ("); trace_print_flags_seq(s, "|", entry->fields[n_u64].as_u64, - __flags); + __flags, ARRAY_SIZE(__flags)); trace_seq_putc(s, ')'); } n_u64++; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 1996d7aba038..96e2d22b4364 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -69,14 +69,15 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) const char * trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, - const struct trace_print_flags *flag_array) + const struct trace_print_flags *flag_array, + size_t flag_array_size) { unsigned long mask; const char *str; const char *ret = trace_seq_buffer_ptr(p); int i, first = 1; - for (i = 0; flag_array[i].name && flags; i++) { + for (i = 0; i < flag_array_size && flags; i++) { mask = flag_array[i].mask; if ((flags & mask) != mask) @@ -106,12 +107,13 @@ EXPORT_SYMBOL(trace_print_flags_seq); const char * trace_print_symbols_seq(struct trace_seq *p, unsigned long val, - const struct trace_print_flags *symbol_array) + const struct trace_print_flags *symbol_array, + size_t symbol_array_size) { int i; const char *ret = trace_seq_buffer_ptr(p); - for (i = 0; symbol_array[i].name; i++) { + for (i = 0; i < symbol_array_size; i++) { if (val != symbol_array[i].mask) continue; @@ -133,14 +135,15 @@ EXPORT_SYMBOL(trace_print_symbols_seq); const char * trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, unsigned long long flags, - const struct trace_print_flags_u64 *flag_array) + const struct trace_print_flags_u64 *flag_array, + size_t flag_array_size) { unsigned long long mask; const char *str; const char *ret = trace_seq_buffer_ptr(p); int i, first = 1; - for (i = 0; flag_array[i].name && flags; i++) { + for (i = 0; i < flag_array_size && flags; i++) { mask = flag_array[i].mask; if ((flags & mask) != mask) @@ -170,12 +173,13 @@ EXPORT_SYMBOL(trace_print_flags_seq_u64); const char * trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, - const struct trace_print_flags_u64 *symbol_array) + const struct trace_print_flags_u64 *symbol_array, + size_t symbol_array_size) { int i; const char *ret = trace_seq_buffer_ptr(p); - for (i = 0; symbol_array[i].name; i++) { + for (i = 0; i < symbol_array_size; i++) { if (val != symbol_array[i].mask) continue; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 37317b81fcda..8ad72e17d8eb 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -174,7 +174,6 @@ sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadat { O_NOFOLLOW, "O_NOFOLLOW" }, { O_NOATIME, "O_NOATIME" }, { O_CLOEXEC, "O_CLOEXEC" }, - { -1, NULL } }; trace_seq_printf(s, "%s(", entry->name); @@ -205,7 +204,7 @@ sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadat trace_seq_puts(s, "O_RDONLY|"); } - trace_print_flags_seq(s, "|", bits, __flags); + trace_print_flags_seq(s, "|", bits, __flags, ARRAY_SIZE(__flags)); /* * trace_print_flags_seq() adds a '\0' to the * buffer, but this needs to append more to the seq. -- cgit v1.2.3 From 8ef2807042d0886a85bbcb0aba1a2a277680dc4a Mon Sep 17 00:00:00 2001 From: "Thomas Weißschuh (Schneider Electric)" Date: Wed, 11 Mar 2026 11:15:15 +0100 Subject: hrtimer: Remove hrtimer_get_expires_ns() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no users left. Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-6-095357392669@linutronix.de --- include/linux/hrtimer.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c087b7142330..9ced498fefaa 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -116,11 +116,6 @@ static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) return timer->_softexpires; } -static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) -{ - return ktime_to_ns(timer->node.expires); -} - ktime_t hrtimer_cb_get_time(const struct hrtimer *timer); static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) -- cgit v1.2.3 From b94c076dd949426d09e5d415304acb3f951d9069 Mon Sep 17 00:00:00 2001 From: "Thomas Weißschuh (Schneider Electric)" Date: Wed, 11 Mar 2026 11:15:17 +0100 Subject: hrtimer: Drop spurious space in 'enum hrtimer_base_type' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This spurious space makes grepping for the enum definition annoying. Remove it. Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-8-095357392669@linutronix.de --- include/linux/hrtimer_defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 0f851b2432c3..e6d4dc1b61e0 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -35,7 +35,7 @@ struct hrtimer_clock_base { ktime_t offset; } __hrtimer_clock_base_align; -enum hrtimer_base_type { +enum hrtimer_base_type { HRTIMER_BASE_MONOTONIC, HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, -- cgit v1.2.3 From f12ef5cb4e035e15f0c324c41ff402441578ffda Mon Sep 17 00:00:00 2001 From: "Thomas Weißschuh (Schneider Electric)" Date: Wed, 11 Mar 2026 11:15:19 +0100 Subject: hrtimer: Mark index and clockid of clock base as const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These fields are initialized once and are never supposed to change. Mark them as const to make this explicit. Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-10-095357392669@linutronix.de --- include/linux/hrtimer_defs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index e6d4dc1b61e0..a03240c0b14f 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -26,8 +26,8 @@ */ struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; - unsigned int index; - clockid_t clockid; + const unsigned int index; + const clockid_t clockid; seqcount_raw_spinlock_t seq; ktime_t expires_next; struct hrtimer *running; -- cgit v1.2.3 From f27fc117cf8fba56e0619694e685f9bca9b9cb82 Mon Sep 17 00:00:00 2001 From: "Thomas Weißschuh (Schneider Electric)" Date: Wed, 11 Mar 2026 11:15:20 +0100 Subject: hrtimer: Remove trailing comma after HRTIMER_MAX_CLOCK_BASES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HRTIMER_MAX_CLOCK_BASES is required to stay the last value of the enum. Drop the trailing comma so no new members are added after it by mistake. Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-11-095357392669@linutronix.de --- include/linux/hrtimer_defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index a03240c0b14f..52ed9e46ff13 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -44,7 +44,7 @@ enum hrtimer_base_type { HRTIMER_BASE_REALTIME_SOFT, HRTIMER_BASE_BOOTTIME_SOFT, HRTIMER_BASE_TAI_SOFT, - HRTIMER_MAX_CLOCK_BASES, + HRTIMER_MAX_CLOCK_BASES }; /** -- cgit v1.2.3 From 6f459eda8b60382efa0da2ca025c26a2018adc87 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 Mar 2026 12:44:51 +0000 Subject: tcp: add tcp_release_cb_cond() helper Majority of tcp_release_cb() calls do nothing at all. Provide tcp_release_cb_cond() helper so that release_sock() can avoid these calls. Also hint the compiler that __release_sock() and wake_up() are rarely called. $ scripts/bloat-o-meter -t vmlinux.old vmlinux.new add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-77 (-77) Function old new delta release_sock 258 181 -77 Total: Before=25235790, After=25235713, chg -0.00% Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260310124451.2280968-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/tcp.h | 7 +++++++ include/net/tcp.h | 14 ++++++++++++++ net/core/sock.c | 14 ++++++++------ net/ipv4/tcp_output.c | 5 ----- 4 files changed, 29 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index c44cf9ae8d16..bcebc4f07532 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -548,6 +548,13 @@ enum tsq_flags { TCPF_ACK_DEFERRED = BIT(TCP_ACK_DEFERRED), }; +/* Flags of interest for tcp_release_cb() */ +#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \ + TCPF_WRITE_TIMER_DEFERRED | \ + TCPF_DELACK_TIMER_DEFERRED | \ + TCPF_MTU_REDUCED_DEFERRED | \ + TCPF_ACK_DEFERRED) + #define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk) /* Variant of tcp_sk() upgrading a const sock to a read/write tcp socket. diff --git a/include/net/tcp.h b/include/net/tcp.h index 9f0aee9e5d76..48dffcca0a71 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -375,7 +375,21 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags); int tcp_wmem_schedule(struct sock *sk, int copy); void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int size_goal); + void tcp_release_cb(struct sock *sk); + +static inline bool tcp_release_cb_cond(struct sock *sk) +{ +#ifdef CONFIG_INET + if (likely(sk->sk_prot->release_cb == tcp_release_cb)) { + if (unlikely(smp_load_acquire(&sk->sk_tsq_flags) & TCP_DEFERRED_ALL)) + tcp_release_cb(sk); + return true; + } +#endif + return false; +} + void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index f4e2ff23d60e..fdaf66e6dc18 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3807,16 +3807,18 @@ EXPORT_SYMBOL(lock_sock_nested); void release_sock(struct sock *sk) { spin_lock_bh(&sk->sk_lock.slock); - if (sk->sk_backlog.tail) - __release_sock(sk); - if (sk->sk_prot->release_cb) - INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, - tcp_release_cb, sk); + if (unlikely(sk->sk_backlog.tail)) + __release_sock(sk); + if (sk->sk_prot->release_cb) { + if (!tcp_release_cb_cond(sk)) + sk->sk_prot->release_cb(sk); + } sock_release_ownership(sk); - if (waitqueue_active(&sk->sk_lock.wq)) + if (unlikely(waitqueue_active(&sk->sk_lock.wq))) wake_up(&sk->sk_lock.wq); + spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(release_sock); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a53802f28dd1..34a25ef61006 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1320,11 +1320,6 @@ static void tcp_tsq_workfn(struct work_struct *work) } } -#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \ - TCPF_WRITE_TIMER_DEFERRED | \ - TCPF_DELACK_TIMER_DEFERRED | \ - TCPF_MTU_REDUCED_DEFERRED | \ - TCPF_ACK_DEFERRED) /** * tcp_release_cb - tcp release_sock() callback * @sk: socket -- cgit v1.2.3 From c670267ff50d5f9beb486f0203cdede580a99ae3 Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Fri, 6 Feb 2026 14:20:03 +0800 Subject: tty: constify tty_ldisc_ops tty_ldisc_ops is not modified once registered, so make it const. Signed-off-by: Qingfang Deng Link: https://patch.msgid.link/20260206062004.1273890-1-dqfext@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_ldisc.c | 16 ++++++++-------- include/linux/tty_ldisc.h | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index 888f2f8f9481..27fe8236f662 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -44,7 +44,7 @@ enum { static DEFINE_RAW_SPINLOCK(tty_ldiscs_lock); /* Line disc dispatch table */ -static struct tty_ldisc_ops *tty_ldiscs[NR_LDISCS]; +static const struct tty_ldisc_ops *tty_ldiscs[NR_LDISCS]; /** * tty_register_ldisc - install a line discipline @@ -55,7 +55,7 @@ static struct tty_ldisc_ops *tty_ldiscs[NR_LDISCS]; * * Locking: takes %tty_ldiscs_lock to guard against ldisc races */ -int tty_register_ldisc(struct tty_ldisc_ops *new_ldisc) +int tty_register_ldisc(const struct tty_ldisc_ops *new_ldisc) { unsigned long flags; @@ -80,7 +80,7 @@ EXPORT_SYMBOL(tty_register_ldisc); * Locking: takes %tty_ldiscs_lock to guard against ldisc races */ -void tty_unregister_ldisc(struct tty_ldisc_ops *ldisc) +void tty_unregister_ldisc(const struct tty_ldisc_ops *ldisc) { unsigned long flags; @@ -90,10 +90,10 @@ void tty_unregister_ldisc(struct tty_ldisc_ops *ldisc) } EXPORT_SYMBOL(tty_unregister_ldisc); -static struct tty_ldisc_ops *get_ldops(int disc) +static const struct tty_ldisc_ops *get_ldops(int disc) { unsigned long flags; - struct tty_ldisc_ops *ldops, *ret; + const struct tty_ldisc_ops *ldops, *ret; raw_spin_lock_irqsave(&tty_ldiscs_lock, flags); ret = ERR_PTR(-EINVAL); @@ -107,7 +107,7 @@ static struct tty_ldisc_ops *get_ldops(int disc) return ret; } -static void put_ldops(struct tty_ldisc_ops *ldops) +static void put_ldops(const struct tty_ldisc_ops *ldops) { unsigned long flags; @@ -139,7 +139,7 @@ int tty_ldisc_autoload = IS_BUILTIN(CONFIG_LDISC_AUTOLOAD); static struct tty_ldisc *tty_ldisc_get(struct tty_struct *tty, int disc) { struct tty_ldisc *ld; - struct tty_ldisc_ops *ldops; + const struct tty_ldisc_ops *ldops; if (disc < N_TTY || disc >= NR_LDISCS) return ERR_PTR(-EINVAL); @@ -202,7 +202,7 @@ static void tty_ldiscs_seq_stop(struct seq_file *m, void *v) static int tty_ldiscs_seq_show(struct seq_file *m, void *v) { int i = *(loff_t *)v; - struct tty_ldisc_ops *ldops; + const struct tty_ldisc_ops *ldops; ldops = get_ldops(i); if (IS_ERR(ldops)) diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index c5cccc3fc1e8..d227a58e3e49 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -266,7 +266,7 @@ struct tty_ldisc_ops { }; struct tty_ldisc { - struct tty_ldisc_ops *ops; + const struct tty_ldisc_ops *ops; struct tty_struct *tty; }; @@ -281,8 +281,8 @@ struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *); void tty_ldisc_flush(struct tty_struct *tty); -int tty_register_ldisc(struct tty_ldisc_ops *new_ldisc); -void tty_unregister_ldisc(struct tty_ldisc_ops *ldisc); +int tty_register_ldisc(const struct tty_ldisc_ops *new_ldisc); +void tty_unregister_ldisc(const struct tty_ldisc_ops *ldisc); int tty_set_ldisc(struct tty_struct *tty, int disc); #endif /* _LINUX_TTY_LDISC_H */ -- cgit v1.2.3 From 24728b93fafe0949b5353e1a7b3a94175fe26d6e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 10 Mar 2026 22:23:47 -0700 Subject: serdev: serdev.h: clean up kernel-doc comments Correct kernel-doc comment format and add a missing to avoid kernel-doc warnings: Warning: include/linux/serdev.h:49 struct member 'write_comp' not described in 'serdev_device' Warning: include/linux/serdev.h:49 struct member 'write_lock' not described in 'serdev_device' Warning: include/linux/serdev.h:68 struct member 'shutdown' not described in 'serdev_device_driver' Warning: include/linux/serdev.h:134 function parameter 'serdev' not described in 'serdev_device_put' Warning: include/linux/serdev.h:162 function parameter 'ctrl' not described in 'serdev_controller_put' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20260311052347.305612-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/linux/serdev.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 5654c58eb73c..090c93c08045 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -37,8 +37,8 @@ struct serdev_device_ops { * @nr: Device number on serdev bus. * @ctrl: serdev controller managing this device. * @ops: Device operations. - * @write_comp Completion used by serdev_device_write() internally - * @write_lock Lock to serialize access when writing data + * @write_comp: Completion used by serdev_device_write() internally + * @write_lock: Lock to serialize access when writing data */ struct serdev_device { struct device dev; @@ -60,6 +60,7 @@ static inline struct serdev_device *to_serdev_device(struct device *d) * structure. * @probe: binds this driver to a serdev device. * @remove: unbinds this driver from the serdev device. + * @shutdown: shut down this serdev device. */ struct serdev_device_driver { struct device_driver driver; @@ -129,7 +130,7 @@ static inline void serdev_device_set_drvdata(struct serdev_device *serdev, void /** * serdev_device_put() - decrement serdev device refcount - * @serdev serdev device. + * @serdev: serdev device. */ static inline void serdev_device_put(struct serdev_device *serdev) { @@ -157,7 +158,7 @@ static inline void serdev_controller_set_drvdata(struct serdev_controller *ctrl, /** * serdev_controller_put() - decrement controller refcount - * @ctrl serdev controller. + * @ctrl: serdev controller. */ static inline void serdev_controller_put(struct serdev_controller *ctrl) { -- cgit v1.2.3 From eb3b0d92c9c39890592cca6647601fe5c631efea Mon Sep 17 00:00:00 2001 From: Xin Zhao Date: Fri, 13 Feb 2026 16:50:39 +0800 Subject: tty: tty_port: add workqueue to flip TTY buffer On the embedded platform, certain critical data, such as IMU data, is transmitted through UART. The tty_flip_buffer_push() interface in the TTY layer uses system_dfl_wq to handle the flipping of the TTY buffer. Although the unbound workqueue can create new threads on demand and wake up the kworker thread on an idle CPU, it may be preempted by real-time tasks or other high-prio tasks. flush_to_ldisc() needs to wake up the relevant data handle thread. When executing __wake_up_common_lock(), it calls spin_lock_irqsave(), which does not disable preemption but disables migration in RT-Linux. This prevents the kworker thread from being migrated to other cores by CPU's balancing logic, resulting in long delays. The call trace is as follows: __wake_up_common_lock __wake_up ep_poll_callback __wake_up_common __wake_up_common_lock __wake_up n_tty_receive_buf_common n_tty_receive_buf2 tty_ldisc_receive_buf tty_port_default_receive_buf flush_to_ldisc In our system, the processing interval for each frame of IMU data transmitted via UART can experience significant jitter due to this issue. Instead of the expected 10 to 15 ms frame processing interval, we see spikes up to 30 to 35 ms. Moreover, in just one or two hours, there can be 2 to 3 occurrences of such high jitter, which is quite frequent. This jitter exceeds the software's tolerable limit of 20 ms. Introduce flip_wq in tty_port which can be set by tty_port_link_wq() or as default linked to default workqueue allocated when tty_register_driver(). The default workqueue is allocated with flag WQ_SYSFS, so that cpumask and nice can be set dynamically. The execution timing of tty_port_link_wq() is not clearly restricted. The newly added function tty_port_link_driver_wq() checks whether the flip_wq of the tty_port has already been assigned when linking the default tty_driver's workqueue to the port. After the user has set a custom workqueue for a certain tty_port using tty_port_link_wq(), the system will only use this custom workqueue, even if tty_driver does not have %TTY_DRIVER_NO_WORKQUEUE flag. When tty_port register device, flip_wq link operation is done by tty_port_link_driver_wq(), but for in-memory devices the link operation cannot cover all the cases. Although tty_port_install() is dedicated for in-memory devices lik PTY to link port allocated on demand, the logic of tty_port_install() is so simple that people may not call it, vc_cons[0].d->port is one such case. We check the buf.flip_wq when flip TTY buffer, if buf.flip_wq of TTY port is NULL, use system_dfl_wq as a backup. To avoid naming conflict of the default tty_driver's workqueue, using '"%s-%s", driver->name, driver->driver_name' as the workqueue name. In cases where driver_name is not specified and therefore is NULL, the workqueue is not created. Drivers that do not define driver_name are potentially in-memory devices like vty, which generally do not require special workqueue settings. Even with the combination of name and driver_name, the workqueue names can still be duplicated, as many tty serial drivers use "ttyS" as dev_name and "serial" as driver_name. I modified the conflicting driver_name of these drivers by appending a suffix of _xx based on the corresponding .c file. If this modification is not made, it could not only lead to duplicate workqueue names but also result in duplicate entries for the /proc/tty/driver/ nodes. Introduce %TTY_DRIVER_NO_WORKQUEUE flag meaning not to create the default single tty_driver workqueue. Two reasons why need to introduce the %TTY_DRIVER_NO_WORKQUEUE flag: 1. If the WQ_SYSFS parameter is enabled, workqueue_sysfs_register() will fail when trying to create a workqueue with the same name. The pty is an example of this; if both CONFIG_LEGACY_PTYS and CONFIG_UNIX98_PTYS are enabled, the call to tty_register_driver() in unix98_pty_init() will fail. 2. Different TTY ports may be used for different tasks, which may require separate core binding control via workqueues. In this case, the workqueue created by default in the TTY driver is unnecessary. Enabling this flag prevents the creation of this redundant workqueue. After applying this patch, we can set the related UART TTY flip buffer workqueue by sysfs. We set the cpumask to CPU cores associated with the IMU tasks, and set the nice to -20. Testing has shown significant improvement in the previously described issue, with almost no stuttering occurring anymore. Tested-by: Tommaso Merciai Tested-by: Marek Szyprowski Signed-off-by: Xin Zhao Link: https://patch.msgid.link/20260213085039.3274704-1-jackzxcui1989@163.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/pty.c | 12 ++++++++---- drivers/tty/serial/8250/8250_core.c | 2 +- drivers/tty/serial/apbuart.c | 2 +- drivers/tty/serial/dz.c | 2 +- drivers/tty/serial/ip22zilog.c | 2 +- drivers/tty/serial/zs.c | 2 +- drivers/tty/tty_buffer.c | 15 +++++++++++---- drivers/tty/tty_io.c | 25 ++++++++++++++++++++++++- drivers/tty/tty_port.c | 22 ++++++++++++++++++++++ include/linux/tty_buffer.h | 1 + include/linux/tty_driver.h | 7 +++++++ include/linux/tty_port.h | 13 +++++++++++++ 12 files changed, 91 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index cb427e93372d..cc7f7091ed9a 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -532,14 +532,16 @@ static void __init legacy_pty_init(void) pty_driver = tty_alloc_driver(legacy_count, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | - TTY_DRIVER_DYNAMIC_ALLOC); + TTY_DRIVER_DYNAMIC_ALLOC | + TTY_DRIVER_NO_WORKQUEUE); if (IS_ERR(pty_driver)) panic("Couldn't allocate pty driver"); pty_slave_driver = tty_alloc_driver(legacy_count, TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_REAL_RAW | - TTY_DRIVER_DYNAMIC_ALLOC); + TTY_DRIVER_DYNAMIC_ALLOC | + TTY_DRIVER_NO_WORKQUEUE); if (IS_ERR(pty_slave_driver)) panic("Couldn't allocate pty slave driver"); @@ -849,7 +851,8 @@ static void __init unix98_pty_init(void) TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV | TTY_DRIVER_DEVPTS_MEM | - TTY_DRIVER_DYNAMIC_ALLOC); + TTY_DRIVER_DYNAMIC_ALLOC | + TTY_DRIVER_NO_WORKQUEUE); if (IS_ERR(ptm_driver)) panic("Couldn't allocate Unix98 ptm driver"); pts_driver = tty_alloc_driver(NR_UNIX98_PTY_MAX, @@ -857,7 +860,8 @@ static void __init unix98_pty_init(void) TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV | TTY_DRIVER_DEVPTS_MEM | - TTY_DRIVER_DYNAMIC_ALLOC); + TTY_DRIVER_DYNAMIC_ALLOC | + TTY_DRIVER_NO_WORKQUEUE); if (IS_ERR(pts_driver)) panic("Couldn't allocate Unix98 pts driver"); diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index d2e2c5dfef99..a428e88938eb 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -524,7 +524,7 @@ console_initcall(univ8250_console_init); struct uart_driver serial8250_reg = { .owner = THIS_MODULE, - .driver_name = "serial", + .driver_name = "serial_8250", .dev_name = "ttyS", .major = TTY_MAJOR, .minor = 64, diff --git a/drivers/tty/serial/apbuart.c b/drivers/tty/serial/apbuart.c index 364599f256db..3e46341cfff8 100644 --- a/drivers/tty/serial/apbuart.c +++ b/drivers/tty/serial/apbuart.c @@ -505,7 +505,7 @@ console_initcall(apbuart_console_init); static struct uart_driver grlib_apbuart_driver = { .owner = THIS_MODULE, - .driver_name = "serial", + .driver_name = "serial_apbuart", .dev_name = "ttyS", .major = SERIAL_APBUART_MAJOR, .minor = SERIAL_APBUART_MINOR, diff --git a/drivers/tty/serial/dz.c b/drivers/tty/serial/dz.c index eba91daedef8..e53c54353c3e 100644 --- a/drivers/tty/serial/dz.c +++ b/drivers/tty/serial/dz.c @@ -914,7 +914,7 @@ console_initcall(dz_serial_console_init); static struct uart_driver dz_reg = { .owner = THIS_MODULE, - .driver_name = "serial", + .driver_name = "serial_dz", .dev_name = "ttyS", .major = TTY_MAJOR, .minor = 64, diff --git a/drivers/tty/serial/ip22zilog.c b/drivers/tty/serial/ip22zilog.c index 6e19c6713849..a69b06893d9e 100644 --- a/drivers/tty/serial/ip22zilog.c +++ b/drivers/tty/serial/ip22zilog.c @@ -1015,7 +1015,7 @@ static struct console ip22zilog_console = { static struct uart_driver ip22zilog_reg = { .owner = THIS_MODULE, - .driver_name = "serial", + .driver_name = "serial_ip22zilog", .dev_name = "ttyS", .major = TTY_MAJOR, .minor = 64, diff --git a/drivers/tty/serial/zs.c b/drivers/tty/serial/zs.c index 79ea7108a0f3..72a3c0d90f40 100644 --- a/drivers/tty/serial/zs.c +++ b/drivers/tty/serial/zs.c @@ -1252,7 +1252,7 @@ console_initcall(zs_serial_console_init); static struct uart_driver zs_reg = { .owner = THIS_MODULE, - .driver_name = "serial", + .driver_name = "serial_zs", .dev_name = "ttyS", .major = TTY_MAJOR, .minor = 64, diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index 79ec953824d5..96be90db53b7 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -59,6 +59,13 @@ void tty_buffer_lock_exclusive(struct tty_port *port) } EXPORT_SYMBOL_GPL(tty_buffer_lock_exclusive); +static bool tty_buffer_queue_work(struct tty_bufhead *buf) +{ + struct workqueue_struct *flip_wq = READ_ONCE(buf->flip_wq); + + return queue_work(flip_wq ?: system_dfl_wq, &buf->work); +} + /** * tty_buffer_unlock_exclusive - release exclusive access * @port: tty port owning the flip buffer @@ -76,7 +83,7 @@ void tty_buffer_unlock_exclusive(struct tty_port *port) mutex_unlock(&buf->lock); if (restart) - queue_work(system_dfl_wq, &buf->work); + tty_buffer_queue_work(buf); } EXPORT_SYMBOL_GPL(tty_buffer_unlock_exclusive); @@ -530,7 +537,7 @@ void tty_flip_buffer_push(struct tty_port *port) struct tty_bufhead *buf = &port->buf; tty_flip_buffer_commit(buf->tail); - queue_work(system_dfl_wq, &buf->work); + tty_buffer_queue_work(buf); } EXPORT_SYMBOL(tty_flip_buffer_push); @@ -560,7 +567,7 @@ int tty_insert_flip_string_and_push_buffer(struct tty_port *port, tty_flip_buffer_commit(buf->tail); spin_unlock_irqrestore(&port->lock, flags); - queue_work(system_dfl_wq, &buf->work); + tty_buffer_queue_work(buf); return size; } @@ -613,7 +620,7 @@ void tty_buffer_set_lock_subclass(struct tty_port *port) bool tty_buffer_restart_work(struct tty_port *port) { - return queue_work(system_dfl_wq, &port->buf.work); + return tty_buffer_queue_work(&port->buf); } bool tty_buffer_cancel_work(struct tty_port *port) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index a5d0457e0e28..6b283fd03ff8 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -3443,10 +3443,27 @@ int tty_register_driver(struct tty_driver *driver) if (error < 0) goto err; + /* + * Drivers that do not define driver_name are potentially in-memory devices + * like vty, which generally do not require special workqueue settings. + */ + if (!(driver->flags & TTY_DRIVER_NO_WORKQUEUE) && driver->driver_name) { + driver->flip_wq = alloc_workqueue("%s-%s", WQ_UNBOUND | WQ_SYSFS, + 0, driver->name, driver->driver_name); + if (!driver->flip_wq) { + error = -ENOMEM; + goto err_unreg_char; + } + for (i = 0; i < driver->num; i++) { + if (driver->ports[i]) + tty_port_link_driver_wq(driver->ports[i], driver); + } + } + if (driver->flags & TTY_DRIVER_DYNAMIC_ALLOC) { error = tty_cdev_add(driver, dev, 0, driver->num); if (error) - goto err_unreg_char; + goto err_destroy_wq; } scoped_guard(mutex, &tty_mutex) @@ -3472,6 +3489,10 @@ err_unreg_devs: scoped_guard(mutex, &tty_mutex) list_del(&driver->tty_drivers); +err_destroy_wq: + if (driver->flip_wq) + destroy_workqueue(driver->flip_wq); + err_unreg_char: unregister_chrdev_region(dev, driver->num); err: @@ -3491,6 +3512,8 @@ void tty_unregister_driver(struct tty_driver *driver) driver->num); scoped_guard(mutex, &tty_mutex) list_del(&driver->tty_drivers); + if (driver->flip_wq) + destroy_workqueue(driver->flip_wq); } EXPORT_SYMBOL(tty_unregister_driver); diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index fe67c5cb0a3f..54359310e293 100644 --- a/drivers/tty/tty_port.c +++ b/drivers/tty/tty_port.c @@ -99,6 +99,23 @@ void tty_port_init(struct tty_port *port) } EXPORT_SYMBOL(tty_port_init); +/** + * tty_port_link_wq - link tty_port and flip workqueue + * @port: tty_port of the device + * @flip_wq: workqueue to queue flip buffer work on + * + * Whenever %TTY_DRIVER_NO_WORKQUEUE is used, every tty_port can be linked to + * a workqueue manually by this function. + * tty_port will use system_dfl_wq when buf.flip_wq is NULL. + * + * Note that tty_port API will NOT destroy the workqueue. + */ +void tty_port_link_wq(struct tty_port *port, struct workqueue_struct *flip_wq) +{ + port->buf.flip_wq = flip_wq; +} +EXPORT_SYMBOL_GPL(tty_port_link_wq); + /** * tty_port_link_device - link tty and tty_port * @port: tty_port of the device @@ -157,6 +174,7 @@ struct device *tty_port_register_device_attr(struct tty_port *port, const struct attribute_group **attr_grp) { tty_port_link_device(port, driver, index); + tty_port_link_driver_wq(port, driver); return tty_register_device_attr(driver, index, device, drvdata, attr_grp); } @@ -183,6 +201,7 @@ struct device *tty_port_register_device_attr_serdev(struct tty_port *port, struct device *dev; tty_port_link_device(port, driver, index); + tty_port_link_driver_wq(port, driver); dev = serdev_tty_port_register(port, host, parent, driver, index); if (PTR_ERR(dev) != -ENODEV) { @@ -210,6 +229,7 @@ void tty_port_unregister_device(struct tty_port *port, { int ret; + WRITE_ONCE(port->buf.flip_wq, NULL); ret = serdev_tty_port_unregister(port); if (ret == 0) return; @@ -257,6 +277,7 @@ void tty_port_destroy(struct tty_port *port) { tty_buffer_cancel_work(port); tty_buffer_free_all(port); + WRITE_ONCE(port->buf.flip_wq, NULL); } EXPORT_SYMBOL(tty_port_destroy); @@ -703,6 +724,7 @@ int tty_port_install(struct tty_port *port, struct tty_driver *driver, struct tty_struct *tty) { tty->port = port; + tty_port_link_driver_wq(port, driver); return tty_standard_install(driver, tty); } EXPORT_SYMBOL_GPL(tty_port_install); diff --git a/include/linux/tty_buffer.h b/include/linux/tty_buffer.h index 31125e3be3c5..48adcb0e8ff3 100644 --- a/include/linux/tty_buffer.h +++ b/include/linux/tty_buffer.h @@ -34,6 +34,7 @@ static inline u8 *flag_buf_ptr(struct tty_buffer *b, unsigned int ofs) struct tty_bufhead { struct tty_buffer *head; /* Queue head */ + struct workqueue_struct *flip_wq; struct work_struct work; struct mutex lock; atomic_t priority; diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 188ee9b768eb..1f2896e56e77 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -69,6 +69,10 @@ struct serial_struct; * Do not create numbered ``/dev`` nodes. For example, create * ``/dev/ttyprintk`` and not ``/dev/ttyprintk0``. Applicable only when a * driver for a single tty device is being allocated. + * + * @TTY_DRIVER_NO_WORKQUEUE: + * Do not create workqueue when tty_register_driver(). Whenever set, flip + * buffer workqueue can be set by tty_port_link_wq() for every port. */ enum tty_driver_flag { TTY_DRIVER_INSTALLED = BIT(0), @@ -79,6 +83,7 @@ enum tty_driver_flag { TTY_DRIVER_HARDWARE_BREAK = BIT(5), TTY_DRIVER_DYNAMIC_ALLOC = BIT(6), TTY_DRIVER_UNNUMBERED_NODE = BIT(7), + TTY_DRIVER_NO_WORKQUEUE = BIT(8), }; enum tty_driver_type { @@ -506,6 +511,7 @@ struct tty_operations { * @flags: tty driver flags (%TTY_DRIVER_) * @proc_entry: proc fs entry, used internally * @other: driver of the linked tty; only used for the PTY driver + * @flip_wq: workqueue to queue flip buffer work on * @ttys: array of active &struct tty_struct, set by tty_standard_install() * @ports: array of &struct tty_port; can be set during initialization by * tty_port_link_device() and similar @@ -539,6 +545,7 @@ struct tty_driver { unsigned long flags; struct proc_dir_entry *proc_entry; struct tty_driver *other; + struct workqueue_struct *flip_wq; /* * Pointer to the tty data structures diff --git a/include/linux/tty_port.h b/include/linux/tty_port.h index 660c254f1efe..d2a7882c0b58 100644 --- a/include/linux/tty_port.h +++ b/include/linux/tty_port.h @@ -138,6 +138,7 @@ struct tty_port { kernel */ void tty_port_init(struct tty_port *port); +void tty_port_link_wq(struct tty_port *port, struct workqueue_struct *flip_wq); void tty_port_link_device(struct tty_port *port, struct tty_driver *driver, unsigned index); struct device *tty_port_register_device(struct tty_port *port, @@ -165,6 +166,18 @@ static inline struct tty_port *tty_port_get(struct tty_port *port) return NULL; } +/* + * Never overwrite the workqueue set by tty_port_link_wq(). + * No effect when %TTY_DRIVER_NO_WORKQUEUE is set, as driver->flip_wq is + * %NULL. + */ +static inline void tty_port_link_driver_wq(struct tty_port *port, + struct tty_driver *driver) +{ + if (!port->buf.flip_wq) + tty_port_link_wq(port, driver->flip_wq); +} + /* If the cts flow control is enabled, return true. */ static inline bool tty_port_cts_enabled(const struct tty_port *port) { -- cgit v1.2.3 From 59621105ffca7a33955f56bc7dee0923992f5832 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 23 Feb 2026 14:37:16 +0100 Subject: of: provide of_machine_read_compatible() Provide a helper function allowing users to read the compatible string of the machine, hiding the access to the root node. Reviewed-by: Christophe Leroy (CS GROUP) Signed-off-by: Bartosz Golaszewski Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20260223-soc-of-root-v2-1-b45da45903c8@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- drivers/of/base.c | 15 +++++++++++++++ include/linux/of.h | 8 ++++++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/drivers/of/base.c b/drivers/of/base.c index 57420806c1a2..b70aec32e0e3 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -434,6 +434,21 @@ bool of_machine_compatible_match(const char *const *compats) } EXPORT_SYMBOL(of_machine_compatible_match); +/** + * of_machine_read_compatible - Get the compatible string of this machine + * @compatible: address at which the address of the compatible string will be + * stored + * @index: index of the compatible entry in the list + * + * Returns: + * 0 on success, negative error number on failure. + */ +int of_machine_read_compatible(const char **compatible, unsigned int index) +{ + return of_property_read_string_index(of_root, "compatible", index, compatible); +} +EXPORT_SYMBOL_GPL(of_machine_read_compatible); + /** * of_machine_device_match - Test root of device tree against a of_device_id array * @matches: NULL terminated array of of_device_id match structures to search in diff --git a/include/linux/of.h b/include/linux/of.h index be6ec4916adf..7df971d52b55 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -426,6 +426,8 @@ static inline bool of_machine_is_compatible(const char *compat) return of_machine_compatible_match(compats); } +int of_machine_read_compatible(const char **compatible, unsigned int index); + extern int of_add_property(struct device_node *np, struct property *prop); extern int of_remove_property(struct device_node *np, struct property *prop); extern int of_update_property(struct device_node *np, struct property *newprop); @@ -851,6 +853,12 @@ static inline int of_machine_is_compatible(const char *compat) return 0; } +static inline int of_machine_read_compatible(const char **compatible, + unsigned int index) +{ + return -ENOSYS; +} + static inline int of_add_property(struct device_node *np, struct property *prop) { return 0; -- cgit v1.2.3 From c86d3b7b847cc9b32a17117cfd71679e4315fd9f Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 23 Feb 2026 14:37:17 +0100 Subject: of: provide of_machine_read_model() Provide a helper function allowing users to read the model string of the machine, hiding the access to the root node. Signed-off-by: Bartosz Golaszewski Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20260223-soc-of-root-v2-2-b45da45903c8@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- drivers/of/base.c | 13 +++++++++++++ include/linux/of.h | 6 ++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/drivers/of/base.c b/drivers/of/base.c index b70aec32e0e3..bf4a51887d74 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -449,6 +449,19 @@ int of_machine_read_compatible(const char **compatible, unsigned int index) } EXPORT_SYMBOL_GPL(of_machine_read_compatible); +/** + * of_machine_read_model - Get the model string of this machine + * @model: address at which the address of the model string will be stored + * + * Returns: + * 0 on success, negative error number on failure. + */ +int of_machine_read_model(const char **model) +{ + return of_property_read_string(of_root, "model", model); +} +EXPORT_SYMBOL_GPL(of_machine_read_model); + /** * of_machine_device_match - Test root of device tree against a of_device_id array * @matches: NULL terminated array of of_device_id match structures to search in diff --git a/include/linux/of.h b/include/linux/of.h index 7df971d52b55..2b95777f16f6 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -427,6 +427,7 @@ static inline bool of_machine_is_compatible(const char *compat) } int of_machine_read_compatible(const char **compatible, unsigned int index); +int of_machine_read_model(const char **model); extern int of_add_property(struct device_node *np, struct property *prop); extern int of_remove_property(struct device_node *np, struct property *prop); @@ -859,6 +860,11 @@ static inline int of_machine_read_compatible(const char **compatible, return -ENOSYS; } +static inline int of_machine_read_model(const char **model) +{ + return -ENOSYS; +} + static inline int of_add_property(struct device_node *np, struct property *prop) { return 0; -- cgit v1.2.3 From 030706e954c10749da8c75464c6b02cb30cb00aa Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 23 Feb 2026 14:37:19 +0100 Subject: base: soc: rename and export soc_device_get_machine() Some SoC drivers reimplement the functionality of soc_device_get_machine(). Make this function accessible through the sys_soc.h header and rename it to a more descriptive name. Reviewed-by: Christophe Leroy (CS GROUP) Signed-off-by: Bartosz Golaszewski Reviewed-by: Rob Herring (Arm) Link: https://patch.msgid.link/20260223-soc-of-root-v2-4-b45da45903c8@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/soc.c | 13 +++++-------- include/linux/sys_soc.h | 10 ++++++++++ 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/soc.c b/drivers/base/soc.c index 48e2f0dbd330..65ce72d49230 100644 --- a/drivers/base/soc.c +++ b/drivers/base/soc.c @@ -111,17 +111,14 @@ static void soc_release(struct device *dev) kfree(soc_dev); } -static void soc_device_get_machine(struct soc_device_attribute *soc_dev_attr) +int soc_attr_read_machine(struct soc_device_attribute *soc_dev_attr) { - struct device_node *np; - if (soc_dev_attr->machine) - return; + return -EBUSY; - np = of_find_node_by_path("/"); - of_property_read_string(np, "model", &soc_dev_attr->machine); - of_node_put(np); + return of_machine_read_model(&soc_dev_attr->machine); } +EXPORT_SYMBOL_GPL(soc_attr_read_machine); static struct soc_device_attribute *early_soc_dev_attr; @@ -131,7 +128,7 @@ struct soc_device *soc_device_register(struct soc_device_attribute *soc_dev_attr const struct attribute_group **soc_attr_groups; int ret; - soc_device_get_machine(soc_dev_attr); + soc_attr_read_machine(soc_dev_attr); if (!soc_bus_registered) { if (early_soc_dev_attr) diff --git a/include/linux/sys_soc.h b/include/linux/sys_soc.h index d9b3cf0f410c..f19f5cec18e2 100644 --- a/include/linux/sys_soc.h +++ b/include/linux/sys_soc.h @@ -37,6 +37,16 @@ void soc_device_unregister(struct soc_device *soc_dev); */ struct device *soc_device_to_device(struct soc_device *soc); +/** + * soc_attr_read_machine - retrieve the machine model and store it in + * the soc_device_attribute structure + * @soc_dev_attr: SoC attribute structure to store the model in + * + * Returns: + * 0 on success, negative error number on failure. + */ +int soc_attr_read_machine(struct soc_device_attribute *soc_dev_attr); + #ifdef CONFIG_SOC_BUS const struct soc_device_attribute *soc_device_match( const struct soc_device_attribute *matches); -- cgit v1.2.3 From bb729bf1d6fdf5c2087c1651165c74cef0da1742 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Tue, 10 Mar 2026 23:57:53 +0800 Subject: driver core: Add conditional guard support for device_lock() Introduce conditional guard version of device_lock() for scenarios that require conditional device lock holding. Suggested-by: Dan Williams Reviewed-by: Dan Williams Acked-by: Greg Kroah-Hartman Signed-off-by: Li Ming Link: https://patch.msgid.link/20260310-fix_access_endpoint_without_drv_check-v1-1-94fe919a0b87@zohomail.com Signed-off-by: Danilo Krummrich --- include/linux/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index 0be95294b6e6..4fafee80524b 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -911,6 +911,7 @@ static inline void device_unlock(struct device *dev) } DEFINE_GUARD(device, struct device *, device_lock(_T), device_unlock(_T)) +DEFINE_GUARD_COND(device, _intr, device_lock_interruptible(_T), _RET == 0) static inline void device_lock_assert(struct device *dev) { -- cgit v1.2.3 From e416e7fa417b2d2604c1526a2f9cc38da7ced166 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 10 Mar 2026 22:29:53 -0700 Subject: tee: clean up tee_core.h kernel-doc Use the correct struct member name and function parameter name in kernel-doc comments. Move a macro that was between a struct's documentation and its declaration. These eliminate the following kernel-doc warnings: Warning: include/linux/tee_core.h:73 struct member 'c_no_users' not described in 'tee_device' Warning: include/linux/tee_core.h:132 #define TEE_DESC_PRIVILEGED 0x1; error: Cannot parse struct or union! Warning: include/linux/tee_core.h:257 function parameter 'connection_data' not described in 'tee_session_calc_client_uuid' Warning: include/linux/tee_core.h:320 function parameter 'teedev' not described in 'tee_get_drvdata' Signed-off-by: Randy Dunlap Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- include/linux/tee_core.h | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h index ee5f0bd41f43..f993d5118edd 100644 --- a/include/linux/tee_core.h +++ b/include/linux/tee_core.h @@ -50,7 +50,7 @@ enum tee_dma_heap_id { * @dev: embedded basic device structure * @cdev: embedded cdev * @num_users: number of active users of this device - * @c_no_user: completion used when unregistering the device + * @c_no_users: completion used when unregistering the device * @mutex: mutex protecting @num_users and @idr * @idr: register of user space shared memory objects allocated or * registered on this device @@ -132,6 +132,7 @@ struct tee_driver_ops { /* Size for TEE revision string buffer used by get_tee_revision(). */ #define TEE_REVISION_STR_SIZE 128 +#define TEE_DESC_PRIVILEGED 0x1 /** * struct tee_desc - Describes the TEE driver to the subsystem * @name: name of driver @@ -139,7 +140,6 @@ struct tee_driver_ops { * @owner: module providing the driver * @flags: Extra properties of driver, defined by TEE_DESC_* below */ -#define TEE_DESC_PRIVILEGED 0x1 struct tee_desc { const char *name; const struct tee_driver_ops *ops; @@ -187,7 +187,7 @@ struct tee_protmem_pool_ops { * Allocates a new struct tee_device instance. The device is * removed by tee_device_unregister(). * - * @returns a pointer to a 'struct tee_device' or an ERR_PTR on failure + * @returns: a pointer to a 'struct tee_device' or an ERR_PTR on failure */ struct tee_device *tee_device_alloc(const struct tee_desc *teedesc, struct device *dev, @@ -201,7 +201,7 @@ struct tee_device *tee_device_alloc(const struct tee_desc *teedesc, * tee_device_unregister() need to be called to remove the @teedev if * this function fails. * - * @returns < 0 on failure + * @returns: < 0 on failure */ int tee_device_register(struct tee_device *teedev); @@ -254,14 +254,14 @@ void tee_device_set_dev_groups(struct tee_device *teedev, * tee_session_calc_client_uuid() - Calculates client UUID for session * @uuid: Resulting UUID * @connection_method: Connection method for session (TEE_IOCTL_LOGIN_*) - * @connectuon_data: Connection data for opening session + * @connection_data: Connection data for opening session * * Based on connection method calculates UUIDv5 based client UUID. * * For group based logins verifies that calling process has specified * credentials. * - * @return < 0 on failure + * @returns: < 0 on failure */ int tee_session_calc_client_uuid(uuid_t *uuid, u32 connection_method, const u8 connection_data[TEE_IOCTL_UUID_LEN]); @@ -295,7 +295,7 @@ struct tee_shm_pool_ops { * @paddr: Physical address of start of pool * @size: Size in bytes of the pool * - * @returns pointer to a 'struct tee_shm_pool' or an ERR_PTR on failure. + * @returns: pointer to a 'struct tee_shm_pool' or an ERR_PTR on failure. */ struct tee_shm_pool *tee_shm_pool_alloc_res_mem(unsigned long vaddr, phys_addr_t paddr, size_t size, @@ -318,14 +318,16 @@ static inline void tee_shm_pool_free(struct tee_shm_pool *pool) * @paddr: Physical address of start of pool * @size: Size in bytes of the pool * - * @returns pointer to a 'struct tee_protmem_pool' or an ERR_PTR on failure. + * @returns: pointer to a 'struct tee_protmem_pool' or an ERR_PTR on failure. */ struct tee_protmem_pool *tee_protmem_static_pool_alloc(phys_addr_t paddr, size_t size); /** * tee_get_drvdata() - Return driver_data pointer - * @returns the driver_data pointer supplied to tee_register(). + * @teedev: Pointer to the tee_device + * + * @returns: the driver_data pointer supplied to tee_register(). */ void *tee_get_drvdata(struct tee_device *teedev); @@ -334,7 +336,7 @@ void *tee_get_drvdata(struct tee_device *teedev); * TEE driver * @ctx: The TEE context for shared memory allocation * @size: Shared memory allocation size - * @returns a pointer to 'struct tee_shm' on success or an ERR_PTR on failure + * @returns: a pointer to 'struct tee_shm' on success or an ERR_PTR on failure */ struct tee_shm *tee_shm_alloc_priv_buf(struct tee_context *ctx, size_t size); @@ -354,7 +356,7 @@ void tee_dyn_shm_free_helper(struct tee_shm *shm, /** * tee_shm_is_dynamic() - Check if shared memory object is of the dynamic kind * @shm: Shared memory handle - * @returns true if object is dynamic shared memory + * @returns: true if object is dynamic shared memory */ static inline bool tee_shm_is_dynamic(struct tee_shm *shm) { @@ -370,7 +372,7 @@ void tee_shm_put(struct tee_shm *shm); /** * tee_shm_get_id() - Get id of a shared memory object * @shm: Shared memory handle - * @returns id + * @returns: id */ static inline int tee_shm_get_id(struct tee_shm *shm) { @@ -382,7 +384,7 @@ static inline int tee_shm_get_id(struct tee_shm *shm) * count * @ctx: Context owning the shared memory * @id: Id of shared memory object - * @returns a pointer to 'struct tee_shm' on success or an ERR_PTR on failure + * @returns: a pointer to 'struct tee_shm' on success or an ERR_PTR on failure */ struct tee_shm *tee_shm_get_from_id(struct tee_context *ctx, int id); @@ -402,7 +404,7 @@ static inline bool tee_param_is_memref(struct tee_param *param) * teedev_open() - Open a struct tee_device * @teedev: Device to open * - * @return a pointer to struct tee_context on success or an ERR_PTR on failure. + * @returns: pointer to struct tee_context on success or an ERR_PTR on failure. */ struct tee_context *teedev_open(struct tee_device *teedev); -- cgit v1.2.3 From fe2511adb1fc1814df06ca11e0d8a92f792e4029 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 1 Mar 2026 13:30:17 +0100 Subject: sysfs: constify group arrays in function arguments Constify the groups array argument where applicable. This allows to pass constant arrays as arguments. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/17035265-8882-4101-b7a7-16b3eb94f8b5@gmail.com Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/group.c | 10 +++++----- include/linux/sysfs.h | 16 ++++++++-------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index e1e639f515a0..b3edae0578c0 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -217,7 +217,7 @@ int sysfs_create_group(struct kobject *kobj, EXPORT_SYMBOL_GPL(sysfs_create_group); static int internal_create_groups(struct kobject *kobj, int update, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { int error = 0; int i; @@ -250,7 +250,7 @@ static int internal_create_groups(struct kobject *kobj, int update, * Returns 0 on success or error code from sysfs_create_group on failure. */ int sysfs_create_groups(struct kobject *kobj, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { return internal_create_groups(kobj, 0, groups); } @@ -268,7 +268,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_groups); * Returns 0 on success or error code from sysfs_update_group on failure. */ int sysfs_update_groups(struct kobject *kobj, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { return internal_create_groups(kobj, 1, groups); } @@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(sysfs_remove_group); * If groups is not NULL, remove the specified groups from the kobject. */ void sysfs_remove_groups(struct kobject *kobj, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { int i; @@ -613,7 +613,7 @@ EXPORT_SYMBOL_GPL(sysfs_group_change_owner); * Returns 0 on success or error code on failure. */ int sysfs_groups_change_owner(struct kobject *kobj, - const struct attribute_group **groups, + const struct attribute_group *const *groups, kuid_t kuid, kgid_t kgid) { int error = 0, i; diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 99b775f3ff46..9777e9445dd5 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -445,15 +445,15 @@ void sysfs_delete_link(struct kobject *dir, struct kobject *targ, int __must_check sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp); int __must_check sysfs_create_groups(struct kobject *kobj, - const struct attribute_group **groups); + const struct attribute_group *const *groups); int __must_check sysfs_update_groups(struct kobject *kobj, - const struct attribute_group **groups); + const struct attribute_group *const *groups); int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_groups(struct kobject *kobj, - const struct attribute_group **groups); + const struct attribute_group *const *groups); int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group); void sysfs_remove_file_from_group(struct kobject *kobj, @@ -486,7 +486,7 @@ int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid); int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_groups_change_owner(struct kobject *kobj, - const struct attribute_group **groups, + const struct attribute_group *const *groups, kuid_t kuid, kgid_t kgid); int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, @@ -629,13 +629,13 @@ static inline int sysfs_create_group(struct kobject *kobj, } static inline int sysfs_create_groups(struct kobject *kobj, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { return 0; } static inline int sysfs_update_groups(struct kobject *kobj, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { return 0; } @@ -652,7 +652,7 @@ static inline void sysfs_remove_group(struct kobject *kobj, } static inline void sysfs_remove_groups(struct kobject *kobj, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { } @@ -733,7 +733,7 @@ static inline int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t k } static inline int sysfs_groups_change_owner(struct kobject *kobj, - const struct attribute_group **groups, + const struct attribute_group *const *groups, kuid_t kuid, kgid_t kgid) { return 0; -- cgit v1.2.3 From ece5283706aff6791a37894bafbb0c134a94c0f3 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 1 Mar 2026 13:31:02 +0100 Subject: driver: core: constify groups array argument in device_add_groups and device_remove_groups Now that sysfs_create_groups() and sysfs_remove_groups() allow to pass constant groups arrays, we can constify the groups array argument also here. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/8ea2d6d1-0adb-4d7f-92bc-751e93ce08d6@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 5 +++-- include/linux/device.h | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 791f9e444df8..f497b724332a 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2831,14 +2831,15 @@ static ssize_t removable_show(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RO(removable); -int device_add_groups(struct device *dev, const struct attribute_group **groups) +int device_add_groups(struct device *dev, + const struct attribute_group *const *groups) { return sysfs_create_groups(&dev->kobj, groups); } EXPORT_SYMBOL_GPL(device_add_groups); void device_remove_groups(struct device *dev, - const struct attribute_group **groups) + const struct attribute_group *const *groups) { sysfs_remove_groups(&dev->kobj, groups); } diff --git a/include/linux/device.h b/include/linux/device.h index 0be95294b6e6..48a0444ccc1e 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1131,9 +1131,9 @@ device_create_with_groups(const struct class *cls, struct device *parent, dev_t void device_destroy(const struct class *cls, dev_t devt); int __must_check device_add_groups(struct device *dev, - const struct attribute_group **groups); + const struct attribute_group *const *groups); void device_remove_groups(struct device *dev, - const struct attribute_group **groups); + const struct attribute_group *const *groups); static inline int __must_check device_add_group(struct device *dev, const struct attribute_group *grp) -- cgit v1.2.3 From 10f874dc92b3f3bf96470d997bdf157b289c9d4c Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 1 Mar 2026 13:31:56 +0100 Subject: driver core: make struct class groups members constant arrays Constify the groups arrays, allowing to assign constant arrays. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/7ff56b07-09ca-4948-b98f-5bd37ceef21e@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/device/class.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/device/class.h b/include/linux/device/class.h index 65880e60c720..2079239a5aa5 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -50,8 +50,8 @@ struct fwnode_handle; struct class { const char *name; - const struct attribute_group **class_groups; - const struct attribute_group **dev_groups; + const struct attribute_group *const *class_groups; + const struct attribute_group *const *dev_groups; int (*dev_uevent)(const struct device *dev, struct kobj_uevent_env *env); char *(*devnode)(const struct device *dev, umode_t *mode); -- cgit v1.2.3 From cb0caadb64ca0894c4a24e1a34841f260d462f90 Mon Sep 17 00:00:00 2001 From: Shayne Chen Date: Fri, 13 Mar 2026 14:21:49 +0800 Subject: wifi: ieee80211: fix definition of EHT-MCS 15 in MRU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the definition in IEEE Std 802.11be-2024, Table 9-417r, each bit indicates support for the transmission and reception of EHT-MCS 15 in: - B0: 52+26-tone and 106+26-tone MRUs. - B1: a 484+242-tone MRU if 80 MHz is supported. - B2: a 996+484-tone MRU and a 996+484+242-tone MRU if 160 MHz is supported. - B3: a 3×996-tone MRU if 320 MHz is supported. Fixes: 6239da18d2f9 ("wifi: mac80211: adjust EHT capa when lowering bandwidth") Signed-off-by: Shayne Chen Link: https://patch.msgid.link/20260313062150.3165433-1-shayne.chen@mediatek.com Signed-off-by: Johannes Berg --- include/linux/ieee80211-eht.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211-eht.h b/include/linux/ieee80211-eht.h index f8e9f5d36d2a..a97b1d01f3ac 100644 --- a/include/linux/ieee80211-eht.h +++ b/include/linux/ieee80211-eht.h @@ -251,8 +251,8 @@ struct ieee80211_eht_operation_info { #define IEEE80211_EHT_PHY_CAP5_SUPP_EXTRA_EHT_LTF 0x40 #define IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK 0x07 -#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ 0x08 -#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ 0x30 +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ 0x10 +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ 0x20 #define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ 0x40 #define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK 0x78 #define IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP 0x80 -- cgit v1.2.3 From 7caedbb5ade345df0eec0bf01035c780919a9f56 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 9 Mar 2026 13:37:02 -0700 Subject: integrity: Eliminate weak definition of arch_get_secureboot() security/integrity/secure_boot.c contains a single __weak function, which breaks recordmcount when building with clang: $ make -skj"$(nproc)" ARCH=powerpc LLVM=1 ppc64_defconfig security/integrity/secure_boot.o Cannot find symbol for section 2: .text. security/integrity/secure_boot.o: failed Introduce a Kconfig symbol, CONFIG_HAVE_ARCH_GET_SECUREBOOT, to indicate that an architecture provides a definition of arch_get_secureboot(). Provide a static inline stub when this symbol is not defined to achieve the same effect as the __weak function, allowing secure_boot.c to be removed altogether. Move the s390 definition of arch_get_secureboot() out of the CONFIG_KEXEC_FILE block to ensure it is always available, as it does not actually depend on KEXEC_FILE. Reported-by: Arnd Bergmann Fixes: 31a6a07eefeb ("integrity: Make arch_ima_get_secureboot integrity-wide") Signed-off-by: Nathan Chancellor Acked-by: Arnd Bergmann Signed-off-by: Mimi Zohar --- arch/Kconfig | 3 +++ arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/s390/kernel/ipl.c | 10 +++++----- include/linux/secure_boot.h | 4 ++++ security/integrity/Makefile | 2 +- security/integrity/secure_boot.c | 16 ---------------- 7 files changed, 15 insertions(+), 22 deletions(-) delete mode 100644 security/integrity/secure_boot.c (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 102ddbd4298e..a6d1c8cc1d64 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1841,4 +1841,7 @@ config ARCH_WANTS_PRE_LINK_VMLINUX config ARCH_HAS_CPU_ATTACK_VECTORS bool +config HAVE_ARCH_GET_SECUREBOOT + def_bool EFI + endmenu diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index ad7a2fe63a2a..da1eafb64354 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -1061,6 +1061,7 @@ config PPC_SECURE_BOOT depends on IMA_ARCH_POLICY imply IMA_SECURE_AND_OR_TRUSTED_BOOT select PSERIES_PLPKS if PPC_PSERIES + select HAVE_ARCH_GET_SECUREBOOT help Systems with firmware secure boot enabled need to define security policies to extend secure boot to the OS. This config allows a user diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 2101cc738b5e..4197c20d34b4 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -181,6 +181,7 @@ config S390 select GENERIC_IOREMAP if PCI select HAVE_ALIGNED_STRUCT_PAGE select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_GET_SECUREBOOT select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 2d01a1713938..3c346b02ceb9 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -2388,6 +2388,11 @@ void __no_stack_protector s390_reset_system(void) diag_amode31_ops.diag308_reset(); } +bool arch_get_secureboot(void) +{ + return ipl_secure_flag; +} + #ifdef CONFIG_KEXEC_FILE int ipl_report_add_component(struct ipl_report *report, struct kexec_buf *kbuf, @@ -2505,11 +2510,6 @@ out: return buf; } -bool arch_get_secureboot(void) -{ - return ipl_secure_flag; -} - int ipl_report_free(struct ipl_report *report) { struct ipl_report_component *comp, *ncomp; diff --git a/include/linux/secure_boot.h b/include/linux/secure_boot.h index 3ded3f03655c..d17e92351567 100644 --- a/include/linux/secure_boot.h +++ b/include/linux/secure_boot.h @@ -10,10 +10,14 @@ #include +#ifdef CONFIG_HAVE_ARCH_GET_SECUREBOOT /* * Returns true if the platform secure boot is enabled. * Returns false if disabled or not supported. */ bool arch_get_secureboot(void); +#else +static inline bool arch_get_secureboot(void) { return false; } +#endif #endif /* _LINUX_SECURE_BOOT_H */ diff --git a/security/integrity/Makefile b/security/integrity/Makefile index 548665e2b702..45dfdedbdad4 100644 --- a/security/integrity/Makefile +++ b/security/integrity/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_INTEGRITY) += integrity.o -integrity-y := iint.o secure_boot.o +integrity-y := iint.o integrity-$(CONFIG_INTEGRITY_AUDIT) += integrity_audit.o integrity-$(CONFIG_INTEGRITY_SIGNATURE) += digsig.o integrity-$(CONFIG_INTEGRITY_ASYMMETRIC_KEYS) += digsig_asymmetric.o diff --git a/security/integrity/secure_boot.c b/security/integrity/secure_boot.c deleted file mode 100644 index fc2693c286f8..000000000000 --- a/security/integrity/secure_boot.c +++ /dev/null @@ -1,16 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2026 Red Hat, Inc. All Rights Reserved. - * - * Author: Coiby Xu - */ -#include - -/* - * Default weak implementation. - * Architectures that support secure boot must override this. - */ -__weak bool arch_get_secureboot(void) -{ - return false; -} -- cgit v1.2.3 From 97e6fabee5dcb2d86d4ff45f20606b8a73181f74 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 13 Mar 2026 13:53:03 +0100 Subject: driver core: auxiliary bus: Introduce dev_is_auxiliary() Introduce dev_is_auxiliary() in analogy with dev_is_platform() to facilitate subsequent changes. Signed-off-by: Rafael J. Wysocki Reviewed-by: Danilo Krummrich Reviewed-by: Greg Kroah-Hartman Link: https://patch.msgid.link/5079467.GXAFRqVoOG@rafael.j.wysocki --- drivers/base/auxiliary.c | 10 ++++++++++ include/linux/auxiliary_bus.h | 2 ++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index 9fd3820d1f8a..11949d6bcda4 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -502,6 +502,16 @@ struct auxiliary_device *__devm_auxiliary_device_create(struct device *dev, } EXPORT_SYMBOL_GPL(__devm_auxiliary_device_create); +/** + * dev_is_auxiliary - check if the device is an auxiliary one + * @dev: device to check + */ +bool dev_is_auxiliary(struct device *dev) +{ + return dev->bus == &auxiliary_bus_type; +} +EXPORT_SYMBOL_GPL(dev_is_auxiliary); + void __init auxiliary_bus_init(void) { WARN_ON(bus_register(&auxiliary_bus_type)); diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index 4086afd0cc6b..bc09b55e3682 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -271,6 +271,8 @@ struct auxiliary_device *__devm_auxiliary_device_create(struct device *dev, __devm_auxiliary_device_create(dev, KBUILD_MODNAME, devname, \ platform_data, 0) +bool dev_is_auxiliary(struct device *dev); + /** * module_auxiliary_driver() - Helper macro for registering an auxiliary driver * @__auxiliary_driver: auxiliary driver struct -- cgit v1.2.3 From 98d709cba3193f0bec54da4cd76ef499ea2f1ef7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Mar 2026 09:43:22 -1000 Subject: sched_ext: Implement SCX_ENQ_IMMED Add SCX_ENQ_IMMED enqueue flag for local DSQ insertions. Once a task is dispatched with IMMED, it either gets on the CPU immediately and stays on it, or gets reenqueued back to the BPF scheduler. It will never linger on a local DSQ behind other tasks or on a CPU taken by a higher-priority class. rq_is_open() uses rq->next_class to determine whether the rq is available, and wakeup_preempt_scx() triggers reenqueue when a higher-priority class task arrives. These capture all higher class preemptions. Combined with reenqueue points in the dispatch path, all cases where an IMMED task would not execute immediately are covered. SCX_TASK_IMMED persists in p->scx.flags until the next fresh enqueue, so the guarantee survives SAVE/RESTORE cycles. If preempted while running, put_prev_task_scx() reenqueues through ops.enqueue() with SCX_TASK_REENQ_PREEMPTED instead of silently placing the task back on the local DSQ. This enables tighter scheduling latency control by preventing tasks from piling up on local DSQs. It also enables opportunistic CPU sharing across sub-schedulers - without this, a sub-scheduler can stuff the local DSQ of a shared CPU, making it difficult for others to use. v2: - Rewrite is_curr_done() as rq_is_open() using rq->next_class and implement wakeup_preempt_scx() to achieve complete coverage of all cases where IMMED tasks could get stranded. - Track IMMED persistently in p->scx.flags and reenqueue preempted-while-running tasks through ops.enqueue(). - Bound deferred reenq cycles (SCX_REENQ_LOCAL_MAX_REPEAT). - Misc renames, documentation. Signed-off-by: Tejun Heo Reviewed-by: Andrea Righi --- include/linux/sched/ext.h | 5 + kernel/sched/ext.c | 271 ++++++++++++++++++++++++++++--- kernel/sched/ext_internal.h | 47 ++++++ kernel/sched/sched.h | 2 + tools/sched_ext/include/scx/compat.bpf.h | 5 + 5 files changed, 311 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 60a4f65d0174..602dc83cab36 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -100,6 +100,7 @@ enum scx_ent_flags { SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ SCX_TASK_SUB_INIT = 1 << 4, /* task being initialized for a sub sched */ + SCX_TASK_IMMED = 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */ /* * Bits 8 and 9 are used to carry task state: @@ -125,6 +126,8 @@ enum scx_ent_flags { * * NONE not being reenqueued * KFUNC reenqueued by scx_bpf_dsq_reenq() and friends + * IMMED reenqueued due to failed ENQ_IMMED + * PREEMPTED preempted while running */ SCX_TASK_REENQ_REASON_SHIFT = 12, SCX_TASK_REENQ_REASON_BITS = 2, @@ -132,6 +135,8 @@ enum scx_ent_flags { SCX_TASK_REENQ_NONE = 0 << SCX_TASK_REENQ_REASON_SHIFT, SCX_TASK_REENQ_KFUNC = 1 << SCX_TASK_REENQ_REASON_SHIFT, + SCX_TASK_REENQ_IMMED = 2 << SCX_TASK_REENQ_REASON_SHIFT, + SCX_TASK_REENQ_PREEMPTED = 3 << SCX_TASK_REENQ_REASON_SHIFT, /* iteration cursor, not a task */ SCX_TASK_CURSOR = 1 << 31, diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2f59265b9b57..c75c35b67a18 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -406,6 +406,62 @@ static bool bypass_dsp_enabled(struct scx_sched *sch) return unlikely(atomic_read(&sch->bypass_dsp_enable_depth)); } +/** + * rq_is_open - Is the rq available for immediate execution of an SCX task? + * @rq: rq to test + * @enq_flags: optional %SCX_ENQ_* of the task being enqueued + * + * Returns %true if @rq is currently open for executing an SCX task. After a + * %false return, @rq is guaranteed to invoke SCX dispatch path at least once + * before going to idle and not inserting a task into @rq's local DSQ after a + * %false return doesn't cause @rq to stall. + */ +static bool rq_is_open(struct rq *rq, u64 enq_flags) +{ + lockdep_assert_rq_held(rq); + + /* + * A higher-priority class task is either running or in the process of + * waking up on @rq. + */ + if (sched_class_above(rq->next_class, &ext_sched_class)) + return false; + + /* + * @rq is either in transition to or in idle and there is no + * higher-priority class task waking up on it. + */ + if (sched_class_above(&ext_sched_class, rq->next_class)) + return true; + + /* + * @rq is either picking, in transition to, or running an SCX task. + */ + + /* + * If we're in the dispatch path holding rq lock, $curr may or may not + * be ready depending on whether the on-going dispatch decides to extend + * $curr's slice. We say yes here and resolve it at the end of dispatch. + * See balance_one(). + */ + if (rq->scx.flags & SCX_RQ_IN_BALANCE) + return true; + + /* + * %SCX_ENQ_PREEMPT clears $curr's slice if on SCX and kicks dispatch, + * so allow it to avoid spuriously triggering reenq on a combined + * PREEMPT|IMMED insertion. + */ + if (enq_flags & SCX_ENQ_PREEMPT) + return true; + + /* + * @rq is either in transition to or running an SCX task and can't go + * idle without another SCX dispatch cycle. + */ + return false; +} + /* * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate @@ -1220,6 +1276,16 @@ static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq } } +static void schedule_reenq_local(struct rq *rq, u64 reenq_flags) +{ + struct scx_sched *root = rcu_dereference_sched(scx_root); + + if (WARN_ON_ONCE(!root)) + return; + + schedule_dsq_reenq(root, &rq->scx.local_dsq, reenq_flags); +} + /** * touch_core_sched - Update timestamp used for core-sched task ordering * @rq: rq to read clock from, must be locked @@ -1296,10 +1362,58 @@ static bool scx_dsq_priq_less(struct rb_node *node_a, return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); } -static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) +static void dsq_inc_nr(struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags) { /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ - WRITE_ONCE(dsq->nr, dsq->nr + delta); + WRITE_ONCE(dsq->nr, dsq->nr + 1); + + /* + * Once @p reaches a local DSQ, it can only leave it by being dispatched + * to the CPU or dequeued. In both cases, the only way @p can go back to + * the BPF sched is through enqueueing. If being inserted into a local + * DSQ with IMMED, persist the state until the next enqueueing event in + * do_enqueue_task() so that we can maintain IMMED protection through + * e.g. SAVE/RESTORE cycles and slice extensions. + */ + if (enq_flags & SCX_ENQ_IMMED) { + if (unlikely(dsq->id != SCX_DSQ_LOCAL)) { + WARN_ON_ONCE(!(enq_flags & SCX_ENQ_GDSQ_FALLBACK)); + return; + } + p->scx.flags |= SCX_TASK_IMMED; + } + + if (p->scx.flags & SCX_TASK_IMMED) { + struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + + if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) + return; + + rq->scx.nr_immed++; + + /* + * If @rq already had other tasks or the current task is not + * done yet, @p can't go on the CPU immediately. Re-enqueue. + */ + if (unlikely(dsq->nr > 1 || !rq_is_open(rq, enq_flags))) + schedule_reenq_local(rq, 0); + } +} + +static void dsq_dec_nr(struct scx_dispatch_q *dsq, struct task_struct *p) +{ + /* see dsq_inc_nr() */ + WRITE_ONCE(dsq->nr, dsq->nr - 1); + + if (p->scx.flags & SCX_TASK_IMMED) { + struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + + if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL) || + WARN_ON_ONCE(rq->scx.nr_immed <= 0)) + return; + + rq->scx.nr_immed--; + } } static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) @@ -1458,7 +1572,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, WRITE_ONCE(dsq->seq, dsq->seq + 1); p->scx.dsq_seq = dsq->seq; - dsq_mod_nr(dsq, 1); + dsq_inc_nr(dsq, p, enq_flags); p->scx.dsq = dsq; /* @@ -1512,7 +1626,7 @@ static void task_unlink_from_dsq(struct task_struct *p, } list_del_init(&p->scx.dsq_list.node); - dsq_mod_nr(dsq, -1); + dsq_dec_nr(dsq, p); if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) { struct task_struct *first_task; @@ -1723,10 +1837,18 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); - /* rq migration */ + /* internal movements - rq migration / RESTORE */ if (sticky_cpu == cpu_of(rq)) goto local_norefill; + /* + * Clear persistent TASK_IMMED for fresh enqueues, see dsq_inc_nr(). + * Note that exiting and migration-disabled tasks that skip + * ops.enqueue() below will lose IMMED protection unless + * %SCX_OPS_ENQ_EXITING / %SCX_OPS_ENQ_MIGRATION_DISABLED are set. + */ + p->scx.flags &= ~SCX_TASK_IMMED; + /* * If !scx_rq_online(), we already told the BPF scheduler that the CPU * is offline and are just running the hotplug path. Don't bother the @@ -2032,6 +2154,30 @@ static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) return false; } +static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) +{ + /* + * Preemption between SCX tasks is implemented by resetting the victim + * task's slice to 0 and triggering reschedule on the target CPU. + * Nothing to do. + */ + if (p->sched_class == &ext_sched_class) + return; + + /* + * Getting preempted by a higher-priority class. Reenqueue IMMED tasks. + * This captures all preemption cases including: + * + * - A SCX task is currently running. + * + * - @rq is waking from idle due to a SCX task waking to it. + * + * - A higher-priority wakes up while SCX dispatch is in progress. + */ + if (rq->scx.nr_immed) + schedule_reenq_local(rq, 0); +} + static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct scx_dispatch_q *src_dsq, struct rq *dst_rq) @@ -2049,7 +2195,7 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, else list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list); - dsq_mod_nr(dst_dsq, 1); + dsq_inc_nr(dst_dsq, p, enq_flags); p->scx.dsq = dst_dsq; local_dsq_post_enq(dst_dsq, p, enq_flags); @@ -2257,6 +2403,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { dst_dsq = find_global_dsq(sch, task_cpu(p)); dst_rq = src_rq; + enq_flags |= SCX_ENQ_GDSQ_FALLBACK; } } else { /* no need to migrate if destination is a non-local DSQ */ @@ -2385,7 +2532,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, if (src_rq != dst_rq && unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { dispatch_enqueue(sch, rq, find_global_dsq(sch, task_cpu(p)), p, - enq_flags | SCX_ENQ_CLEAR_OPSS); + enq_flags | SCX_ENQ_CLEAR_OPSS | SCX_ENQ_GDSQ_FALLBACK); return; } @@ -2738,6 +2885,19 @@ static int balance_one(struct rq *rq, struct task_struct *prev) return false; has_tasks: + /* + * @rq may have extra IMMED tasks without reenq scheduled: + * + * - rq_is_open() can't reliably tell when and how slice is going to be + * modified for $curr and allows IMMED tasks to be queued while + * dispatch is in progress. + * + * - A non-IMMED HEAD task can get queued in front of an IMMED task + * between the IMMED queueing and the subsequent scheduling event. + */ + if (unlikely(rq->scx.local_dsq.nr > 1 && rq->scx.nr_immed)) + schedule_reenq_local(rq, 0); + rq->scx.flags &= ~SCX_RQ_IN_BALANCE; return true; } @@ -2859,11 +3019,17 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, * If @p has slice left and is being put, @p is getting * preempted by a higher priority scheduler class or core-sched * forcing a different task. Leave it at the head of the local - * DSQ. + * DSQ unless it was an IMMED task. IMMED tasks should not + * linger on a busy CPU, reenqueue them to the BPF scheduler. */ if (p->scx.slice && !scx_bypassing(sch, cpu_of(rq))) { - dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, - SCX_ENQ_HEAD); + if (p->scx.flags & SCX_TASK_IMMED) { + p->scx.flags |= SCX_TASK_REENQ_PREEMPTED; + do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + } else { + dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, SCX_ENQ_HEAD); + } goto switch_class; } @@ -3682,8 +3848,6 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p) scx_disable_task(scx_task_sched(p), p); } -static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {} - static void switched_to_scx(struct rq *rq, struct task_struct *p) {} int scx_check_setscheduler(struct task_struct *p, int policy) @@ -3725,9 +3889,45 @@ static void process_ddsp_deferred_locals(struct rq *rq) } } +/* + * Determine whether @p should be reenqueued from a local DSQ. + * + * @reenq_flags is mutable and accumulates state across the DSQ walk: + * + * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First" + * tracks position in the DSQ list, not among IMMED tasks. A non-IMMED task at + * the head consumes the first slot. + * + * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if + * rq_is_open() is true. + * + * An IMMED task is kept (returns %false) only if it's the first task in the DSQ + * AND the current task is done — i.e. it will execute immediately. All other + * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head, + * every IMMED task behind it gets reenqueued. + * + * Reenqueued tasks go through ops.enqueue() with %SCX_ENQ_REENQ | + * %SCX_TASK_REENQ_IMMED. If the BPF scheduler dispatches back to the same local + * DSQ with %SCX_ENQ_IMMED while the CPU is still unavailable, this triggers + * another reenq cycle. Repetitions are bounded by %SCX_REENQ_LOCAL_MAX_REPEAT + * in process_deferred_reenq_locals(). + */ static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason) { + bool first; + + first = !(*reenq_flags & SCX_REENQ_TSR_NOT_FIRST); + *reenq_flags |= SCX_REENQ_TSR_NOT_FIRST; + *reason = SCX_TASK_REENQ_KFUNC; + + if ((p->scx.flags & SCX_TASK_IMMED) && + (!first || !(*reenq_flags & SCX_REENQ_TSR_RQ_OPEN))) { + __scx_add_event(scx_task_sched(p), SCX_EV_REENQ_IMMED, 1); + *reason = SCX_TASK_REENQ_IMMED; + return true; + } + return *reenq_flags & SCX_REENQ_ANY; } @@ -3739,6 +3939,11 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags) lockdep_assert_rq_held(rq); + if (WARN_ON_ONCE(reenq_flags & __SCX_REENQ_TSR_MASK)) + reenq_flags &= ~__SCX_REENQ_TSR_MASK; + if (rq_is_open(rq, 0)) + reenq_flags |= SCX_REENQ_TSR_RQ_OPEN; + /* * The BPF scheduler may choose to dispatch tasks back to * @rq->scx.local_dsq. Move all candidate tasks off to a private list @@ -3792,11 +3997,14 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags) static void process_deferred_reenq_locals(struct rq *rq) { + u64 seq = ++rq->scx.deferred_reenq_locals_seq; + lockdep_assert_rq_held(rq); while (true) { struct scx_sched *sch; u64 reenq_flags; + bool skip = false; scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { struct scx_deferred_reenq_local *drl = @@ -3811,15 +4019,31 @@ static void process_deferred_reenq_locals(struct rq *rq) sch_pcpu = container_of(drl, struct scx_sched_pcpu, deferred_reenq_local); sch = sch_pcpu->sch; + reenq_flags = drl->flags; WRITE_ONCE(drl->flags, 0); list_del_init(&drl->node); + + if (likely(drl->seq != seq)) { + drl->seq = seq; + drl->cnt = 0; + } else { + if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) { + scx_error(sch, "SCX_ENQ_REENQ on SCX_DSQ_LOCAL repeated %u times", + drl->cnt); + skip = true; + } + + __scx_add_event(sch, SCX_EV_REENQ_LOCAL_REPEAT, 1); + } } - /* see schedule_dsq_reenq() */ - smp_mb(); + if (!skip) { + /* see schedule_dsq_reenq() */ + smp_mb(); - reenq_local(sch, rq, reenq_flags); + reenq_local(sch, rq, reenq_flags); + } } } @@ -4208,10 +4432,6 @@ static void scx_cgroup_unlock(void) {} /* * Omitted operations: * - * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task - * isn't tied to the CPU at that point. Preemption is implemented by resetting - * the victim task's slice to 0 and triggering reschedule on the target CPU. - * * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. * * - task_fork/dead: We need fork/dead notifications for all tasks regardless of @@ -4580,6 +4800,8 @@ static ssize_t scx_attr_events_show(struct kobject *kobj, at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_IMMED); + at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_LOCAL_REPEAT); at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL); at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); @@ -6019,6 +6241,8 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_dump_event(s, &events, SCX_EV_REENQ_IMMED); + scx_dump_event(s, &events, SCX_EV_REENQ_LOCAL_REPEAT); scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL); scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); @@ -7532,6 +7756,13 @@ void __init init_sched_ext_class(void) */ static bool scx_vet_enq_flags(struct scx_sched *sch, u64 dsq_id, u64 enq_flags) { + if ((enq_flags & SCX_ENQ_IMMED) && + unlikely(dsq_id != SCX_DSQ_LOCAL && + (dsq_id & SCX_DSQ_LOCAL_ON) != SCX_DSQ_LOCAL_ON)) { + scx_error(sch, "SCX_ENQ_IMMED on a non-local DSQ 0x%llx", dsq_id); + return false; + } + return true; } @@ -9101,6 +9332,8 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING); scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_agg_event(events, e_cpu, SCX_EV_REENQ_IMMED); + scx_agg_event(events, e_cpu, SCX_EV_REENQ_LOCAL_REPEAT); scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL); scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION); scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH); diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index c78dadaadab8..2ef855f7c861 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -31,6 +31,8 @@ enum scx_consts { SCX_BYPASS_LB_MIN_DELTA_DIV = 4, SCX_BYPASS_LB_BATCH = 256, + SCX_REENQ_LOCAL_MAX_REPEAT = 256, + SCX_SUB_MAX_DEPTH = 4, }; @@ -887,6 +889,24 @@ struct scx_event_stats { */ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; + /* + * The number of times a task, enqueued on a local DSQ with + * SCX_ENQ_IMMED, was re-enqueued because the CPU was not available for + * immediate execution. + */ + s64 SCX_EV_REENQ_IMMED; + + /* + * The number of times a reenq of local DSQ caused another reenq of + * local DSQ. This can happen when %SCX_ENQ_IMMED races against a higher + * priority class task even if the BPF scheduler always satisfies the + * prerequisites for %SCX_ENQ_IMMED at the time of enqueue. However, + * that scenario is very unlikely and this count going up regularly + * indicates that the BPF scheduler is handling %SCX_ENQ_REENQ + * incorrectly causing recursive reenqueues. + */ + s64 SCX_EV_REENQ_LOCAL_REPEAT; + /* * Total number of times a task's time slice was refilled with the * default value (SCX_SLICE_DFL). @@ -951,6 +971,8 @@ struct scx_dsp_ctx { struct scx_deferred_reenq_local { struct list_head node; u64 flags; + u64 seq; + u32 cnt; }; struct scx_sched_pcpu { @@ -1074,6 +1096,24 @@ enum scx_enq_flags { */ SCX_ENQ_PREEMPT = 1LLU << 32, + /* + * Only allowed on local DSQs. Guarantees that the task either gets + * on the CPU immediately and stays on it, or gets reenqueued back + * to the BPF scheduler. It will never linger on a local DSQ or be + * silently put back after preemption. + * + * The protection persists until the next fresh enqueue - it + * survives SAVE/RESTORE cycles, slice extensions and preemption. + * If the task can't stay on the CPU for any reason, it gets + * reenqueued back to the BPF scheduler. + * + * Exiting and migration-disabled tasks bypass ops.enqueue() and + * are placed directly on a local DSQ without IMMED protection + * unless %SCX_OPS_ENQ_EXITING and %SCX_OPS_ENQ_MIGRATION_DISABLED + * are set respectively. + */ + SCX_ENQ_IMMED = 1LLU << 33, + /* * The task being enqueued was previously enqueued on a DSQ, but was * removed and is being re-enqueued. See SCX_TASK_REENQ_* flags to find @@ -1098,6 +1138,7 @@ enum scx_enq_flags { SCX_ENQ_CLEAR_OPSS = 1LLU << 56, SCX_ENQ_DSQ_PRIQ = 1LLU << 57, SCX_ENQ_NESTED = 1LLU << 58, + SCX_ENQ_GDSQ_FALLBACK = 1LLU << 59, /* fell back to global DSQ */ }; enum scx_deq_flags { @@ -1127,6 +1168,12 @@ enum scx_reenq_flags { __SCX_REENQ_FILTER_MASK = 0xffffLLU, __SCX_REENQ_USER_MASK = SCX_REENQ_ANY, + + /* bits 32-35 used by task_should_reenq() */ + SCX_REENQ_TSR_RQ_OPEN = 1LLU << 32, + SCX_REENQ_TSR_NOT_FIRST = 1LLU << 33, + + __SCX_REENQ_TSR_MASK = 0xfLLU << 32, }; enum scx_pick_idle_cpu_flags { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 60627119d0ab..5b93f6190d31 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -800,6 +800,7 @@ struct scx_rq { u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */ bool cpu_released; u32 flags; + u32 nr_immed; /* ENQ_IMMED tasks on local_dsq */ u64 clock; /* current per-rq clock -- see scx_bpf_now() */ cpumask_var_t cpus_to_kick; cpumask_var_t cpus_to_kick_if_idle; @@ -810,6 +811,7 @@ struct scx_rq { struct task_struct *sub_dispatch_prev; raw_spinlock_t deferred_reenq_lock; + u64 deferred_reenq_locals_seq; struct list_head deferred_reenq_locals; /* scheds requesting reenq of local DSQ */ struct list_head deferred_reenq_users; /* user DSQs requesting reenq */ struct balance_callback deferred_bal_cb; diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index 704728864d83..cba37432eec0 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -404,6 +404,11 @@ static inline void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags) scx_bpf_error("kernel too old to reenqueue foreign local or user DSQs"); } +/* + * v7.1: %SCX_ENQ_IMMED. + */ +#define SCX_ENQ_IMMED __COMPAT_ENUM_OR_ZERO(enum scx_enq_flags, SCX_ENQ_IMMED) + /* * Define sched_ext_ops. This may be expanded to define multiple variants for * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). -- cgit v1.2.3 From 82b6c1b542ea0530318c6f2a880d884eb4dce49f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 2 Mar 2026 17:29:05 +0100 Subject: of: Add of_machine_get_match() helper Currently, there are two helpers to match the root compatible value against an of_device_id array: - of_machine_device_match() returns true if a match is found, - of_machine_get_match_data() returns the match data if a match is found. However, there is no helper that returns the actual of_device_id structure corresponding to the match, leading to code duplication in various drivers. Fix this by reworking of_machine_device_match() to return the actual match structure, and renaming it to of_machine_get_match(). Retain the old of_machine_device_match() functionality using a cheap static inline wrapper around the new of_machine_get_match() helper. Signed-off-by: Geert Uytterhoeven Acked-by: Viresh Kumar Link: https://patch.msgid.link/14e1c03d443b1a5f210609ec3a1ebbaeab8fb3d9.1772468323.git.geert+renesas@glider.be Signed-off-by: Rob Herring (Arm) --- drivers/of/base.c | 11 +++++------ include/linux/of.h | 11 ++++++++--- 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/base.c b/drivers/of/base.c index 57420806c1a2..2a01d2a66eed 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -435,13 +435,12 @@ bool of_machine_compatible_match(const char *const *compats) EXPORT_SYMBOL(of_machine_compatible_match); /** - * of_machine_device_match - Test root of device tree against a of_device_id array + * of_machine_get_match - Test root of device tree against an of_device_id array * @matches: NULL terminated array of of_device_id match structures to search in * - * Returns true if the root node has any of the given compatible values in its - * compatible property. + * Returns matched entry or NULL */ -bool of_machine_device_match(const struct of_device_id *matches) +const struct of_device_id *of_machine_get_match(const struct of_device_id *matches) { struct device_node *root; const struct of_device_id *match = NULL; @@ -452,9 +451,9 @@ bool of_machine_device_match(const struct of_device_id *matches) of_node_put(root); } - return match != NULL; + return match; } -EXPORT_SYMBOL(of_machine_device_match); +EXPORT_SYMBOL(of_machine_get_match); /** * of_machine_get_match_data - Tell if root of device tree has a matching of_match structure diff --git a/include/linux/of.h b/include/linux/of.h index be6ec4916adf..b4d7d33b0ceb 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -410,7 +410,7 @@ extern int of_alias_get_id(const struct device_node *np, const char *stem); extern int of_alias_get_highest_id(const char *stem); bool of_machine_compatible_match(const char *const *compats); -bool of_machine_device_match(const struct of_device_id *matches); +const struct of_device_id *of_machine_get_match(const struct of_device_id *matches); const void *of_machine_get_match_data(const struct of_device_id *matches); /** @@ -866,9 +866,9 @@ static inline bool of_machine_compatible_match(const char *const *compats) return false; } -static inline bool of_machine_device_match(const struct of_device_id *matches) +static inline const struct of_device_id *of_machine_get_match(const struct of_device_id *matches) { - return false; + return NULL; } static inline const void * @@ -976,6 +976,11 @@ static inline int of_numa_init(void) } #endif +static inline bool of_machine_device_match(const struct of_device_id *matches) +{ + return of_machine_get_match(matches) != NULL; +} + static inline struct device_node *of_find_matching_node( struct device_node *from, const struct of_device_id *matches) -- cgit v1.2.3 From d7eafe655b741dfc241d5b920f6d2cea45b568d9 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sun, 1 Mar 2026 06:13:16 +0800 Subject: dma-mapping: Separate DMA sync issuing and completion waiting Currently, arch_sync_dma_for_cpu and arch_sync_dma_for_device always wait for the completion of each DMA buffer. That is, issuing the DMA sync and waiting for completion is done in a single API call. For scatter-gather lists with multiple entries, this means issuing and waiting is repeated for each entry, which can hurt performance. Architectures like ARM64 may be able to issue all DMA sync operations for all entries first and then wait for completion together. To address this, arch_sync_dma_for_* now batches DMA operations and performs a flush afterward. On ARM64, the flush is implemented with a dsb instruction in arch_sync_dma_flush(). On other architectures, arch_sync_dma_flush() is currently a nop. Cc: Leon Romanovsky Cc: Catalin Marinas Cc: Will Deacon Cc: Marek Szyprowski Cc: Robin Murphy Cc: Ada Couprie Diaz Cc: Ard Biesheuvel Cc: Marc Zyngier Cc: Anshuman Khandual Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Joerg Roedel Cc: Stefano Stabellini Cc: Oleksandr Tyshchenko Cc: Tangquan Zheng Reviewed-by: Juergen Gross # drivers/xen/swiotlb-xen.c Tested-by: Xueyuan Chen Signed-off-by: Barry Song Reviewed-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20260228221316.59934-1-21cnbao@gmail.com --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/cache.h | 5 +++++ arch/arm64/mm/dma-mapping.c | 4 ++-- drivers/iommu/dma-iommu.c | 35 +++++++++++++++++++++++++++-------- drivers/xen/swiotlb-xen.c | 24 ++++++++++++++++-------- include/linux/dma-map-ops.h | 6 ++++++ kernel/dma/Kconfig | 3 +++ kernel/dma/direct.c | 6 +++++- kernel/dma/direct.h | 9 +++++++-- kernel/dma/swiotlb.c | 7 ++++++- 10 files changed, 78 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 38dba5f7e4d2..ceafaac6532c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -55,6 +55,7 @@ config ARM64 select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_HAS_SYNC_DMA_FOR_CPU + select ARCH_HAS_BATCHED_DMA_SYNC select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_ZONE_DMA_SET if EXPERT diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index dd2c8586a725..10a7ffadee3d 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -87,6 +87,11 @@ int cache_line_size(void); #define dma_get_cache_alignment cache_line_size +static inline void arch_sync_dma_flush(void) +{ + dsb(sy); +} + /* Compress a u64 MPIDR value into 32 bits. */ static inline u64 arch_compact_of_hwid(u64 id) { diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index b2b5792b2caa..ae1ae0280eef 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -17,7 +17,7 @@ void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, { unsigned long start = (unsigned long)phys_to_virt(paddr); - dcache_clean_poc(start, start + size); + dcache_clean_poc_nosync(start, start + size); } void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, @@ -28,7 +28,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, if (dir == DMA_TO_DEVICE) return; - dcache_inval_poc(start, start + size); + dcache_inval_poc_nosync(start, start + size); } void arch_dma_prep_coherent(struct page *page, size_t size) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 5dac64be61bb..66fc25bae85b 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1095,8 +1095,10 @@ void iommu_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, return; phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); - if (!dev_is_dma_coherent(dev)) + if (!dev_is_dma_coherent(dev)) { arch_sync_dma_for_cpu(phys, size, dir); + arch_sync_dma_flush(); + } swiotlb_sync_single_for_cpu(dev, phys, size, dir); } @@ -1112,8 +1114,10 @@ void iommu_dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); swiotlb_sync_single_for_device(dev, phys, size, dir); - if (!dev_is_dma_coherent(dev)) + if (!dev_is_dma_coherent(dev)) { arch_sync_dma_for_device(phys, size, dir); + arch_sync_dma_flush(); + } } void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, @@ -1122,13 +1126,15 @@ void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, struct scatterlist *sg; int i; - if (sg_dma_is_swiotlb(sgl)) + if (sg_dma_is_swiotlb(sgl)) { for_each_sg(sgl, sg, nelems, i) iommu_dma_sync_single_for_cpu(dev, sg_dma_address(sg), sg->length, dir); - else if (!dev_is_dma_coherent(dev)) + } else if (!dev_is_dma_coherent(dev)) { for_each_sg(sgl, sg, nelems, i) arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); + arch_sync_dma_flush(); + } } void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, @@ -1137,14 +1143,16 @@ void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, struct scatterlist *sg; int i; - if (sg_dma_is_swiotlb(sgl)) + if (sg_dma_is_swiotlb(sgl)) { for_each_sg(sgl, sg, nelems, i) iommu_dma_sync_single_for_device(dev, sg_dma_address(sg), sg->length, dir); - else if (!dev_is_dma_coherent(dev)) + } else if (!dev_is_dma_coherent(dev)) { for_each_sg(sgl, sg, nelems, i) arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); + arch_sync_dma_flush(); + } } static phys_addr_t iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys, @@ -1219,8 +1227,10 @@ dma_addr_t iommu_dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, return DMA_MAPPING_ERROR; } - if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) + if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) { arch_sync_dma_for_device(phys, size, dir); + arch_sync_dma_flush(); + } iova = __iommu_dma_map(dev, phys, size, prot, dma_mask); if (iova == DMA_MAPPING_ERROR && !(attrs & DMA_ATTR_MMIO)) @@ -1242,8 +1252,10 @@ void iommu_dma_unmap_phys(struct device *dev, dma_addr_t dma_handle, if (WARN_ON(!phys)) return; - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev)) + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev)) { arch_sync_dma_for_cpu(phys, size, dir); + arch_sync_dma_flush(); + } __iommu_dma_unmap(dev, dma_handle, size); @@ -1980,6 +1992,8 @@ int dma_iova_sync(struct device *dev, struct dma_iova_state *state, dma_addr_t addr = state->addr + offset; size_t iova_start_pad = iova_offset(iovad, addr); + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_flush(); return iommu_sync_map(domain, addr - iova_start_pad, iova_align(iovad, size + iova_start_pad)); } @@ -1993,6 +2007,8 @@ static void iommu_dma_iova_unlink_range_slow(struct device *dev, struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iova_domain *iovad = &cookie->iovad; size_t iova_start_pad = iova_offset(iovad, addr); + bool need_sync_dma = !dev_is_dma_coherent(dev) && + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)); dma_addr_t end = addr + size; do { @@ -2016,6 +2032,9 @@ static void iommu_dma_iova_unlink_range_slow(struct device *dev, addr += len; iova_start_pad = 0; } while (addr < end); + + if (need_sync_dma) + arch_sync_dma_flush(); } static void __iommu_dma_iova_unlink(struct device *dev, diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index ccf25027bec1..b79917e785a5 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -262,10 +262,12 @@ static dma_addr_t xen_swiotlb_map_phys(struct device *dev, phys_addr_t phys, done: if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { - if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dev_addr)))) + if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dev_addr)))) { arch_sync_dma_for_device(phys, size, dir); - else + arch_sync_dma_flush(); + } else { xen_dma_sync_for_device(dev, dev_addr, size, dir); + } } return dev_addr; } @@ -287,10 +289,12 @@ static void xen_swiotlb_unmap_phys(struct device *hwdev, dma_addr_t dev_addr, BUG_ON(dir == DMA_NONE); if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { - if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr)))) + if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr)))) { arch_sync_dma_for_cpu(paddr, size, dir); - else + arch_sync_dma_flush(); + } else { xen_dma_sync_for_cpu(hwdev, dev_addr, size, dir); + } } /* NOTE: We use dev_addr here, not paddr! */ @@ -308,10 +312,12 @@ xen_swiotlb_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr, struct io_tlb_pool *pool; if (!dev_is_dma_coherent(dev)) { - if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) + if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) { arch_sync_dma_for_cpu(paddr, size, dir); - else + arch_sync_dma_flush(); + } else { xen_dma_sync_for_cpu(dev, dma_addr, size, dir); + } } pool = xen_swiotlb_find_pool(dev, dma_addr); @@ -331,10 +337,12 @@ xen_swiotlb_sync_single_for_device(struct device *dev, dma_addr_t dma_addr, __swiotlb_sync_single_for_device(dev, paddr, size, dir, pool); if (!dev_is_dma_coherent(dev)) { - if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) + if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) { arch_sync_dma_for_device(paddr, size, dir); - else + arch_sync_dma_flush(); + } else { xen_dma_sync_for_device(dev, dma_addr, size, dir); + } } } diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 60b63756df82..8a07df5a9ef6 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -361,6 +361,12 @@ static inline void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, } #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */ +#ifndef CONFIG_ARCH_HAS_BATCHED_DMA_SYNC +static inline void arch_sync_dma_flush(void) +{ +} +#endif + #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL void arch_sync_dma_for_cpu_all(void); #else diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 159900736f25..bfef21b4a9ae 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -72,6 +72,9 @@ config ARCH_HAS_DMA_PREP_COHERENT config ARCH_HAS_FORCE_DMA_UNENCRYPTED bool +config ARCH_HAS_BATCHED_DMA_SYNC + bool + # # Select this option if the architecture assumes DMA devices are coherent # by default. diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8f43a930716d..c7666e5d5e7c 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -406,6 +406,8 @@ void dma_direct_sync_sg_for_device(struct device *dev, arch_sync_dma_for_device(paddr, sg->length, dir); } + if (!dev_is_dma_coherent(dev)) + arch_sync_dma_flush(); } #endif @@ -427,8 +429,10 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir); } - if (!dev_is_dma_coherent(dev)) + if (!dev_is_dma_coherent(dev)) { + arch_sync_dma_flush(); arch_sync_dma_for_cpu_all(); + } } /* diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index f476c63b668c..f925a7e8b000 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -60,8 +60,10 @@ static inline void dma_direct_sync_single_for_device(struct device *dev, swiotlb_sync_single_for_device(dev, paddr, size, dir); - if (!dev_is_dma_coherent(dev)) + if (!dev_is_dma_coherent(dev)) { arch_sync_dma_for_device(paddr, size, dir); + arch_sync_dma_flush(); + } } static inline void dma_direct_sync_single_for_cpu(struct device *dev, @@ -71,6 +73,7 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev, if (!dev_is_dma_coherent(dev)) { arch_sync_dma_for_cpu(paddr, size, dir); + arch_sync_dma_flush(); arch_sync_dma_for_cpu_all(); } @@ -106,8 +109,10 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev, } if (!dev_is_dma_coherent(dev) && - !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) { arch_sync_dma_for_device(phys, size, dir); + arch_sync_dma_flush(); + } return dma_addr; err_overflow: diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d8e6f1d889d5..1105db1689d5 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -867,6 +867,9 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size if (orig_addr == INVALID_PHYS_ADDR) return; + if (dir == DMA_FROM_DEVICE && !dev_is_dma_coherent(dev)) + arch_sync_dma_flush(); + /* * It's valid for tlb_offset to be negative. This can happen when the * "offset" returned by swiotlb_align_offset() is non-zero, and the @@ -1595,8 +1598,10 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, return DMA_MAPPING_ERROR; } - if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { arch_sync_dma_for_device(swiotlb_addr, size, dir); + arch_sync_dma_flush(); + } return dma_addr; } -- cgit v1.2.3 From 74f0cca1100b6d1f1ea28178435aff8078d06603 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 11 Mar 2026 05:19:56 +0000 Subject: udp: Remove UDPLITE_SEND_CSCOV and UDPLITE_RECV_CSCOV. UDP-Lite supports variable-length checksum and has two socket options, UDPLITE_SEND_CSCOV and UDPLITE_RECV_CSCOV, to control the checksum coverage. Let's remove the support. setsockopt(UDPLITE_SEND_CSCOV / UDPLITE_RECV_CSCOV) was only available for UDP-Lite and returned -ENOPROTOOPT for UDP. Now, the options are handled in ip_setsockopt() and ipv6_setsockopt(), which still return the same error. getsockopt(UDPLITE_SEND_CSCOV / UDPLITE_RECV_CSCOV) was available for UDP and always returned 0, meaning full checksum, but now -ENOPROTOOPT is returned. Given that getsockopt() is meaningless for UDP and even the options are not defined under include/uapi/, this should not be a problem. $ man 7 udplite ... BUGS Where glibc support is missing, the following definitions are needed: #define IPPROTO_UDPLITE 136 #define UDPLITE_SEND_CSCOV 10 #define UDPLITE_RECV_CSCOV 11 Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20260311052020.1213705-10-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/udp.h | 10 +--------- include/net/udplite.h | 15 --------------- include/uapi/linux/udp.h | 2 ++ net/ipv4/udp.c | 46 ++-------------------------------------------- net/ipv6/ip6_checksum.c | 2 +- net/ipv6/udp.c | 5 ++--- 6 files changed, 8 insertions(+), 72 deletions(-) delete mode 100644 include/net/udplite.h (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index 1cbf6b4d3aab..ce56ebcee5cb 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -40,8 +40,6 @@ enum { UDP_FLAGS_ACCEPT_FRAGLIST, UDP_FLAGS_ACCEPT_L4, UDP_FLAGS_ENCAP_ENABLED, /* This socket enabled encap */ - UDP_FLAGS_UDPLITE_SEND_CC, /* set via udplite setsockopt */ - UDP_FLAGS_UDPLITE_RECV_CC, /* set via udplite setsockopt */ }; /* per NUMA structure for lockless producer usage. */ @@ -74,11 +72,7 @@ struct udp_sock { */ __u16 len; /* total length of pending frames */ __u16 gso_size; - /* - * Fields specific to UDP-Lite. - */ - __u16 pcslen; - __u16 pcrlen; + /* * For encapsulation sockets. */ @@ -236,8 +230,6 @@ static inline void udp_allow_gso(struct sock *sk) hlist_nulls_for_each_entry_rcu(__up, node, list, udp_lrpa_node) #endif -#define IS_UDPLITE(__sk) (unlikely(__sk->sk_protocol == IPPROTO_UDPLITE)) - static inline struct sock *udp_tunnel_sk(const struct net *net, bool is_ipv6) { #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) diff --git a/include/net/udplite.h b/include/net/udplite.h deleted file mode 100644 index 6bfa1d6833d1..000000000000 --- a/include/net/udplite.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Definitions for the UDP-Lite (RFC 3828) code. - */ -#ifndef _UDPLITE_H -#define _UDPLITE_H - -#include -#include - -/* UDP-Lite socket options */ -#define UDPLITE_SEND_CSCOV 10 /* sender partial coverage (as sent) */ -#define UDPLITE_RECV_CSCOV 11 /* receiver partial coverage (threshold ) */ - -#endif /* _UDPLITE_H */ diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h index edca3e430305..877fb02df8fb 100644 --- a/include/uapi/linux/udp.h +++ b/include/uapi/linux/udp.h @@ -29,6 +29,8 @@ struct udphdr { /* UDP socket options */ #define UDP_CORK 1 /* Never send partially complete segments */ +/* Deprecated, reserved for UDPLITE_SEND_CSCOV 10 */ +/* Deprecated, reserved for UDPLITE_RECV_CSCOV 11 */ #define UDP_ENCAP 100 /* Set the socket to accept encapsulated packets */ #define UDP_NO_CHECK6_TX 101 /* Disable sending checksum for UDP6X */ #define UDP_NO_CHECK6_RX 102 /* Disable accepting checksum for UDP6 */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 9a2c8ff96e83..d47ca721ef0d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -117,7 +117,6 @@ #include #include #include -#include #include #if IS_ENABLED(CONFIG_IPV6) #include @@ -2924,7 +2923,6 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, struct udp_sock *up = udp_sk(sk); int val, valbool; int err = 0; - int is_udplite = IS_UDPLITE(sk); if (level == SOL_SOCKET) { err = sk_setsockopt(sk, level, optname, optval, optlen); @@ -3011,36 +3009,6 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, sockopt_release_sock(sk); break; - /* - * UDP-Lite's partial checksum coverage (RFC 3828). - */ - /* The sender sets actual checksum coverage length via this option. - * The case coverage > packet length is handled by send module. */ - case UDPLITE_SEND_CSCOV: - if (!is_udplite) /* Disable the option on UDP sockets */ - return -ENOPROTOOPT; - if (val != 0 && val < 8) /* Illegal coverage: use default (8) */ - val = 8; - else if (val > USHRT_MAX) - val = USHRT_MAX; - WRITE_ONCE(up->pcslen, val); - udp_set_bit(UDPLITE_SEND_CC, sk); - break; - - /* The receiver specifies a minimum checksum coverage value. To make - * sense, this should be set to at least 8 (as done below). If zero is - * used, this again means full checksum coverage. */ - case UDPLITE_RECV_CSCOV: - if (!is_udplite) /* Disable the option on UDP sockets */ - return -ENOPROTOOPT; - if (val != 0 && val < 8) /* Avoid silly minimal values. */ - val = 8; - else if (val > USHRT_MAX) - val = USHRT_MAX; - WRITE_ONCE(up->pcrlen, val); - udp_set_bit(UDPLITE_RECV_CC, sk); - break; - default: err = -ENOPROTOOPT; break; @@ -3053,7 +3021,7 @@ EXPORT_IPV6_MOD(udp_lib_setsockopt); static int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { - if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET) + if (level == SOL_UDP || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, optval, optlen, udp_push_pending_frames); @@ -3099,16 +3067,6 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, val = udp_test_bit(GRO_ENABLED, sk); break; - /* The following two cannot be changed on UDP sockets, the return is - * always 0 (which corresponds to the full checksum coverage of UDP). */ - case UDPLITE_SEND_CSCOV: - val = READ_ONCE(up->pcslen); - break; - - case UDPLITE_RECV_CSCOV: - val = READ_ONCE(up->pcrlen); - break; - default: return -ENOPROTOOPT; } @@ -3124,7 +3082,7 @@ EXPORT_IPV6_MOD(udp_lib_getsockopt); static int udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { - if (level == SOL_UDP || level == SOL_UDPLITE) + if (level == SOL_UDP) return udp_lib_getsockopt(sk, level, optname, optval, optlen); return ip_getsockopt(sk, level, optname, optval, optlen); } diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c index 8bb68a0cdfd6..e1a594873675 100644 --- a/net/ipv6/ip6_checksum.c +++ b/net/ipv6/ip6_checksum.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include -#include #include #ifndef _HAVE_ARCH_IPV6_CSUM diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 511e3f898be5..c3d8b5ede164 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -58,7 +58,6 @@ #include #include #include -#include static void udpv6_destruct_sock(struct sock *sk) { @@ -1831,7 +1830,7 @@ static void udpv6_destroy_sock(struct sock *sk) static int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { - if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET) + if (level == SOL_UDP || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, optval, optlen, udp_v6_push_pending_frames); @@ -1841,7 +1840,7 @@ static int udpv6_setsockopt(struct sock *sk, int level, int optname, static int udpv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { - if (level == SOL_UDP || level == SOL_UDPLITE) + if (level == SOL_UDP) return udp_lib_getsockopt(sk, level, optname, optval, optlen); return ipv6_getsockopt(sk, level, optname, optval, optlen); } -- cgit v1.2.3 From 203247c5cb972af5d46bdb7d41ef40078048810b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 13 Mar 2026 07:47:00 -0700 Subject: blk-integrity: support arbitrary buffer alignment A bio segment may have partial interval block data with the rest continuing into the next segments because direct-io data payloads only need to align in memory to the device's DMA limits. At the same time, the protection information may also be split in multiple segments. The most likely way that may happen is if two requests merge, or if we're directly using the io_uring user metadata. The generate/verify, however, only ever accessed the first bip_vec. Further, it may be possible to unalign the protection fields from the user space buffer, or if there are odd additional opaque bytes in front or in back of the protection information metadata region. Change up the iteration to allow spanning multiple segments. This patch is mostly a re-write of the protection information handling to allow any arbitrary alignments, so it's probably easier to review the end result rather than the diff. Many controllers are not able to handle interval data composed of multiple segments when PI is used, so this patch introduces a new integrity limit that a low level driver can set to notify that it is capable, default to false. The nvme driver is the first one to enable it in this patch. Everyone else will force DMA alignment to the logical block size as before to ensure interval data is always aligned within a single segment. Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Link: https://patch.msgid.link/20260313144701.1221652-2-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 12 +- block/t10-pi.c | 816 +++++++++++++++++++++++------------------- drivers/nvme/host/core.c | 1 + include/linux/blk-integrity.h | 1 + 4 files changed, 465 insertions(+), 365 deletions(-) (limited to 'include/linux') diff --git a/block/blk-settings.c b/block/blk-settings.c index dabfab97fbab..78c83817b9d3 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -189,11 +189,11 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) } /* - * The PI generation / validation helpers do not expect intervals to - * straddle multiple bio_vecs. Enforce alignment so that those are + * Some IO controllers can not handle data intervals straddling + * multiple bio_vecs. For those, enforce alignment so that those are * never generated, and that each buffer is aligned as expected. */ - if (bi->csum_type) { + if (!(bi->flags & BLK_SPLIT_INTERVAL_CAPABLE) && bi->csum_type) { lim->dma_alignment = max(lim->dma_alignment, (1U << bi->interval_exp) - 1); } @@ -992,10 +992,14 @@ bool queue_limits_stack_integrity(struct queue_limits *t, if ((ti->flags & BLK_INTEGRITY_REF_TAG) != (bi->flags & BLK_INTEGRITY_REF_TAG)) goto incompatible; + if ((ti->flags & BLK_SPLIT_INTERVAL_CAPABLE) && + !(bi->flags & BLK_SPLIT_INTERVAL_CAPABLE)) + ti->flags &= ~BLK_SPLIT_INTERVAL_CAPABLE; } else { ti->flags = BLK_INTEGRITY_STACKED; ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) | - (bi->flags & BLK_INTEGRITY_REF_TAG); + (bi->flags & BLK_INTEGRITY_REF_TAG) | + (bi->flags & BLK_SPLIT_INTERVAL_CAPABLE); ti->csum_type = bi->csum_type; ti->pi_tuple_size = bi->pi_tuple_size; ti->metadata_size = bi->metadata_size; diff --git a/block/t10-pi.c b/block/t10-pi.c index d27be6041fd3..a19b4e102a83 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -12,462 +12,556 @@ #include #include "blk.h" +#define APP_TAG_ESCAPE 0xffff +#define REF_TAG_ESCAPE 0xffffffff + +/* + * This union is used for onstack allocations when the pi field is split across + * segments. blk_validate_integrity_limits() guarantees pi_tuple_size matches + * the sizeof one of these two types. + */ +union pi_tuple { + struct crc64_pi_tuple crc64_pi; + struct t10_pi_tuple t10_pi; +}; + struct blk_integrity_iter { - void *prot_buf; - void *data_buf; - sector_t seed; - unsigned int data_size; - unsigned short interval; - const char *disk_name; + struct bio *bio; + struct bio_integrity_payload *bip; + struct blk_integrity *bi; + struct bvec_iter data_iter; + struct bvec_iter prot_iter; + unsigned int interval_remaining; + u64 seed; + u64 csum; }; -static __be16 t10_pi_csum(__be16 csum, void *data, unsigned int len, - unsigned char csum_type) +static void blk_calculate_guard(struct blk_integrity_iter *iter, void *data, + unsigned int len) { - if (csum_type == BLK_INTEGRITY_CSUM_IP) - return (__force __be16)ip_compute_csum(data, len); - return cpu_to_be16(crc_t10dif_update(be16_to_cpu(csum), data, len)); + switch (iter->bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + iter->csum = crc64_nvme(iter->csum, data, len); + break; + case BLK_INTEGRITY_CSUM_CRC: + iter->csum = crc_t10dif_update(iter->csum, data, len); + break; + case BLK_INTEGRITY_CSUM_IP: + iter->csum = (__force u32)csum_partial(data, len, + (__force __wsum)iter->csum); + break; + default: + WARN_ON_ONCE(1); + iter->csum = U64_MAX; + break; + } +} + +static void blk_integrity_csum_finish(struct blk_integrity_iter *iter) +{ + switch (iter->bi->csum_type) { + case BLK_INTEGRITY_CSUM_IP: + iter->csum = (__force u16)csum_fold((__force __wsum)iter->csum); + break; + default: + break; + } } /* - * Type 1 and Type 2 protection use the same format: 16 bit guard tag, - * 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref - * tag. + * Update the csum for formats that have metadata padding in front of the data + * integrity field */ -static void t10_pi_generate(struct blk_integrity_iter *iter, - struct blk_integrity *bi) +static void blk_integrity_csum_offset(struct blk_integrity_iter *iter) { - u8 offset = bi->pi_offset; - unsigned int i; + unsigned int offset = iter->bi->pi_offset; + struct bio_vec *bvec = iter->bip->bip_vec; + + while (offset > 0) { + struct bio_vec pbv = bvec_iter_bvec(bvec, iter->prot_iter); + unsigned int len = min(pbv.bv_len, offset); + void *prot_buf = bvec_kmap_local(&pbv); + + blk_calculate_guard(iter, prot_buf, len); + kunmap_local(prot_buf); + offset -= len; + bvec_iter_advance_single(bvec, &iter->prot_iter, len); + } + blk_integrity_csum_finish(iter); +} - for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct t10_pi_tuple *pi = iter->prot_buf + offset; +static void blk_integrity_copy_from_tuple(struct bio_integrity_payload *bip, + struct bvec_iter *iter, void *tuple, + unsigned int tuple_size) +{ + while (tuple_size) { + struct bio_vec pbv = bvec_iter_bvec(bip->bip_vec, *iter); + unsigned int len = min(tuple_size, pbv.bv_len); + void *prot_buf = bvec_kmap_local(&pbv); + + memcpy(prot_buf, tuple, len); + kunmap_local(prot_buf); + bvec_iter_advance_single(bip->bip_vec, iter, len); + tuple_size -= len; + tuple += len; + } +} - pi->guard_tag = t10_pi_csum(0, iter->data_buf, iter->interval, - bi->csum_type); - if (offset) - pi->guard_tag = t10_pi_csum(pi->guard_tag, - iter->prot_buf, offset, bi->csum_type); - pi->app_tag = 0; +static void blk_integrity_copy_to_tuple(struct bio_integrity_payload *bip, + struct bvec_iter *iter, void *tuple, + unsigned int tuple_size) +{ + while (tuple_size) { + struct bio_vec pbv = bvec_iter_bvec(bip->bip_vec, *iter); + unsigned int len = min(tuple_size, pbv.bv_len); + void *prot_buf = bvec_kmap_local(&pbv); + + memcpy(tuple, prot_buf, len); + kunmap_local(prot_buf); + bvec_iter_advance_single(bip->bip_vec, iter, len); + tuple_size -= len; + tuple += len; + } +} - if (bi->flags & BLK_INTEGRITY_REF_TAG) - pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed)); - else - pi->ref_tag = 0; +static bool ext_pi_ref_escape(const u8 ref_tag[6]) +{ + static const u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; - iter->data_buf += iter->interval; - iter->prot_buf += bi->metadata_size; - iter->seed++; - } + return memcmp(ref_tag, ref_escape, sizeof(ref_escape)) == 0; } -static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, - struct blk_integrity *bi) -{ - u8 offset = bi->pi_offset; - unsigned int i; - - for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct t10_pi_tuple *pi = iter->prot_buf + offset; - __be16 csum; - - if (bi->flags & BLK_INTEGRITY_REF_TAG) { - if (pi->app_tag == T10_PI_APP_ESCAPE) - goto next; - - if (be32_to_cpu(pi->ref_tag) != - lower_32_bits(iter->seed)) { - pr_err("%s: ref tag error at location %llu " \ - "(rcvd %u)\n", iter->disk_name, - (unsigned long long) - iter->seed, be32_to_cpu(pi->ref_tag)); - return BLK_STS_PROTECTION; - } - } else { - if (pi->app_tag == T10_PI_APP_ESCAPE && - pi->ref_tag == T10_PI_REF_ESCAPE) - goto next; +static blk_status_t blk_verify_ext_pi(struct blk_integrity_iter *iter, + struct crc64_pi_tuple *pi) +{ + u64 seed = lower_48_bits(iter->seed); + u64 guard = get_unaligned_be64(&pi->guard_tag); + u64 ref = get_unaligned_be48(pi->ref_tag); + u16 app = get_unaligned_be16(&pi->app_tag); + + if (iter->bi->flags & BLK_INTEGRITY_REF_TAG) { + if (app == APP_TAG_ESCAPE) + return BLK_STS_OK; + if (ref != seed) { + pr_err("%s: ref tag error at location %llu (rcvd %llu)\n", + iter->bio->bi_bdev->bd_disk->disk_name, seed, + ref); + return BLK_STS_PROTECTION; } + } else if (app == APP_TAG_ESCAPE && ext_pi_ref_escape(pi->ref_tag)) { + return BLK_STS_OK; + } + + if (guard != iter->csum) { + pr_err("%s: guard tag error at sector %llu (rcvd %016llx, want %016llx)\n", + iter->bio->bi_bdev->bd_disk->disk_name, iter->seed, + guard, iter->csum); + return BLK_STS_PROTECTION; + } + + return BLK_STS_OK; +} - csum = t10_pi_csum(0, iter->data_buf, iter->interval, - bi->csum_type); - if (offset) - csum = t10_pi_csum(csum, iter->prot_buf, offset, - bi->csum_type); - - if (pi->guard_tag != csum) { - pr_err("%s: guard tag error at sector %llu " \ - "(rcvd %04x, want %04x)\n", iter->disk_name, - (unsigned long long)iter->seed, - be16_to_cpu(pi->guard_tag), be16_to_cpu(csum)); +static blk_status_t blk_verify_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi, u16 guard) +{ + u32 seed = lower_32_bits(iter->seed); + u32 ref = get_unaligned_be32(&pi->ref_tag); + u16 app = get_unaligned_be16(&pi->app_tag); + + if (iter->bi->flags & BLK_INTEGRITY_REF_TAG) { + if (app == APP_TAG_ESCAPE) + return BLK_STS_OK; + if (ref != seed) { + pr_err("%s: ref tag error at location %u (rcvd %u)\n", + iter->bio->bi_bdev->bd_disk->disk_name, seed, + ref); return BLK_STS_PROTECTION; } + } else if (app == APP_TAG_ESCAPE && ref == REF_TAG_ESCAPE) { + return BLK_STS_OK; + } -next: - iter->data_buf += iter->interval; - iter->prot_buf += bi->metadata_size; - iter->seed++; + if (guard != (u16)iter->csum) { + pr_err("%s: guard tag error at sector %llu (rcvd %04x, want %04x)\n", + iter->bio->bi_bdev->bd_disk->disk_name, iter->seed, + guard, (u16)iter->csum); + return BLK_STS_PROTECTION; } return BLK_STS_OK; } -/** - * t10_pi_type1_prepare - prepare PI prior submitting request to device - * @rq: request with PI that should be prepared - * - * For Type 1/Type 2, the virtual start sector is the one that was - * originally submitted by the block layer for the ref_tag usage. Due to - * partitioning, MD/DM cloning, etc. the actual physical start sector is - * likely to be different. Remap protection information to match the - * physical LBA. - */ -static void t10_pi_type1_prepare(struct request *rq) +static blk_status_t blk_verify_t10_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi) { - struct blk_integrity *bi = &rq->q->limits.integrity; - const int tuple_sz = bi->metadata_size; - u32 ref_tag = t10_pi_ref_tag(rq); - u8 offset = bi->pi_offset; - struct bio *bio; + u16 guard = get_unaligned_be16(&pi->guard_tag); - __rq_for_each_bio(bio, rq) { - struct bio_integrity_payload *bip = bio_integrity(bio); - u32 virt = bip_get_seed(bip) & 0xffffffff; - struct bio_vec iv; - struct bvec_iter iter; + return blk_verify_pi(iter, pi, guard); +} - /* Already remapped? */ - if (bip->bip_flags & BIP_MAPPED_INTEGRITY) - break; +static blk_status_t blk_verify_ip_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi) +{ + u16 guard = get_unaligned((u16 *)&pi->guard_tag); - bip_for_each_vec(iv, bip, iter) { - unsigned int j; - void *p; - - p = bvec_kmap_local(&iv); - for (j = 0; j < iv.bv_len; j += tuple_sz) { - struct t10_pi_tuple *pi = p + offset; - - if (be32_to_cpu(pi->ref_tag) == virt) - pi->ref_tag = cpu_to_be32(ref_tag); - virt++; - ref_tag++; - p += tuple_sz; - } - kunmap_local(p); - } + return blk_verify_pi(iter, pi, guard); +} - bip->bip_flags |= BIP_MAPPED_INTEGRITY; +static blk_status_t blk_integrity_verify(struct blk_integrity_iter *iter, + union pi_tuple *tuple) +{ + switch (iter->bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + return blk_verify_ext_pi(iter, &tuple->crc64_pi); + case BLK_INTEGRITY_CSUM_CRC: + return blk_verify_t10_pi(iter, &tuple->t10_pi); + case BLK_INTEGRITY_CSUM_IP: + return blk_verify_ip_pi(iter, &tuple->t10_pi); + default: + return BLK_STS_OK; } } -/** - * t10_pi_type1_complete - prepare PI prior returning request to the blk layer - * @rq: request with PI that should be prepared - * @nr_bytes: total bytes to prepare - * - * For Type 1/Type 2, the virtual start sector is the one that was - * originally submitted by the block layer for the ref_tag usage. Due to - * partitioning, MD/DM cloning, etc. the actual physical start sector is - * likely to be different. Since the physical start sector was submitted - * to the device, we should remap it back to virtual values expected by the - * block layer. - */ -static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) +static void blk_set_ext_pi(struct blk_integrity_iter *iter, + struct crc64_pi_tuple *pi) { - struct blk_integrity *bi = &rq->q->limits.integrity; - unsigned intervals = nr_bytes >> bi->interval_exp; - const int tuple_sz = bi->metadata_size; - u32 ref_tag = t10_pi_ref_tag(rq); - u8 offset = bi->pi_offset; - struct bio *bio; + put_unaligned_be64(iter->csum, &pi->guard_tag); + put_unaligned_be16(0, &pi->app_tag); + put_unaligned_be48(iter->seed, &pi->ref_tag); +} - __rq_for_each_bio(bio, rq) { - struct bio_integrity_payload *bip = bio_integrity(bio); - u32 virt = bip_get_seed(bip) & 0xffffffff; - struct bio_vec iv; - struct bvec_iter iter; - - bip_for_each_vec(iv, bip, iter) { - unsigned int j; - void *p; - - p = bvec_kmap_local(&iv); - for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { - struct t10_pi_tuple *pi = p + offset; - - if (be32_to_cpu(pi->ref_tag) == ref_tag) - pi->ref_tag = cpu_to_be32(virt); - virt++; - ref_tag++; - intervals--; - p += tuple_sz; - } - kunmap_local(p); - } - } +static void blk_set_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi, __be16 csum) +{ + put_unaligned(csum, &pi->guard_tag); + put_unaligned_be16(0, &pi->app_tag); + put_unaligned_be32(iter->seed, &pi->ref_tag); } -static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len) +static void blk_set_t10_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi) { - return cpu_to_be64(crc64_nvme(crc, data, len)); + blk_set_pi(iter, pi, cpu_to_be16((u16)iter->csum)); } -static void ext_pi_crc64_generate(struct blk_integrity_iter *iter, - struct blk_integrity *bi) +static void blk_set_ip_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi) { - u8 offset = bi->pi_offset; - unsigned int i; + blk_set_pi(iter, pi, (__force __be16)(u16)iter->csum); +} - for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct crc64_pi_tuple *pi = iter->prot_buf + offset; +static void blk_integrity_set(struct blk_integrity_iter *iter, + union pi_tuple *tuple) +{ + switch (iter->bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + return blk_set_ext_pi(iter, &tuple->crc64_pi); + case BLK_INTEGRITY_CSUM_CRC: + return blk_set_t10_pi(iter, &tuple->t10_pi); + case BLK_INTEGRITY_CSUM_IP: + return blk_set_ip_pi(iter, &tuple->t10_pi); + default: + WARN_ON_ONCE(1); + return; + } +} - pi->guard_tag = ext_pi_crc64(0, iter->data_buf, iter->interval); - if (offset) - pi->guard_tag = ext_pi_crc64(be64_to_cpu(pi->guard_tag), - iter->prot_buf, offset); - pi->app_tag = 0; +static blk_status_t blk_integrity_interval(struct blk_integrity_iter *iter, + bool verify) +{ + blk_status_t ret = BLK_STS_OK; + union pi_tuple tuple; + void *ptuple = &tuple; + struct bio_vec pbv; + + blk_integrity_csum_offset(iter); + pbv = bvec_iter_bvec(iter->bip->bip_vec, iter->prot_iter); + if (pbv.bv_len >= iter->bi->pi_tuple_size) { + ptuple = bvec_kmap_local(&pbv); + bvec_iter_advance_single(iter->bip->bip_vec, &iter->prot_iter, + iter->bi->metadata_size - iter->bi->pi_offset); + } else if (verify) { + blk_integrity_copy_to_tuple(iter->bip, &iter->prot_iter, + ptuple, iter->bi->pi_tuple_size); + } - if (bi->flags & BLK_INTEGRITY_REF_TAG) - put_unaligned_be48(iter->seed, pi->ref_tag); - else - put_unaligned_be48(0ULL, pi->ref_tag); + if (verify) + ret = blk_integrity_verify(iter, ptuple); + else + blk_integrity_set(iter, ptuple); - iter->data_buf += iter->interval; - iter->prot_buf += bi->metadata_size; - iter->seed++; + if (likely(ptuple != &tuple)) { + kunmap_local(ptuple); + } else if (!verify) { + blk_integrity_copy_from_tuple(iter->bip, &iter->prot_iter, + ptuple, iter->bi->pi_tuple_size); } + + iter->interval_remaining = 1 << iter->bi->interval_exp; + iter->csum = 0; + iter->seed++; + return ret; } -static bool ext_pi_ref_escape(const u8 ref_tag[6]) +static blk_status_t blk_integrity_iterate(struct bio *bio, + struct bvec_iter *data_iter, + bool verify) { - static const u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + struct blk_integrity_iter iter = { + .bio = bio, + .bip = bip, + .bi = bi, + .data_iter = *data_iter, + .prot_iter = bip->bip_iter, + .interval_remaining = 1 << bi->interval_exp, + .seed = data_iter->bi_sector, + .csum = 0, + }; + blk_status_t ret = BLK_STS_OK; + + while (iter.data_iter.bi_size && ret == BLK_STS_OK) { + struct bio_vec bv = bvec_iter_bvec(iter.bio->bi_io_vec, + iter.data_iter); + void *kaddr = bvec_kmap_local(&bv); + void *data = kaddr; + unsigned int len; + + bvec_iter_advance_single(iter.bio->bi_io_vec, &iter.data_iter, + bv.bv_len); + while (bv.bv_len && ret == BLK_STS_OK) { + len = min(iter.interval_remaining, bv.bv_len); + blk_calculate_guard(&iter, data, len); + bv.bv_len -= len; + data += len; + iter.interval_remaining -= len; + if (!iter.interval_remaining) + ret = blk_integrity_interval(&iter, verify); + } + kunmap_local(kaddr); + } - return memcmp(ref_tag, ref_escape, sizeof(ref_escape)) == 0; + return ret; } -static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, - struct blk_integrity *bi) -{ - u8 offset = bi->pi_offset; - unsigned int i; - - for (i = 0; i < iter->data_size; i += iter->interval) { - struct crc64_pi_tuple *pi = iter->prot_buf + offset; - u64 ref, seed; - __be64 csum; - - if (bi->flags & BLK_INTEGRITY_REF_TAG) { - if (pi->app_tag == T10_PI_APP_ESCAPE) - goto next; - - ref = get_unaligned_be48(pi->ref_tag); - seed = lower_48_bits(iter->seed); - if (ref != seed) { - pr_err("%s: ref tag error at location %llu (rcvd %llu)\n", - iter->disk_name, seed, ref); - return BLK_STS_PROTECTION; - } - } else { - if (pi->app_tag == T10_PI_APP_ESCAPE && - ext_pi_ref_escape(pi->ref_tag)) - goto next; - } +void bio_integrity_generate(struct bio *bio) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - csum = ext_pi_crc64(0, iter->data_buf, iter->interval); - if (offset) - csum = ext_pi_crc64(be64_to_cpu(csum), iter->prot_buf, - offset); + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + blk_integrity_iterate(bio, &bio->bi_iter, false); + break; + default: + break; + } +} - if (pi->guard_tag != csum) { - pr_err("%s: guard tag error at sector %llu " \ - "(rcvd %016llx, want %016llx)\n", - iter->disk_name, (unsigned long long)iter->seed, - be64_to_cpu(pi->guard_tag), be64_to_cpu(csum)); - return BLK_STS_PROTECTION; - } +blk_status_t bio_integrity_verify(struct bio *bio, struct bvec_iter *saved_iter) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); -next: - iter->data_buf += iter->interval; - iter->prot_buf += bi->metadata_size; - iter->seed++; + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + return blk_integrity_iterate(bio, saved_iter, true); + default: + break; } return BLK_STS_OK; } -static void ext_pi_type1_prepare(struct request *rq) +/* + * Advance @iter past the protection offset for protection formats that + * contain front padding on the metadata region. + */ +static void blk_pi_advance_offset(struct blk_integrity *bi, + struct bio_integrity_payload *bip, + struct bvec_iter *iter) { - struct blk_integrity *bi = &rq->q->limits.integrity; - const int tuple_sz = bi->metadata_size; - u64 ref_tag = ext_pi_ref_tag(rq); - u8 offset = bi->pi_offset; - struct bio *bio; + unsigned int offset = bi->pi_offset; - __rq_for_each_bio(bio, rq) { - struct bio_integrity_payload *bip = bio_integrity(bio); - u64 virt = lower_48_bits(bip_get_seed(bip)); - struct bio_vec iv; - struct bvec_iter iter; + while (offset > 0) { + struct bio_vec bv = mp_bvec_iter_bvec(bip->bip_vec, *iter); + unsigned int len = min(bv.bv_len, offset); - /* Already remapped? */ - if (bip->bip_flags & BIP_MAPPED_INTEGRITY) - break; + bvec_iter_advance_single(bip->bip_vec, iter, len); + offset -= len; + } +} - bip_for_each_vec(iv, bip, iter) { - unsigned int j; - void *p; - - p = bvec_kmap_local(&iv); - for (j = 0; j < iv.bv_len; j += tuple_sz) { - struct crc64_pi_tuple *pi = p + offset; - u64 ref = get_unaligned_be48(pi->ref_tag); - - if (ref == virt) - put_unaligned_be48(ref_tag, pi->ref_tag); - virt++; - ref_tag++; - p += tuple_sz; - } - kunmap_local(p); - } +static void *blk_tuple_remap_begin(union pi_tuple *tuple, + struct blk_integrity *bi, + struct bio_integrity_payload *bip, + struct bvec_iter *iter) +{ + struct bvec_iter titer; + struct bio_vec pbv; - bip->bip_flags |= BIP_MAPPED_INTEGRITY; + blk_pi_advance_offset(bi, bip, iter); + pbv = bvec_iter_bvec(bip->bip_vec, *iter); + if (likely(pbv.bv_len >= bi->pi_tuple_size)) + return bvec_kmap_local(&pbv); + + /* + * We need to preserve the state of the original iter for the + * copy_from_tuple at the end, so make a temp iter for here. + */ + titer = *iter; + blk_integrity_copy_to_tuple(bip, &titer, tuple, bi->pi_tuple_size); + return tuple; +} + +static void blk_tuple_remap_end(union pi_tuple *tuple, void *ptuple, + struct blk_integrity *bi, + struct bio_integrity_payload *bip, + struct bvec_iter *iter) +{ + unsigned int len = bi->metadata_size - bi->pi_offset; + + if (likely(ptuple != tuple)) { + kunmap_local(ptuple); + } else { + blk_integrity_copy_from_tuple(bip, iter, ptuple, + bi->pi_tuple_size); + len -= bi->pi_tuple_size; } + + bvec_iter_advance(bip->bip_vec, iter, len); } -static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) +static void blk_set_ext_unmap_ref(struct crc64_pi_tuple *pi, u64 virt, + u64 ref_tag) { - struct blk_integrity *bi = &rq->q->limits.integrity; - unsigned intervals = nr_bytes >> bi->interval_exp; - const int tuple_sz = bi->metadata_size; - u64 ref_tag = ext_pi_ref_tag(rq); - u8 offset = bi->pi_offset; - struct bio *bio; + u64 ref = get_unaligned_be48(&pi->ref_tag); - __rq_for_each_bio(bio, rq) { - struct bio_integrity_payload *bip = bio_integrity(bio); - u64 virt = lower_48_bits(bip_get_seed(bip)); - struct bio_vec iv; - struct bvec_iter iter; - - bip_for_each_vec(iv, bip, iter) { - unsigned int j; - void *p; - - p = bvec_kmap_local(&iv); - for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { - struct crc64_pi_tuple *pi = p + offset; - u64 ref = get_unaligned_be48(pi->ref_tag); - - if (ref == ref_tag) - put_unaligned_be48(virt, pi->ref_tag); - virt++; - ref_tag++; - intervals--; - p += tuple_sz; - } - kunmap_local(p); - } + if (ref == lower_48_bits(ref_tag) && ref != lower_48_bits(virt)) + put_unaligned_be48(virt, pi->ref_tag); +} + +static void blk_set_t10_unmap_ref(struct t10_pi_tuple *pi, u32 virt, + u32 ref_tag) +{ + u32 ref = get_unaligned_be32(&pi->ref_tag); + + if (ref == ref_tag && ref != virt) + put_unaligned_be32(virt, &pi->ref_tag); +} + +static void blk_reftag_remap_complete(struct blk_integrity *bi, + union pi_tuple *tuple, u64 virt, u64 ref) +{ + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + blk_set_ext_unmap_ref(&tuple->crc64_pi, virt, ref); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + blk_set_t10_unmap_ref(&tuple->t10_pi, virt, ref); + break; + default: + WARN_ON_ONCE(1); + break; } } -void bio_integrity_generate(struct bio *bio) +static void blk_set_ext_map_ref(struct crc64_pi_tuple *pi, u64 virt, + u64 ref_tag) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity_iter iter; - struct bvec_iter bviter; - struct bio_vec bv; - - iter.disk_name = bio->bi_bdev->bd_disk->disk_name; - iter.interval = 1 << bi->interval_exp; - iter.seed = bio->bi_iter.bi_sector; - iter.prot_buf = bvec_virt(bip->bip_vec); - bio_for_each_segment(bv, bio, bviter) { - void *kaddr = bvec_kmap_local(&bv); + u64 ref = get_unaligned_be48(&pi->ref_tag); - iter.data_buf = kaddr; - iter.data_size = bv.bv_len; - switch (bi->csum_type) { - case BLK_INTEGRITY_CSUM_CRC64: - ext_pi_crc64_generate(&iter, bi); - break; - case BLK_INTEGRITY_CSUM_CRC: - case BLK_INTEGRITY_CSUM_IP: - t10_pi_generate(&iter, bi); - break; - default: - break; - } - kunmap_local(kaddr); + if (ref == lower_48_bits(virt) && ref != ref_tag) + put_unaligned_be48(ref_tag, pi->ref_tag); +} + +static void blk_set_t10_map_ref(struct t10_pi_tuple *pi, u32 virt, u32 ref_tag) +{ + u32 ref = get_unaligned_be32(&pi->ref_tag); + + if (ref == virt && ref != ref_tag) + put_unaligned_be32(ref_tag, &pi->ref_tag); +} + +static void blk_reftag_remap_prepare(struct blk_integrity *bi, + union pi_tuple *tuple, + u64 virt, u64 ref) +{ + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + blk_set_ext_map_ref(&tuple->crc64_pi, virt, ref); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + blk_set_t10_map_ref(&tuple->t10_pi, virt, ref); + break; + default: + WARN_ON_ONCE(1); + break; } } -blk_status_t bio_integrity_verify(struct bio *bio, struct bvec_iter *saved_iter) +static void __blk_reftag_remap(struct bio *bio, struct blk_integrity *bi, + unsigned *intervals, u64 *ref, bool prep) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity_iter iter; - struct bvec_iter bviter; - struct bio_vec bv; + struct bvec_iter iter = bip->bip_iter; + u64 virt = bip_get_seed(bip); + union pi_tuple *ptuple; + union pi_tuple tuple; - /* - * At the moment verify is called bi_iter has been advanced during split - * and completion, so use the copy created during submission here. - */ - iter.disk_name = bio->bi_bdev->bd_disk->disk_name; - iter.interval = 1 << bi->interval_exp; - iter.seed = saved_iter->bi_sector; - iter.prot_buf = bvec_virt(bip->bip_vec); - __bio_for_each_segment(bv, bio, bviter, *saved_iter) { - void *kaddr = bvec_kmap_local(&bv); - blk_status_t ret = BLK_STS_OK; + if (prep && bip->bip_flags & BIP_MAPPED_INTEGRITY) { + *ref += bio->bi_iter.bi_size >> bi->interval_exp; + return; + } - iter.data_buf = kaddr; - iter.data_size = bv.bv_len; - switch (bi->csum_type) { - case BLK_INTEGRITY_CSUM_CRC64: - ret = ext_pi_crc64_verify(&iter, bi); - break; - case BLK_INTEGRITY_CSUM_CRC: - case BLK_INTEGRITY_CSUM_IP: - ret = t10_pi_verify(&iter, bi); - break; - default: - break; - } - kunmap_local(kaddr); + while (iter.bi_size && *intervals) { + ptuple = blk_tuple_remap_begin(&tuple, bi, bip, &iter); + + if (prep) + blk_reftag_remap_prepare(bi, ptuple, virt, *ref); + else + blk_reftag_remap_complete(bi, ptuple, virt, *ref); - if (ret) - return ret; + blk_tuple_remap_end(&tuple, ptuple, bi, bip, &iter); + (*intervals)--; + (*ref)++; + virt++; } - return BLK_STS_OK; + if (prep) + bip->bip_flags |= BIP_MAPPED_INTEGRITY; } -void blk_integrity_prepare(struct request *rq) +static void blk_integrity_remap(struct request *rq, unsigned int nr_bytes, + bool prep) { struct blk_integrity *bi = &rq->q->limits.integrity; + u64 ref = blk_rq_pos(rq) >> (bi->interval_exp - SECTOR_SHIFT); + unsigned intervals = nr_bytes >> bi->interval_exp; + struct bio *bio; if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) return; - if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64) - ext_pi_type1_prepare(rq); - else - t10_pi_type1_prepare(rq); + __rq_for_each_bio(bio, rq) { + __blk_reftag_remap(bio, bi, &intervals, &ref, prep); + if (!intervals) + break; + } } -void blk_integrity_complete(struct request *rq, unsigned int nr_bytes) +void blk_integrity_prepare(struct request *rq) { - struct blk_integrity *bi = &rq->q->limits.integrity; - - if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) - return; + blk_integrity_remap(rq, blk_rq_bytes(rq), true); +} - if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64) - ext_pi_type1_complete(rq, nr_bytes); - else - t10_pi_type1_complete(rq, nr_bytes); +void blk_integrity_complete(struct request *rq, unsigned int nr_bytes) +{ + blk_integrity_remap(rq, nr_bytes, false); } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 58bf432ec5e6..3de52f1d2723 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1875,6 +1875,7 @@ static bool nvme_init_integrity(struct nvme_ns_head *head, break; } + bi->flags |= BLK_SPLIT_INTERVAL_CAPABLE; bi->metadata_size = head->ms; if (bi->csum_type) { bi->pi_tuple_size = head->pi_size; diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index ea6d7d322ae3..b1b530613c34 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -14,6 +14,7 @@ enum blk_integrity_flags { BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, BLK_INTEGRITY_REF_TAG = 1 << 3, BLK_INTEGRITY_STACKED = 1 << 4, + BLK_SPLIT_INTERVAL_CAPABLE = 1 << 5, }; const char *blk_integrity_profile_name(struct blk_integrity *bi); -- cgit v1.2.3 From 0e24d17bd9668f9dad78ede6a0e8f13dab176682 Mon Sep 17 00:00:00 2001 From: Simon Baatz Date: Mon, 9 Mar 2026 09:02:26 +0100 Subject: tcp: implement RFC 7323 window retraction receiver requirements By default, the Linux TCP implementation does not shrink the advertised window (RFC 7323 calls this "window retraction") with the following exceptions: - When an incoming segment cannot be added due to the receive buffer running out of memory. Since commit 8c670bdfa58e ("tcp: correct handling of extreme memory squeeze") a zero window will be advertised in this case. It turns out that reaching the required memory pressure is easy when window scaling is in use. In the simplest case, sending a sufficient number of segments smaller than the scale factor to a receiver that does not read data is enough. - Commit b650d953cd39 ("tcp: enforce receive buffer memory limits by allowing the tcp window to shrink") addressed the "eating memory" problem by introducing a sysctl knob that allows shrinking the window before running out of memory. However, RFC 7323 does not only state that shrinking the window is necessary in some cases, it also formulates requirements for TCP implementations when doing so (Section 2.4). This commit addresses the receiver-side requirements: After retracting the window, the peer may have a snd_nxt that lies within a previously advertised window but is now beyond the retracted window. This means that all incoming segments (including pure ACKs) will be rejected until the application happens to read enough data to let the peer's snd_nxt be in window again (which may be never). To comply with RFC 7323, the receiver MUST honor any segment that would have been in window for any ACK sent by the receiver and, when window scaling is in effect, SHOULD track the maximum window sequence number it has advertised. This patch tracks that maximum window sequence number rcv_mwnd_seq throughout the connection and uses it in tcp_sequence() when deciding whether a segment is acceptable. rcv_mwnd_seq is updated together with rcv_wup and rcv_wnd in tcp_select_window(). If we count tcp_sequence() as fast path, it is read in the fast path. Therefore, rcv_mwnd_seq is put into rcv_wnd's cacheline group. The logic for handling received data in tcp_data_queue() is already sufficient and does not need to be updated. Signed-off-by: Simon Baatz Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20260309-tcp_rfc7323_retract_wnd_rfc-v3-1-4c7f96b1ec69@gmail.com Signed-off-by: Jakub Kicinski --- .../networking/net_cachelines/tcp_sock.rst | 1 + include/linux/tcp.h | 3 +++ include/net/tcp.h | 22 ++++++++++++++++++++++ net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_fastopen.c | 1 + net/ipv4/tcp_input.c | 10 +++++----- net/ipv4/tcp_minisocks.c | 1 + net/ipv4/tcp_output.c | 3 +++ .../net/packetdrill/tcp_rcv_big_endseq.pkt | 2 +- 9 files changed, 39 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 563daea10d6c..fecf61166a54 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -121,6 +121,7 @@ u64 delivered_mstamp read_write u32 rate_delivered read_mostly tcp_rate_gen u32 rate_interval_us read_mostly rate_delivered,rate_app_limited u32 rcv_wnd read_write read_mostly tcp_select_window,tcp_receive_window,tcp_fast_path_check +u32 rcv_mwnd_seq read_write tcp_select_window u32 write_seq read_write tcp_rate_check_app_limited,tcp_write_queue_empty,tcp_skb_entail,forced_push,tcp_mark_push u32 notsent_lowat read_mostly tcp_stream_memory_free u32 pushed_seq read_write tcp_mark_push,forced_push diff --git a/include/linux/tcp.h b/include/linux/tcp.h index bcebc4f07532..6982f10e826b 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -316,6 +316,9 @@ struct tcp_sock { */ u32 app_limited; /* limited until "delivered" reaches this val */ u32 rcv_wnd; /* Current receiver window */ + u32 rcv_mwnd_seq; /* Maximum window sequence number (RFC 7323, + * section 2.4, receiver requirements) + */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ /* * Options received (usually on last packet, some only on SYN packets). diff --git a/include/net/tcp.h b/include/net/tcp.h index 48dffcca0a71..f87bdacb5a69 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -934,6 +934,28 @@ static inline u32 tcp_receive_window(const struct tcp_sock *tp) return (u32) win; } +/* Compute the maximum receive window we ever advertised. + * Rcv_nxt can be after the window if our peer push more data + * than the offered window. + */ +static inline u32 tcp_max_receive_window(const struct tcp_sock *tp) +{ + s32 win = tp->rcv_mwnd_seq - tp->rcv_nxt; + + if (win < 0) + win = 0; + return (u32) win; +} + +/* Check if we need to update the maximum receive window sequence number */ +static inline void tcp_update_max_rcv_wnd_seq(struct tcp_sock *tp) +{ + u32 wre = tp->rcv_wup + tp->rcv_wnd; + + if (after(wre, tp->rcv_mwnd_seq)) + tp->rcv_mwnd_seq = wre; +} + /* Choose a new window, without checks for shrinking, and without * scaling applied to the result. The caller does these things * if necessary. This is a "raw" window selection. diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ed6f6712f060..516087c622ad 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3561,6 +3561,7 @@ static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len) tp->rcv_wnd = opt.rcv_wnd; tp->rcv_wup = opt.rcv_wup; + tp->rcv_mwnd_seq = opt.rcv_wup + opt.rcv_wnd; return 0; } @@ -5275,6 +5276,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_mwnd_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 9fdc19accafd..4e389d609f91 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -377,6 +377,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, tcp_rsk(req)->rcv_nxt = tp->rcv_nxt; tp->rcv_wup = tp->rcv_nxt; + tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd; /* tcp_conn_request() is sending the SYNACK, * and queues the child into listener accept queue. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 71ac69b7b75e..2e1b23760815 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4808,20 +4808,18 @@ static enum skb_drop_reason tcp_sequence(const struct sock *sk, const struct tcphdr *th) { const struct tcp_sock *tp = tcp_sk(sk); - u32 seq_limit; if (before(end_seq, tp->rcv_wup)) return SKB_DROP_REASON_TCP_OLD_SEQUENCE; - seq_limit = tp->rcv_nxt + tcp_receive_window(tp); - if (unlikely(after(end_seq, seq_limit))) { + if (unlikely(after(end_seq, tp->rcv_nxt + tcp_max_receive_window(tp)))) { /* Some stacks are known to handle FIN incorrectly; allow the * FIN to extend beyond the window and check it in detail later. */ - if (!after(end_seq - th->fin, seq_limit)) + if (!after(end_seq - th->fin, tp->rcv_nxt + tcp_receive_window(tp))) return SKB_NOT_DROPPED_YET; - if (after(seq, seq_limit)) + if (after(seq, tp->rcv_nxt + tcp_max_receive_window(tp))) return SKB_DROP_REASON_TCP_INVALID_SEQUENCE; /* Only accept this packet if receive queue is empty. */ @@ -6903,6 +6901,7 @@ consume: */ WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; + tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd; /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. @@ -7015,6 +7014,7 @@ consume: WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; + tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd; /* RFC1323: The window in SYN & SYN/ACK segments is * never scaled. diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index dafb63b923d0..d350d794a959 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -604,6 +604,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->window_clamp = req->rsk_window_clamp; newtp->rcv_ssthresh = req->rsk_rcv_wnd; newtp->rcv_wnd = req->rsk_rcv_wnd; + newtp->rcv_mwnd_seq = newtp->rcv_wup + req->rsk_rcv_wnd; newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 34a25ef61006..35c3b0ab5a0c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -293,6 +293,7 @@ static u16 tcp_select_window(struct sock *sk) tp->pred_flags = 0; tp->rcv_wnd = 0; tp->rcv_wup = tp->rcv_nxt; + tcp_update_max_rcv_wnd_seq(tp); return 0; } @@ -316,6 +317,7 @@ static u16 tcp_select_window(struct sock *sk) tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; + tcp_update_max_rcv_wnd_seq(tp); /* Make sure we do not exceed the maximum possible * scaled window. @@ -4165,6 +4167,7 @@ static void tcp_connect_init(struct sock *sk) else tp->rcv_tstamp = tcp_jiffies32; tp->rcv_wup = tp->rcv_nxt; + tp->rcv_mwnd_seq = tp->rcv_nxt + tp->rcv_wnd; WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_big_endseq.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_big_endseq.pkt index 6c0f32c40f19..12882be10f2e 100644 --- a/tools/testing/selftests/net/packetdrill/tcp_rcv_big_endseq.pkt +++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_big_endseq.pkt @@ -36,7 +36,7 @@ +0 read(4, ..., 100000) = 4000 -// If queue is empty, accept a packet even if its end_seq is above wup + rcv_wnd +// If queue is empty, accept a packet even if its end_seq is above rcv_mwnd_seq +0 < P. 4001:54001(50000) ack 1 win 257 * > . 1:1(0) ack 54001 win 0 -- cgit v1.2.3 From f807b5b9b89eb9220d034115c272c312251cbcac Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 12 Mar 2026 12:13:52 +0000 Subject: net: stmmac: avoid passing pci_dev The pci_dev is only used to provide the ethtool bus_info using pci_name(priv->plat->pdev). This is the same as dev_name(priv->device). Thus, rather than passing the pci_dev, make use of what we already have. To avoid unexpectedly exposing the device name through ethtool where it wasn't provided before, add a flag priv->plat->provide_bus_info to enable this, which only dwmac-intel needs to set. Signed-off-by: Russell King (Oracle) Reviewed-by: Simon Horman Link: https://patch.msgid.link/E1w0evI-0000000CzY7-1fyo@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 5 ++--- include/linux/stmmac.h | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c index 421c6c81ca5e..f621077c30a4 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel.c @@ -589,7 +589,7 @@ static int intel_mgbe_common_data(struct pci_dev *pdev, int ret; int i; - plat->pdev = pdev; + plat->provide_bus_info = true; plat->phy_addr = -1; plat->clk_csr = STMMAC_CSR_250_300M; plat->core_type = DWMAC_CORE_GMAC4; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index c1e26965d9b5..92585d27ab88 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -312,10 +312,9 @@ static void stmmac_ethtool_getdrvinfo(struct net_device *dev, strscpy(info->driver, MAC100_ETHTOOL_NAME, sizeof(info->driver)); - if (priv->plat->pdev) { - strscpy(info->bus_info, pci_name(priv->plat->pdev), + if (priv->plat->provide_bus_info) + strscpy(info->bus_info, dev_name(priv->device), sizeof(info->bus_info)); - } } static int stmmac_ethtool_get_link_ksettings(struct net_device *dev, diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 937985276e6b..72febd246bdb 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -348,7 +348,7 @@ struct plat_stmmacenet_data { int rss_en; int mac_port_sel_speed; u8 vlan_fail_q; - struct pci_dev *pdev; + bool provide_bus_info; int int_snapshot_num; int msi_mac_vec; int msi_wol_vec; -- cgit v1.2.3 From 6df1459605cedd2112ebf660c77f42bb87d5c306 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 9 Mar 2026 18:03:31 +0100 Subject: net: phy: make mdio_device.c part of libphy This patch - makes mdio_device.c part of libphy - makes mdio_device_(un)register_reset() static - moves mdiobus_(un)register_device() from mdio_bus.c to mdio_device.c, stops exporting both functions and makes them private to phylib This further decouples the MDIO consumer functionality from libphy. Note: This makes MDIO driver registration part of phylib, therefore adjust Kconfig dependencies where needed. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/c6dbf9b3-3ca0-434b-ad3a-71fe602ab809@gmail.com Signed-off-by: Jakub Kicinski --- drivers/clk/qcom/Kconfig | 2 +- drivers/net/phy/Makefile | 6 +++--- drivers/net/phy/mdio-private.h | 11 ----------- drivers/net/phy/mdio_bus.c | 36 ------------------------------------ drivers/net/phy/mdio_device.c | 39 ++++++++++++++++++++++++++++++++++++--- drivers/net/phy/phylib-internal.h | 4 ++++ drivers/phy/broadcom/Kconfig | 4 ++-- include/linux/mdio.h | 2 -- 8 files changed, 46 insertions(+), 58 deletions(-) delete mode 100644 drivers/net/phy/mdio-private.h (limited to 'include/linux') diff --git a/drivers/clk/qcom/Kconfig b/drivers/clk/qcom/Kconfig index a8a86ea6bb74..a277c434d641 100644 --- a/drivers/clk/qcom/Kconfig +++ b/drivers/clk/qcom/Kconfig @@ -392,7 +392,7 @@ config IPQ_NSSCC_9574 config IPQ_NSSCC_QCA8K tristate "QCA8K(QCA8386 or QCA8084) NSS Clock Controller" - depends on MDIO_BUS + depends on PHYLIB help Support for NSS(Network SubSystem) clock controller on qca8386/qca8084 chip. diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index 3a34917adea7..8d262b4e2be2 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -3,8 +3,8 @@ libphy-y := phy.o phy-c45.o phy-core.o phy_device.o \ linkmode.o phy_link_topology.o \ - phy_caps.o mdio_bus_provider.o phy_port.o -mdio-bus-y += mdio_bus.o mdio_device.o + phy_caps.o mdio_bus_provider.o phy_port.o \ + mdio_device.o ifdef CONFIG_PHYLIB # built-in whenever PHYLIB is built-in or module @@ -15,7 +15,7 @@ libphy-$(CONFIG_SWPHY) += swphy.o libphy-$(CONFIG_LED_TRIGGER_PHY) += phy_led_triggers.o libphy-$(CONFIG_OPEN_ALLIANCE_HELPERS) += open_alliance_helpers.o -obj-$(CONFIG_MDIO_BUS) += mdio-bus.o +obj-$(CONFIG_MDIO_BUS) += mdio_bus.o obj-$(CONFIG_PHYLINK) += phylink.o obj-$(CONFIG_PHYLIB) += libphy.o obj-$(CONFIG_PHYLIB) += mdio_devres.o diff --git a/drivers/net/phy/mdio-private.h b/drivers/net/phy/mdio-private.h deleted file mode 100644 index 8bc6d9088af1..000000000000 --- a/drivers/net/phy/mdio-private.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef __MDIO_PRIVATE_H -#define __MDIO_PRIVATE_H - -/* MDIO internal helpers - */ - -int mdio_device_register_reset(struct mdio_device *mdiodev); -void mdio_device_unregister_reset(struct mdio_device *mdiodev); - -#endif /* __MDIO_PRIVATE_H */ diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 48c0447e6a8f..a30c679feeca 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -29,46 +29,10 @@ #include #include #include -#include "mdio-private.h" #define CREATE_TRACE_POINTS #include -int mdiobus_register_device(struct mdio_device *mdiodev) -{ - int err; - - if (mdiodev->bus->mdio_map[mdiodev->addr]) - return -EBUSY; - - if (mdiodev->flags & MDIO_DEVICE_FLAG_PHY) { - err = mdio_device_register_reset(mdiodev); - if (err) - return err; - - /* Assert the reset signal */ - mdio_device_reset(mdiodev, 1); - } - - mdiodev->bus->mdio_map[mdiodev->addr] = mdiodev; - - return 0; -} -EXPORT_SYMBOL(mdiobus_register_device); - -int mdiobus_unregister_device(struct mdio_device *mdiodev) -{ - if (mdiodev->bus->mdio_map[mdiodev->addr] != mdiodev) - return -EINVAL; - - mdio_device_unregister_reset(mdiodev); - - mdiodev->bus->mdio_map[mdiodev->addr] = NULL; - - return 0; -} -EXPORT_SYMBOL(mdiobus_unregister_device); - static struct mdio_device *mdiobus_find_device(struct mii_bus *bus, int addr) { bool addr_valid = addr >= 0 && addr < ARRAY_SIZE(bus->mdio_map); diff --git a/drivers/net/phy/mdio_device.c b/drivers/net/phy/mdio_device.c index da4fb7484c7c..56080d3d2d25 100644 --- a/drivers/net/phy/mdio_device.c +++ b/drivers/net/phy/mdio_device.c @@ -22,7 +22,7 @@ #include #include #include -#include "mdio-private.h" +#include "phylib-internal.h" /** * mdio_device_register_reset - Read and initialize the reset properties of @@ -31,7 +31,7 @@ * * Return: Zero if successful, negative error code on failure */ -int mdio_device_register_reset(struct mdio_device *mdiodev) +static int mdio_device_register_reset(struct mdio_device *mdiodev) { struct reset_control *reset; @@ -67,7 +67,7 @@ int mdio_device_register_reset(struct mdio_device *mdiodev) * an mdio device * @mdiodev: mdio_device structure */ -void mdio_device_unregister_reset(struct mdio_device *mdiodev) +static void mdio_device_unregister_reset(struct mdio_device *mdiodev) { gpiod_put(mdiodev->reset_gpio); mdiodev->reset_gpio = NULL; @@ -189,6 +189,39 @@ void mdio_device_remove(struct mdio_device *mdiodev) } EXPORT_SYMBOL(mdio_device_remove); +int mdiobus_register_device(struct mdio_device *mdiodev) +{ + int err; + + if (mdiodev->bus->mdio_map[mdiodev->addr]) + return -EBUSY; + + if (mdiodev->flags & MDIO_DEVICE_FLAG_PHY) { + err = mdio_device_register_reset(mdiodev); + if (err) + return err; + + /* Assert the reset signal */ + mdio_device_reset(mdiodev, 1); + } + + mdiodev->bus->mdio_map[mdiodev->addr] = mdiodev; + + return 0; +} + +int mdiobus_unregister_device(struct mdio_device *mdiodev) +{ + if (mdiodev->bus->mdio_map[mdiodev->addr] != mdiodev) + return -EINVAL; + + mdio_device_unregister_reset(mdiodev); + + mdiodev->bus->mdio_map[mdiodev->addr] = NULL; + + return 0; +} + /** * mdio_probe - probe an MDIO device * @dev: device to probe diff --git a/drivers/net/phy/phylib-internal.h b/drivers/net/phy/phylib-internal.h index dc9592c6bb8e..bfb1aa823868 100644 --- a/drivers/net/phy/phylib-internal.h +++ b/drivers/net/phy/phylib-internal.h @@ -6,6 +6,7 @@ #ifndef __PHYLIB_INTERNAL_H #define __PHYLIB_INTERNAL_H +struct mdio_device; struct phy_device; /* @@ -20,6 +21,9 @@ void of_set_phy_timing_role(struct phy_device *phydev); int phy_speed_down_core(struct phy_device *phydev); void phy_check_downshift(struct phy_device *phydev); +int mdiobus_register_device(struct mdio_device *mdiodev); +int mdiobus_unregister_device(struct mdio_device *mdiodev); + int genphy_c45_read_eee_adv(struct phy_device *phydev, unsigned long *adv); #endif /* __PHYLIB_INTERNAL_H */ diff --git a/drivers/phy/broadcom/Kconfig b/drivers/phy/broadcom/Kconfig index 1d89a2fd9b79..46371a8940d7 100644 --- a/drivers/phy/broadcom/Kconfig +++ b/drivers/phy/broadcom/Kconfig @@ -52,7 +52,7 @@ config PHY_BCM_NS_USB3 tristate "Broadcom Northstar USB 3.0 PHY Driver" depends on ARCH_BCM_IPROC || COMPILE_TEST depends on HAS_IOMEM && OF - depends on MDIO_BUS + depends on PHYLIB select GENERIC_PHY help Enable this to support Broadcom USB 3.0 PHY connected to the USB @@ -60,7 +60,7 @@ config PHY_BCM_NS_USB3 config PHY_NS2_PCIE tristate "Broadcom Northstar2 PCIe PHY driver" - depends on (OF && MDIO_BUS_MUX_BCM_IPROC) || (COMPILE_TEST && MDIO_BUS) + depends on (OF && MDIO_BUS_MUX_BCM_IPROC) || (COMPILE_TEST && PHYLIB) select GENERIC_PHY default ARCH_BCM_IPROC help diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 5d1203b9af20..f4f9d9609448 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -688,8 +688,6 @@ static inline int mdiodev_c45_write(struct mdio_device *mdiodev, u32 devad, val); } -int mdiobus_register_device(struct mdio_device *mdiodev); -int mdiobus_unregister_device(struct mdio_device *mdiodev); bool mdiobus_is_registered_device(struct mii_bus *bus, int addr); struct phy_device *mdiobus_get_phy(struct mii_bus *bus, int addr); -- cgit v1.2.3 From c4399af5e55658e832779b256d8458323011f983 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 9 Mar 2026 18:06:00 +0100 Subject: net: phy: move remaining provider code to mdio_bus_provider.c This moves definition of mdio_bus class and bus_type to the provider side, what allows to make them private to libphy. As a prerequisite MDIO statistics handling is moved to the provider side as well. Note: This patch causes a checkpatch error "Macros with complex values should be enclosed in parentheses" for MDIO_BUS_STATS_ADDR_ATTR_GROUP. I consider this a false positive here, in addition the patch just moves existing code. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/47b85676-b349-4aa0-a5ef-cd37769a4c69@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/mdio_bus.c | 282 ------------------------------------ drivers/net/phy/mdio_bus_provider.c | 275 +++++++++++++++++++++++++++++++++++ drivers/net/phy/phylib-internal.h | 3 + include/linux/phy.h | 3 - 4 files changed, 278 insertions(+), 285 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 9fb473326027..00d0e4159e9b 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -10,20 +10,14 @@ #include #include -#include #include -#include #include #include #include #include #include #include -#include -#include -#include #include -#include #include #include #include @@ -64,218 +58,6 @@ bool mdiobus_is_registered_device(struct mii_bus *bus, int addr) } EXPORT_SYMBOL(mdiobus_is_registered_device); -/** - * mdiobus_release - mii_bus device release callback - * @d: the target struct device that contains the mii_bus - * - * Description: called when the last reference to an mii_bus is - * dropped, to free the underlying memory. - */ -static void mdiobus_release(struct device *d) -{ - struct mii_bus *bus = to_mii_bus(d); - - WARN(bus->state != MDIOBUS_RELEASED && - /* for compatibility with error handling in drivers */ - bus->state != MDIOBUS_ALLOCATED, - "%s: not in RELEASED or ALLOCATED state\n", - bus->id); - - if (bus->state == MDIOBUS_RELEASED) - fwnode_handle_put(dev_fwnode(d)); - - kfree(bus); -} - -struct mdio_bus_stat_attr { - struct device_attribute attr; - int address; - unsigned int field_offset; -}; - -static struct mdio_bus_stat_attr *to_sattr(struct device_attribute *attr) -{ - return container_of(attr, struct mdio_bus_stat_attr, attr); -} - -static u64 mdio_bus_get_stat(struct mdio_bus_stats *s, unsigned int offset) -{ - const u64_stats_t *stats = (const void *)s + offset; - unsigned int start; - u64 val = 0; - - do { - start = u64_stats_fetch_begin(&s->syncp); - val = u64_stats_read(stats); - } while (u64_stats_fetch_retry(&s->syncp, start)); - - return val; -} - -static ssize_t mdio_bus_stat_field_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct mdio_bus_stat_attr *sattr = to_sattr(attr); - struct mii_bus *bus = to_mii_bus(dev); - u64 val = 0; - - if (sattr->address < 0) { - /* get global stats */ - for (int i = 0; i < PHY_MAX_ADDR; i++) - val += mdio_bus_get_stat(&bus->stats[i], - sattr->field_offset); - } else { - val = mdio_bus_get_stat(&bus->stats[sattr->address], - sattr->field_offset); - } - - return sysfs_emit(buf, "%llu\n", val); -} - -static ssize_t mdio_bus_device_stat_field_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct mdio_bus_stat_attr *sattr = to_sattr(attr); - struct mdio_device *mdiodev = to_mdio_device(dev); - struct mii_bus *bus = mdiodev->bus; - int addr = mdiodev->addr; - u64 val; - - val = mdio_bus_get_stat(&bus->stats[addr], sattr->field_offset); - - return sysfs_emit(buf, "%llu\n", val); -} - -#define MDIO_BUS_STATS_ATTR(field) \ -static const struct mdio_bus_stat_attr dev_attr_mdio_bus_##field = { \ - .attr = __ATTR(field, 0444, mdio_bus_stat_field_show, NULL), \ - .address = -1, \ - .field_offset = offsetof(struct mdio_bus_stats, field), \ -}; \ -static const struct mdio_bus_stat_attr dev_attr_mdio_bus_device_##field = { \ - .attr = __ATTR(field, 0444, mdio_bus_device_stat_field_show, NULL), \ - .field_offset = offsetof(struct mdio_bus_stats, field), \ -} - -MDIO_BUS_STATS_ATTR(transfers); -MDIO_BUS_STATS_ATTR(errors); -MDIO_BUS_STATS_ATTR(writes); -MDIO_BUS_STATS_ATTR(reads); - -#define MDIO_BUS_STATS_ADDR_ATTR_DECL(field, addr, file) \ -static const struct mdio_bus_stat_attr \ -dev_attr_mdio_bus_addr_##field##_##addr = { \ - .attr = { .attr = { .name = file, .mode = 0444 }, \ - .show = mdio_bus_stat_field_show, \ - }, \ - .address = addr, \ - .field_offset = offsetof(struct mdio_bus_stats, field), \ -} - -#define MDIO_BUS_STATS_ADDR_ATTR(field, addr) \ - MDIO_BUS_STATS_ADDR_ATTR_DECL(field, addr, \ - __stringify(field) "_" __stringify(addr)) - -#define MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(addr) \ - MDIO_BUS_STATS_ADDR_ATTR(transfers, addr); \ - MDIO_BUS_STATS_ADDR_ATTR(errors, addr); \ - MDIO_BUS_STATS_ADDR_ATTR(writes, addr); \ - MDIO_BUS_STATS_ADDR_ATTR(reads, addr) \ - -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(0); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(1); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(2); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(3); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(4); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(5); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(6); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(7); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(8); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(9); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(10); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(11); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(12); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(13); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(14); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(15); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(16); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(17); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(18); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(19); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(20); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(21); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(22); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(23); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(24); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(25); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(26); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(27); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(28); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(29); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(30); -MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(31); - -#define MDIO_BUS_STATS_ADDR_ATTR_GROUP(addr) \ - &dev_attr_mdio_bus_addr_transfers_##addr.attr.attr, \ - &dev_attr_mdio_bus_addr_errors_##addr.attr.attr, \ - &dev_attr_mdio_bus_addr_writes_##addr.attr.attr, \ - &dev_attr_mdio_bus_addr_reads_##addr.attr.attr \ - -static const struct attribute *const mdio_bus_statistics_attrs[] = { - &dev_attr_mdio_bus_transfers.attr.attr, - &dev_attr_mdio_bus_errors.attr.attr, - &dev_attr_mdio_bus_writes.attr.attr, - &dev_attr_mdio_bus_reads.attr.attr, - MDIO_BUS_STATS_ADDR_ATTR_GROUP(0), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(1), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(2), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(3), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(4), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(5), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(6), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(7), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(8), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(9), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(10), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(11), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(12), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(13), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(14), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(15), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(16), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(17), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(18), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(19), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(20), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(21), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(22), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(23), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(24), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(25), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(26), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(27), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(28), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(29), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(30), - MDIO_BUS_STATS_ADDR_ATTR_GROUP(31), - NULL, -}; - -static const struct attribute_group mdio_bus_statistics_group = { - .name = "statistics", - .attrs_const = mdio_bus_statistics_attrs, -}; -__ATTRIBUTE_GROUPS(mdio_bus_statistics); - -const struct class mdio_bus_class = { - .name = "mdio_bus", - .dev_release = mdiobus_release, - .dev_groups = mdio_bus_statistics_groups, -}; -EXPORT_SYMBOL_GPL(mdio_bus_class); - static void mdiobus_stats_acct(struct mdio_bus_stats *stats, bool op, int ret) { preempt_disable(); @@ -841,69 +623,5 @@ int mdiobus_c45_modify_changed(struct mii_bus *bus, int addr, int devad, } EXPORT_SYMBOL_GPL(mdiobus_c45_modify_changed); -/** - * mdio_bus_match - determine if given MDIO driver supports the given - * MDIO device - * @dev: target MDIO device - * @drv: given MDIO driver - * - * Return: 1 if the driver supports the device, 0 otherwise - * - * Description: This may require calling the devices own match function, - * since different classes of MDIO devices have different match criteria. - */ -static int mdio_bus_match(struct device *dev, const struct device_driver *drv) -{ - const struct mdio_driver *mdiodrv = to_mdio_driver(drv); - struct mdio_device *mdio = to_mdio_device(dev); - - /* Both the driver and device must type-match */ - if (!(mdiodrv->mdiodrv.flags & MDIO_DEVICE_IS_PHY) != - !(mdio->flags & MDIO_DEVICE_FLAG_PHY)) - return 0; - - if (of_driver_match_device(dev, drv)) - return 1; - - if (mdio->bus_match) - return mdio->bus_match(dev, drv); - - return 0; -} - -static int mdio_uevent(const struct device *dev, struct kobj_uevent_env *env) -{ - int rc; - - /* Some devices have extra OF data and an OF-style MODALIAS */ - rc = of_device_uevent_modalias(dev, env); - if (rc != -ENODEV) - return rc; - - return 0; -} - -static const struct attribute *const mdio_bus_device_statistics_attrs[] = { - &dev_attr_mdio_bus_device_transfers.attr.attr, - &dev_attr_mdio_bus_device_errors.attr.attr, - &dev_attr_mdio_bus_device_writes.attr.attr, - &dev_attr_mdio_bus_device_reads.attr.attr, - NULL, -}; - -static const struct attribute_group mdio_bus_device_statistics_group = { - .name = "statistics", - .attrs_const = mdio_bus_device_statistics_attrs, -}; -__ATTRIBUTE_GROUPS(mdio_bus_device_statistics); - -const struct bus_type mdio_bus_type = { - .name = "mdio_bus", - .dev_groups = mdio_bus_device_statistics_groups, - .match = mdio_bus_match, - .uevent = mdio_uevent, -}; -EXPORT_SYMBOL(mdio_bus_type); - MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MDIO bus/device layer"); diff --git a/drivers/net/phy/mdio_bus_provider.c b/drivers/net/phy/mdio_bus_provider.c index d50fe6eb4b02..041576eba47a 100644 --- a/drivers/net/phy/mdio_bus_provider.c +++ b/drivers/net/phy/mdio_bus_provider.c @@ -28,6 +28,281 @@ #include #include #include +#include "phylib-internal.h" + +/** + * mdiobus_release - mii_bus device release callback + * @d: the target struct device that contains the mii_bus + * + * Description: called when the last reference to an mii_bus is + * dropped, to free the underlying memory. + */ +static void mdiobus_release(struct device *d) +{ + struct mii_bus *bus = to_mii_bus(d); + + WARN(bus->state != MDIOBUS_RELEASED && + /* for compatibility with error handling in drivers */ + bus->state != MDIOBUS_ALLOCATED, + "%s: not in RELEASED or ALLOCATED state\n", + bus->id); + + if (bus->state == MDIOBUS_RELEASED) + fwnode_handle_put(dev_fwnode(d)); + + kfree(bus); +} + +struct mdio_bus_stat_attr { + struct device_attribute attr; + int address; + unsigned int field_offset; +}; + +static struct mdio_bus_stat_attr *to_sattr(struct device_attribute *attr) +{ + return container_of(attr, struct mdio_bus_stat_attr, attr); +} + +static u64 mdio_bus_get_stat(struct mdio_bus_stats *s, unsigned int offset) +{ + const u64_stats_t *stats = (const void *)s + offset; + unsigned int start; + u64 val = 0; + + do { + start = u64_stats_fetch_begin(&s->syncp); + val = u64_stats_read(stats); + } while (u64_stats_fetch_retry(&s->syncp, start)); + + return val; +} + +static ssize_t mdio_bus_stat_field_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mdio_bus_stat_attr *sattr = to_sattr(attr); + struct mii_bus *bus = to_mii_bus(dev); + u64 val = 0; + + if (sattr->address < 0) { + /* get global stats */ + for (int i = 0; i < PHY_MAX_ADDR; i++) + val += mdio_bus_get_stat(&bus->stats[i], + sattr->field_offset); + } else { + val = mdio_bus_get_stat(&bus->stats[sattr->address], + sattr->field_offset); + } + + return sysfs_emit(buf, "%llu\n", val); +} + +static ssize_t mdio_bus_device_stat_field_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mdio_bus_stat_attr *sattr = to_sattr(attr); + struct mdio_device *mdiodev = to_mdio_device(dev); + struct mii_bus *bus = mdiodev->bus; + int addr = mdiodev->addr; + u64 val; + + val = mdio_bus_get_stat(&bus->stats[addr], sattr->field_offset); + + return sysfs_emit(buf, "%llu\n", val); +} + +#define MDIO_BUS_STATS_ATTR(field) \ +static const struct mdio_bus_stat_attr dev_attr_mdio_bus_##field = { \ + .attr = __ATTR(field, 0444, mdio_bus_stat_field_show, NULL), \ + .address = -1, \ + .field_offset = offsetof(struct mdio_bus_stats, field), \ +}; \ +static const struct mdio_bus_stat_attr dev_attr_mdio_bus_device_##field = { \ + .attr = __ATTR(field, 0444, mdio_bus_device_stat_field_show, NULL), \ + .field_offset = offsetof(struct mdio_bus_stats, field), \ +} + +MDIO_BUS_STATS_ATTR(transfers); +MDIO_BUS_STATS_ATTR(errors); +MDIO_BUS_STATS_ATTR(writes); +MDIO_BUS_STATS_ATTR(reads); + +#define MDIO_BUS_STATS_ADDR_ATTR_DECL(field, addr, file) \ +static const struct mdio_bus_stat_attr \ +dev_attr_mdio_bus_addr_##field##_##addr = { \ + .attr = { .attr = { .name = file, .mode = 0444 }, \ + .show = mdio_bus_stat_field_show, \ + }, \ + .address = addr, \ + .field_offset = offsetof(struct mdio_bus_stats, field), \ +} + +#define MDIO_BUS_STATS_ADDR_ATTR(field, addr) \ + MDIO_BUS_STATS_ADDR_ATTR_DECL(field, addr, \ + __stringify(field) "_" __stringify(addr)) + +#define MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(addr) \ + MDIO_BUS_STATS_ADDR_ATTR(transfers, addr); \ + MDIO_BUS_STATS_ADDR_ATTR(errors, addr); \ + MDIO_BUS_STATS_ADDR_ATTR(writes, addr); \ + MDIO_BUS_STATS_ADDR_ATTR(reads, addr) \ + +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(0); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(1); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(2); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(3); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(4); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(5); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(6); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(7); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(8); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(9); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(10); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(11); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(12); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(13); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(14); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(15); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(16); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(17); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(18); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(19); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(20); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(21); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(22); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(23); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(24); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(25); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(26); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(27); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(28); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(29); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(30); +MDIO_BUS_STATS_ADDR_ATTR_GROUP_DECL(31); + +#define MDIO_BUS_STATS_ADDR_ATTR_GROUP(addr) \ + &(dev_attr_mdio_bus_addr_transfers_##addr).attr.attr, \ + &(dev_attr_mdio_bus_addr_errors_##addr).attr.attr, \ + &(dev_attr_mdio_bus_addr_writes_##addr).attr.attr, \ + &(dev_attr_mdio_bus_addr_reads_##addr).attr.attr \ + +static const struct attribute *const mdio_bus_statistics_attrs[] = { + &dev_attr_mdio_bus_transfers.attr.attr, + &dev_attr_mdio_bus_errors.attr.attr, + &dev_attr_mdio_bus_writes.attr.attr, + &dev_attr_mdio_bus_reads.attr.attr, + MDIO_BUS_STATS_ADDR_ATTR_GROUP(0), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(1), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(2), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(3), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(4), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(5), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(6), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(7), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(8), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(9), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(10), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(11), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(12), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(13), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(14), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(15), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(16), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(17), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(18), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(19), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(20), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(21), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(22), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(23), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(24), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(25), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(26), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(27), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(28), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(29), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(30), + MDIO_BUS_STATS_ADDR_ATTR_GROUP(31), + NULL, +}; + +static const struct attribute_group mdio_bus_statistics_group = { + .name = "statistics", + .attrs_const = mdio_bus_statistics_attrs, +}; +__ATTRIBUTE_GROUPS(mdio_bus_statistics); + +const struct class mdio_bus_class = { + .name = "mdio_bus", + .dev_release = mdiobus_release, + .dev_groups = mdio_bus_statistics_groups, +}; + +/** + * mdio_bus_match - determine if given MDIO driver supports the given + * MDIO device + * @dev: target MDIO device + * @drv: given MDIO driver + * + * Return: 1 if the driver supports the device, 0 otherwise + * + * Description: This may require calling the devices own match function, + * since different classes of MDIO devices have different match criteria. + */ +static int mdio_bus_match(struct device *dev, const struct device_driver *drv) +{ + const struct mdio_driver *mdiodrv = to_mdio_driver(drv); + struct mdio_device *mdio = to_mdio_device(dev); + + /* Both the driver and device must type-match */ + if (!(mdiodrv->mdiodrv.flags & MDIO_DEVICE_IS_PHY) != + !(mdio->flags & MDIO_DEVICE_FLAG_PHY)) + return 0; + + if (of_driver_match_device(dev, drv)) + return 1; + + if (mdio->bus_match) + return mdio->bus_match(dev, drv); + + return 0; +} + +static int mdio_uevent(const struct device *dev, struct kobj_uevent_env *env) +{ + int rc; + + /* Some devices have extra OF data and an OF-style MODALIAS */ + rc = of_device_uevent_modalias(dev, env); + if (rc != -ENODEV) + return rc; + + return 0; +} + +static const struct attribute *const mdio_bus_device_statistics_attrs[] = { + &dev_attr_mdio_bus_device_transfers.attr.attr, + &dev_attr_mdio_bus_device_errors.attr.attr, + &dev_attr_mdio_bus_device_writes.attr.attr, + &dev_attr_mdio_bus_device_reads.attr.attr, + NULL, +}; + +static const struct attribute_group mdio_bus_device_statistics_group = { + .name = "statistics", + .attrs_const = mdio_bus_device_statistics_attrs, +}; +__ATTRIBUTE_GROUPS(mdio_bus_device_statistics); + +const struct bus_type mdio_bus_type = { + .name = "mdio_bus", + .dev_groups = mdio_bus_device_statistics_groups, + .match = mdio_bus_match, + .uevent = mdio_uevent, +}; /** * mdiobus_alloc_size - allocate a mii_bus structure diff --git a/drivers/net/phy/phylib-internal.h b/drivers/net/phy/phylib-internal.h index bfb1aa823868..664ed7faa518 100644 --- a/drivers/net/phy/phylib-internal.h +++ b/drivers/net/phy/phylib-internal.h @@ -9,6 +9,9 @@ struct mdio_device; struct phy_device; +extern const struct bus_type mdio_bus_type; +extern const struct class mdio_bus_class; + /* * phy_supported_speeds - return all speeds currently supported by a PHY device */ diff --git a/include/linux/phy.h b/include/linux/phy.h index e9b0d7427b0e..5de4b172cd0b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2446,9 +2446,6 @@ int __phy_hwtstamp_set(struct phy_device *phydev, struct phy_port *phy_get_sfp_port(struct phy_device *phydev); -extern const struct bus_type mdio_bus_type; -extern const struct class mdio_bus_class; - /** * phy_module_driver() - Helper macro for registering PHY drivers * @__phy_drivers: array of PHY drivers to register -- cgit v1.2.3 From 2a8c8a03f306e21a0ea74c93d4332119557f4575 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 12 Mar 2026 11:04:07 +0100 Subject: net/mlx5: Add a shared devlink instance for PFs on same chip Use the previously introduced shared devlink infrastructure to create a shared devlink instance for mlx5 PFs that reside on the same physical chip. The shared instance is identified by the chip's serial number extracted from PCI VPD (V3 keyword, with fallback to serial number for older devices). Each PF that probes calls mlx5_shd_init() which extracts the chip serial number and uses devlink_shd_get() to get or create the shared instance. When a PF is removed, mlx5_shd_uninit() calls devlink_shd_put() to release the reference. The shared instance is automatically destroyed when the last PF is removed. Make the PF devlink instances nested in this shared devlink instance, allowing userspace to identify which PFs belong to the same physical chip. Example: pci/0000:08:00.0: index 0 nested_devlink: auxiliary/mlx5_core.eth.0 devlink_index/1: index 1 nested_devlink: pci/0000:08:00.0 pci/0000:08:00.1 auxiliary/mlx5_core.eth.0: index 2 pci/0000:08:00.1: index 3 nested_devlink: auxiliary/mlx5_core.eth.1 auxiliary/mlx5_core.eth.1: index 4 Signed-off-by: Jiri Pirko Link: https://patch.msgid.link/20260312100407.551173-14-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 5 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 17 ++++++ .../net/ethernet/mellanox/mlx5/core/sh_devlink.c | 61 ++++++++++++++++++++++ .../net/ethernet/mellanox/mlx5/core/sh_devlink.h | 12 +++++ include/linux/mlx5/driver.h | 1 + 5 files changed, 94 insertions(+), 2 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 8ffa286a18f5..d39fe9c4a87c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -16,8 +16,9 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ transobj.o vport.o sriov.o fs_cmd.o fs_core.o pci_irq.o \ fs_counters.o fs_ft_pool.o rl.o lag/debugfs.o lag/lag.o dev.o events.o wq.o lib/gid.o \ lib/devcom.o lib/pci_vsc.o lib/dm.o lib/fs_ttc.o diag/fs_tracepoint.o \ - diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o diag/reporter_vnic.o \ - fw_reset.o qos.o lib/tout.o lib/aso.o wc.o fs_pool.o lib/nv_param.o + diag/fw_tracer.o diag/crdump.o devlink.o sh_devlink.o diag/rsc_dump.o \ + diag/reporter_vnic.o fw_reset.o qos.o lib/tout.o lib/aso.o wc.o fs_pool.o \ + lib/nv_param.o # # Netdev basic diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index fdc3ba20912e..1c35c3fc3bb3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -74,6 +74,7 @@ #include "mlx5_irq.h" #include "hwmon.h" #include "lag/lag.h" +#include "sh_devlink.h" MODULE_AUTHOR("Eli Cohen "); MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) core driver"); @@ -1520,10 +1521,16 @@ int mlx5_init_one(struct mlx5_core_dev *dev) int err; devl_lock(devlink); + if (dev->shd) { + err = devl_nested_devlink_set(dev->shd, devlink); + if (err) + goto unlock; + } devl_register(devlink); err = mlx5_init_one_devl_locked(dev); if (err) devl_unregister(devlink); +unlock: devl_unlock(devlink); return err; } @@ -2005,6 +2012,13 @@ static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id) goto pci_init_err; } + err = mlx5_shd_init(dev); + if (err) { + mlx5_core_err(dev, "mlx5_shd_init failed with error code %d\n", + err); + goto shd_init_err; + } + err = mlx5_init_one(dev); if (err) { mlx5_core_err(dev, "mlx5_init_one failed with error code %d\n", @@ -2018,6 +2032,8 @@ static int probe_one(struct pci_dev *pdev, const struct pci_device_id *id) return 0; err_init_one: + mlx5_shd_uninit(dev); +shd_init_err: mlx5_pci_close(dev); pci_init_err: mlx5_mdev_uninit(dev); @@ -2039,6 +2055,7 @@ static void remove_one(struct pci_dev *pdev) mlx5_drain_health_wq(dev); mlx5_sriov_disable(pdev, false); mlx5_uninit_one(dev); + mlx5_shd_uninit(dev); mlx5_pci_close(dev); mlx5_mdev_uninit(dev); mlx5_adev_idx_free(dev->priv.adev_idx); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c new file mode 100644 index 000000000000..bc33f95302df --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include +#include + +#include "sh_devlink.h" + +static const struct devlink_ops mlx5_shd_ops = { +}; + +int mlx5_shd_init(struct mlx5_core_dev *dev) +{ + u8 *vpd_data __free(kfree) = NULL; + struct pci_dev *pdev = dev->pdev; + unsigned int vpd_size, kw_len; + struct devlink *devlink; + char *sn, *end; + int start; + int err; + + if (!mlx5_core_is_pf(dev)) + return 0; + + vpd_data = pci_vpd_alloc(pdev, &vpd_size); + if (IS_ERR(vpd_data)) { + err = PTR_ERR(vpd_data); + return err == -ENODEV ? 0 : err; + } + start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, "V3", &kw_len); + if (start < 0) { + /* Fall-back to SN for older devices. */ + start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, + PCI_VPD_RO_KEYWORD_SERIALNO, &kw_len); + if (start < 0) + return -ENOENT; + } + sn = kstrndup(vpd_data + start, kw_len, GFP_KERNEL); + if (!sn) + return -ENOMEM; + /* Firmware may return spaces at the end of the string, strip it. */ + end = strchrnul(sn, ' '); + *end = '\0'; + + /* Get or create shared devlink instance */ + devlink = devlink_shd_get(sn, &mlx5_shd_ops, 0, pdev->dev.driver); + kfree(sn); + if (!devlink) + return -ENOMEM; + + dev->shd = devlink; + return 0; +} + +void mlx5_shd_uninit(struct mlx5_core_dev *dev) +{ + if (!dev->shd) + return; + + devlink_shd_put(dev->shd); +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h new file mode 100644 index 000000000000..8ab8d6940227 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/sh_devlink.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#ifndef __MLX5_SH_DEVLINK_H__ +#define __MLX5_SH_DEVLINK_H__ + +#include + +int mlx5_shd_init(struct mlx5_core_dev *dev); +void mlx5_shd_uninit(struct mlx5_core_dev *dev); + +#endif /* __MLX5_SH_DEVLINK_H__ */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 04dcd09f7517..1268fcf35ec7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -798,6 +798,7 @@ struct mlx5_core_dev { enum mlx5_wc_state wc_state; /* sync write combining state */ struct mutex wc_state_lock; + struct devlink *shd; }; struct mlx5_db { -- cgit v1.2.3 From 0834d6f4abd0ca35b5706d267a6e4b78303a95de Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 12 Mar 2026 14:02:29 +0100 Subject: PCI: endpoint: Do not mark the BAR succeeding a 64-bit BAR as BAR_RESERVED A BAR that can only be configured as a 64-bit BAR by an EPC driver is marked as such using the "only_64bit" flag. Currently, the documentation says that an EPC driver should explicitly mark the BAR succeeding an "only_64bit" BAR as BAR_RESERVED. However, a 64-bit BAR will always take up two BARs. It is thus redundant to mark both BARs. pci_epc_get_next_free_bar() already skips the BAR succeeding a "only_64bit" BAR, regardless if the succeeding BAR is marked as BAR_RESERVED or not. Thus, drop the BAR_RESERVED for a BAR succeeding a "only_64bit" BAR. No functional changes. Suggested-by: Manivannan Sadhasivam Signed-off-by: Niklas Cassel Signed-off-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20260312130229.2282001-13-cassel@kernel.org --- drivers/pci/controller/dwc/pci-layerscape-ep.c | 2 -- drivers/pci/controller/dwc/pcie-keembay.c | 3 --- drivers/pci/controller/dwc/pcie-qcom-ep.c | 2 -- drivers/pci/controller/dwc/pcie-tegra194.c | 1 - drivers/pci/controller/dwc/pcie-uniphier-ep.c | 5 ----- drivers/pci/controller/pcie-rcar-ep.c | 3 --- include/linux/pci-epc.h | 3 +-- 7 files changed, 1 insertion(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c b/drivers/pci/controller/dwc/pci-layerscape-ep.c index a4a800699f89..79d226e0cc80 100644 --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c @@ -251,9 +251,7 @@ static int __init ls_pcie_ep_probe(struct platform_device *pdev) pci->ops = pcie->drvdata->dw_pcie_ops; ls_epc->bar[BAR_2].only_64bit = true; - ls_epc->bar[BAR_3].type = BAR_RESERVED; ls_epc->bar[BAR_4].only_64bit = true; - ls_epc->bar[BAR_5].type = BAR_RESERVED; ls_epc->linkup_notifier = true; pcie->pci = pci; diff --git a/drivers/pci/controller/dwc/pcie-keembay.c b/drivers/pci/controller/dwc/pcie-keembay.c index 2666a9c3d67e..7cf2c312ecec 100644 --- a/drivers/pci/controller/dwc/pcie-keembay.c +++ b/drivers/pci/controller/dwc/pcie-keembay.c @@ -313,11 +313,8 @@ static const struct pci_epc_features keembay_pcie_epc_features = { .msi_capable = true, .msix_capable = true, .bar[BAR_0] = { .only_64bit = true, }, - .bar[BAR_1] = { .type = BAR_RESERVED, }, .bar[BAR_2] = { .only_64bit = true, }, - .bar[BAR_3] = { .type = BAR_RESERVED, }, .bar[BAR_4] = { .only_64bit = true, }, - .bar[BAR_5] = { .type = BAR_RESERVED, }, .align = SZ_16K, }; diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c index 18460f01b2c6..ffb4409c0468 100644 --- a/drivers/pci/controller/dwc/pcie-qcom-ep.c +++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c @@ -850,9 +850,7 @@ static const struct pci_epc_features qcom_pcie_epc_features = { .msi_capable = true, .align = SZ_4K, .bar[BAR_0] = { .only_64bit = true, }, - .bar[BAR_1] = { .type = BAR_RESERVED, }, .bar[BAR_2] = { .only_64bit = true, }, - .bar[BAR_3] = { .type = BAR_RESERVED, }, }; static const struct pci_epc_features * diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c index 06571d806ab3..f1f70fb824b2 100644 --- a/drivers/pci/controller/dwc/pcie-tegra194.c +++ b/drivers/pci/controller/dwc/pcie-tegra194.c @@ -1993,7 +1993,6 @@ static const struct pci_epc_features tegra_pcie_epc_features = { .msi_capable = true, .bar[BAR_0] = { .type = BAR_FIXED, .fixed_size = SZ_1M, .only_64bit = true, }, - .bar[BAR_1] = { .type = BAR_RESERVED, }, .bar[BAR_2] = { .type = BAR_RESERVED, }, .bar[BAR_3] = { .type = BAR_RESERVED, }, .bar[BAR_4] = { .type = BAR_RESERVED, }, diff --git a/drivers/pci/controller/dwc/pcie-uniphier-ep.c b/drivers/pci/controller/dwc/pcie-uniphier-ep.c index d52753060970..b7020131f626 100644 --- a/drivers/pci/controller/dwc/pcie-uniphier-ep.c +++ b/drivers/pci/controller/dwc/pcie-uniphier-ep.c @@ -426,9 +426,7 @@ static const struct uniphier_pcie_ep_soc_data uniphier_pro5_data = { .msix_capable = false, .align = 1 << 16, .bar[BAR_0] = { .only_64bit = true, }, - .bar[BAR_1] = { .type = BAR_RESERVED, }, .bar[BAR_2] = { .only_64bit = true, }, - .bar[BAR_3] = { .type = BAR_RESERVED, }, .bar[BAR_4] = { .type = BAR_RESERVED, }, .bar[BAR_5] = { .type = BAR_RESERVED, }, }, @@ -445,11 +443,8 @@ static const struct uniphier_pcie_ep_soc_data uniphier_nx1_data = { .msix_capable = false, .align = 1 << 12, .bar[BAR_0] = { .only_64bit = true, }, - .bar[BAR_1] = { .type = BAR_RESERVED, }, .bar[BAR_2] = { .only_64bit = true, }, - .bar[BAR_3] = { .type = BAR_RESERVED, }, .bar[BAR_4] = { .only_64bit = true, }, - .bar[BAR_5] = { .type = BAR_RESERVED, }, }, }; diff --git a/drivers/pci/controller/pcie-rcar-ep.c b/drivers/pci/controller/pcie-rcar-ep.c index 657875ef4657..c2da8ac1f2e8 100644 --- a/drivers/pci/controller/pcie-rcar-ep.c +++ b/drivers/pci/controller/pcie-rcar-ep.c @@ -440,13 +440,10 @@ static const struct pci_epc_features rcar_pcie_epc_features = { /* use 64-bit BARs so mark BAR[1,3,5] as reserved */ .bar[BAR_0] = { .type = BAR_FIXED, .fixed_size = 128, .only_64bit = true, }, - .bar[BAR_1] = { .type = BAR_RESERVED, }, .bar[BAR_2] = { .type = BAR_FIXED, .fixed_size = 256, .only_64bit = true, }, - .bar[BAR_3] = { .type = BAR_RESERVED, }, .bar[BAR_4] = { .type = BAR_FIXED, .fixed_size = 256, .only_64bit = true, }, - .bar[BAR_5] = { .type = BAR_RESERVED, }, }; static const struct pci_epc_features* diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index c021c7af175f..c981ea7d52c0 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -206,8 +206,7 @@ enum pci_epc_bar_type { * @fixed_size: the fixed size, only applicable if type is BAR_FIXED_MASK. * @only_64bit: if true, an EPF driver is not allowed to choose if this BAR * should be configured as 32-bit or 64-bit, the EPF driver must - * configure this BAR as 64-bit. Additionally, the BAR succeeding - * this BAR must be set to type BAR_RESERVED. + * configure this BAR as 64-bit. * * only_64bit should not be set on a BAR of type BAR_RESERVED. * (If BARx is a 64-bit BAR that an EPF driver is not allowed to -- cgit v1.2.3 From 27ce1d8ecb9b9ae025b9e9e199845624bc950998 Mon Sep 17 00:00:00 2001 From: Manikanta Maddireddy Date: Thu, 12 Mar 2026 14:02:30 +0100 Subject: PCI: endpoint: Allow only_64bit on BAR_RESERVED Remove the documentation that forbids setting only_64bit on a BAR of type BAR_RESERVED. When a reserved BAR is 64-bit by default, setting only_64bit is the most accurate description. If we later add support to disable a reserved BAR (e.g. disable_bar() for BARs that were never set via set_bar()), the implementation will need to clear the adjacent BAR (upper 32 bits) as well; having only_64bit set documents that requirement. Signed-off-by: Manikanta Maddireddy Signed-off-by: Niklas Cassel Signed-off-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20260312130229.2282001-14-cassel@kernel.org --- include/linux/pci-epc.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index c981ea7d52c0..5c59f5606869 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -207,11 +207,6 @@ enum pci_epc_bar_type { * @only_64bit: if true, an EPF driver is not allowed to choose if this BAR * should be configured as 32-bit or 64-bit, the EPF driver must * configure this BAR as 64-bit. - * - * only_64bit should not be set on a BAR of type BAR_RESERVED. - * (If BARx is a 64-bit BAR that an EPF driver is not allowed to - * touch, then both BARx and BARx+1 must be set to type - * BAR_RESERVED.) */ struct pci_epc_bar_desc { enum pci_epc_bar_type type; -- cgit v1.2.3 From f51644eb40a73677fcd0c92d8174eddde5d0be0e Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Thu, 12 Mar 2026 14:02:31 +0100 Subject: PCI: endpoint: Describe reserved subregions within BARs Some endpoint controllers expose platform-owned, fixed register windows within a BAR that EPF drivers must not reprogram (e.g. a BAR marked BAR_RESERVED). Even in that case, EPF drivers may need to reference a well-defined subset of that BAR, e.g. to reuse an integrated DMA controller MMIO window as a doorbell target. Introduce struct pci_epc_bar_rsvd_region and extend struct pci_epc_bar_desc so EPC drivers can advertise such fixed subregions in a controller-agnostic way. No functional change for existing users. Signed-off-by: Koichiro Den Signed-off-by: Niklas Cassel Signed-off-by: Manivannan Sadhasivam Tested-by: Manikanta Maddireddy Tested-by: Koichiro Den Reviewed-by: Frank Li Link: https://patch.msgid.link/20260312130229.2282001-15-cassel@kernel.org --- include/linux/pci-epc.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index 5c59f5606869..ebcdf70aa9b9 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -200,6 +200,30 @@ enum pci_epc_bar_type { BAR_RESERVED, }; +/** + * enum pci_epc_bar_rsvd_region_type - type of a fixed subregion behind a BAR + * @PCI_EPC_BAR_RSVD_DMA_CTRL_MMIO: Integrated DMA controller MMIO window + * + * BARs marked BAR_RESERVED are owned by the SoC/EPC hardware and must not be + * reprogrammed by EPF drivers. Some of them still expose fixed subregions that + * EPFs may want to reference (e.g. embedded doorbell fallback). + */ +enum pci_epc_bar_rsvd_region_type { + PCI_EPC_BAR_RSVD_DMA_CTRL_MMIO = 0, +}; + +/** + * struct pci_epc_bar_rsvd_region - fixed subregion behind a BAR + * @type: reserved region type + * @offset: offset within the BAR aperture + * @size: size of the reserved region + */ +struct pci_epc_bar_rsvd_region { + enum pci_epc_bar_rsvd_region_type type; + resource_size_t offset; + resource_size_t size; +}; + /** * struct pci_epc_bar_desc - hardware description for a BAR * @type: the type of the BAR @@ -207,11 +231,15 @@ enum pci_epc_bar_type { * @only_64bit: if true, an EPF driver is not allowed to choose if this BAR * should be configured as 32-bit or 64-bit, the EPF driver must * configure this BAR as 64-bit. + * @nr_rsvd_regions: number of fixed subregions described for BAR_RESERVED + * @rsvd_regions: fixed subregions behind BAR_RESERVED */ struct pci_epc_bar_desc { enum pci_epc_bar_type type; u64 fixed_size; bool only_64bit; + u8 nr_rsvd_regions; + const struct pci_epc_bar_rsvd_region *rsvd_regions; }; /** -- cgit v1.2.3 From 33642e9e36dc084e4fc9245a266c9843bc8303b9 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Thu, 12 Mar 2026 14:02:33 +0100 Subject: PCI: endpoint: Introduce pci_epc_bar_type BAR_DISABLED Add a pci_epc_bar_type BAR_DISABLED to more clearly differentiate from BAR_RESERVED. This BAR type will only be used to describe a BAR that the EPC driver should disable, and will thus never be available to an EPF driver. (Unlike BAR_RESERVED, which will never be disabled by default by an EPC driver.) Co-developed-by: Manikanta Maddireddy Signed-off-by: Manikanta Maddireddy Signed-off-by: Niklas Cassel Signed-off-by: Manivannan Sadhasivam Tested-by: Koichiro Den Tested-by: Manikanta Maddireddy Reviewed-by: Frank Li Reviewed-by: Manikanta Maddireddy Link: https://patch.msgid.link/20260312130229.2282001-17-cassel@kernel.org --- drivers/pci/endpoint/pci-epc-core.c | 5 +++-- include/linux/pci-epc.h | 10 +++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c index e546b3dbb240..6c3c58185fc5 100644 --- a/drivers/pci/endpoint/pci-epc-core.c +++ b/drivers/pci/endpoint/pci-epc-core.c @@ -103,8 +103,9 @@ enum pci_barno pci_epc_get_next_free_bar(const struct pci_epc_features bar++; for (i = bar; i < PCI_STD_NUM_BARS; i++) { - /* If the BAR is not reserved, return it. */ - if (epc_features->bar[i].type != BAR_RESERVED) + /* If the BAR is not reserved or disabled, return it. */ + if (epc_features->bar[i].type != BAR_RESERVED && + epc_features->bar[i].type != BAR_DISABLED) return i; } diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h index ebcdf70aa9b9..334c2b7578d0 100644 --- a/include/linux/pci-epc.h +++ b/include/linux/pci-epc.h @@ -191,13 +191,21 @@ struct pci_epc { * @BAR_RESIZABLE: The BAR implements the PCI-SIG Resizable BAR Capability. * NOTE: An EPC driver can currently only set a single supported * size. - * @BAR_RESERVED: The BAR should not be touched by an EPF driver. + * @BAR_RESERVED: Used for HW-backed BARs (e.g. MSI-X table, DMA regs). The BAR + * should not be disabled by an EPC driver. The BAR should not be + * reprogrammed by an EPF driver. An EPF driver is allowed to + * disable the BAR if absolutely necessary. (However, right now + * there is no EPC operation to disable a BAR that has not been + * programmed using pci_epc_set_bar().) + * @BAR_DISABLED: The BAR should be disabled by an EPC driver. The BAR will be + * unavailable to an EPF driver. */ enum pci_epc_bar_type { BAR_PROGRAMMABLE = 0, BAR_FIXED, BAR_RESIZABLE, BAR_RESERVED, + BAR_DISABLED, }; /** -- cgit v1.2.3 From 45c2a55d13c698ba6a281315676934c44225b034 Mon Sep 17 00:00:00 2001 From: Unnathi Chalicheemala Date: Thu, 5 Mar 2026 19:12:05 -0800 Subject: soc: qcom: llcc: Add per-slice counter and common llcc slice descriptor Fix incorrect slice activation/deactivation accounting by replacing the bitmap-based activation tracking with per-slice atomic reference counters. This resolves mismatches that occur when multiple client drivers vote for the same slice or when llcc_slice_getd() is called multiple times. As part of this fix, simplify slice descriptor handling by eliminating dynamic allocation. llcc_slice_getd() now returns a pointer to a preallocated descriptor, removing the need for repeated allocation/free cycles and ensuring consistent reference tracking across all users. Signed-off-by: Unnathi Chalicheemala Signed-off-by: Francisco Munoz Ruiz Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20260305-external_llcc_changes1set-v1-1-6347e52e648e@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/llcc-qcom.c | 57 +++++++++++++++++++------------------- include/linux/soc/qcom/llcc-qcom.h | 8 +++--- 2 files changed, 32 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/soc/qcom/llcc-qcom.c b/drivers/soc/qcom/llcc-qcom.c index e221e3c4982b..8fdca7658393 100644 --- a/drivers/soc/qcom/llcc-qcom.c +++ b/drivers/soc/qcom/llcc-qcom.c @@ -5,7 +5,6 @@ */ #include -#include #include #include #include @@ -4535,8 +4534,7 @@ static struct llcc_drv_data *drv_data = (void *) -EPROBE_DEFER; struct llcc_slice_desc *llcc_slice_getd(u32 uid) { const struct llcc_slice_config *cfg; - struct llcc_slice_desc *desc; - u32 sz, count; + u32 sz, i; if (IS_ERR(drv_data)) return ERR_CAST(drv_data); @@ -4544,21 +4542,14 @@ struct llcc_slice_desc *llcc_slice_getd(u32 uid) cfg = drv_data->cfg; sz = drv_data->cfg_size; - for (count = 0; cfg && count < sz; count++, cfg++) + for (i = 0; cfg && i < sz; i++, cfg++) if (cfg->usecase_id == uid) break; - if (count == sz || !cfg) + if (i == sz) return ERR_PTR(-ENODEV); - desc = kzalloc_obj(*desc); - if (!desc) - return ERR_PTR(-ENOMEM); - - desc->slice_id = cfg->slice_id; - desc->slice_size = cfg->max_cap; - - return desc; + return &drv_data->desc[i]; } EXPORT_SYMBOL_GPL(llcc_slice_getd); @@ -4569,7 +4560,7 @@ EXPORT_SYMBOL_GPL(llcc_slice_getd); void llcc_slice_putd(struct llcc_slice_desc *desc) { if (!IS_ERR_OR_NULL(desc)) - kfree(desc); + return; } EXPORT_SYMBOL_GPL(llcc_slice_putd); @@ -4645,7 +4636,8 @@ int llcc_slice_activate(struct llcc_slice_desc *desc) return -EINVAL; mutex_lock(&drv_data->lock); - if (test_bit(desc->slice_id, drv_data->bitmap)) { + /* Already active; try to take another reference. */ + if (refcount_inc_not_zero(&desc->refcount)) { mutex_unlock(&drv_data->lock); return 0; } @@ -4659,7 +4651,8 @@ int llcc_slice_activate(struct llcc_slice_desc *desc) return ret; } - __set_bit(desc->slice_id, drv_data->bitmap); + /* Set first reference */ + refcount_set(&desc->refcount, 1); mutex_unlock(&drv_data->lock); return ret; @@ -4685,10 +4678,12 @@ int llcc_slice_deactivate(struct llcc_slice_desc *desc) return -EINVAL; mutex_lock(&drv_data->lock); - if (!test_bit(desc->slice_id, drv_data->bitmap)) { + /* refcount > 1, drop one ref and we’re done. */ + if (refcount_dec_not_one(&desc->refcount)) { mutex_unlock(&drv_data->lock); return 0; } + act_ctrl_val = ACT_CTRL_OPCODE_DEACTIVATE << ACT_CTRL_OPCODE_SHIFT; ret = llcc_update_act_ctrl(desc->slice_id, act_ctrl_val, @@ -4698,7 +4693,8 @@ int llcc_slice_deactivate(struct llcc_slice_desc *desc) return ret; } - __clear_bit(desc->slice_id, drv_data->bitmap); + /* Finalize: atomically transition 1 -> 0 */ + WARN_ON_ONCE(!refcount_dec_if_one(&desc->refcount)); mutex_unlock(&drv_data->lock); return ret; @@ -4742,7 +4738,7 @@ static int _qcom_llcc_cfg_program(const struct llcc_slice_config *config, u32 attr1_val; u32 attr0_val; u32 max_cap_cacheline; - struct llcc_slice_desc desc; + struct llcc_slice_desc *desc; attr1_val = config->cache_mode; attr1_val |= config->probe_target_ways << ATTR1_PROBE_TARGET_WAYS_SHIFT; @@ -4891,8 +4887,11 @@ static int _qcom_llcc_cfg_program(const struct llcc_slice_config *config, } if (config->activate_on_init) { - desc.slice_id = config->slice_id; - ret = llcc_slice_activate(&desc); + desc = llcc_slice_getd(config->usecase_id); + if (IS_ERR(desc)) + return PTR_ERR(desc); + + ret = llcc_slice_activate(desc); } return ret; @@ -5205,18 +5204,18 @@ static int qcom_llcc_probe(struct platform_device *pdev) llcc_cfg = cfg->sct_data; sz = cfg->size; - - for (i = 0; i < sz; i++) - if (llcc_cfg[i].slice_id > drv_data->max_slices) - drv_data->max_slices = llcc_cfg[i].slice_id; - - drv_data->bitmap = devm_bitmap_zalloc(dev, drv_data->max_slices, - GFP_KERNEL); - if (!drv_data->bitmap) { + drv_data->desc = devm_kcalloc(dev, sz, sizeof(struct llcc_slice_desc), GFP_KERNEL); + if (!drv_data->desc) { ret = -ENOMEM; goto err; } + for (i = 0; i < sz; i++) { + drv_data->desc[i].slice_id = llcc_cfg[i].slice_id; + drv_data->desc[i].slice_size = llcc_cfg[i].max_cap; + refcount_set(&drv_data->desc[i].refcount, 0); + } + drv_data->cfg = llcc_cfg; drv_data->cfg_size = sz; drv_data->edac_reg_offset = cfg->edac_reg_offset; diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index 8243ab3a12a8..227125d84318 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -91,10 +91,12 @@ * struct llcc_slice_desc - Cache slice descriptor * @slice_id: llcc slice id * @slice_size: Size allocated for the llcc slice + * @refcount: Atomic counter to track activate/deactivate calls */ struct llcc_slice_desc { u32 slice_id; size_t slice_size; + refcount_t refcount; }; /** @@ -152,11 +154,10 @@ struct llcc_edac_reg_offset { * @edac_reg_offset: Offset of the LLCC EDAC registers * @lock: mutex associated with each slice * @cfg_size: size of the config data table - * @max_slices: max slices as read from device tree * @num_banks: Number of llcc banks - * @bitmap: Bit map to track the active slice ids * @ecc_irq: interrupt for llcc cache error detection and reporting * @ecc_irq_configured: 'True' if firmware has already configured the irq propagation + * @desc: Array pointer of pre-allocated LLCC slice descriptors * @version: Indicates the LLCC version */ struct llcc_drv_data { @@ -167,12 +168,11 @@ struct llcc_drv_data { const struct llcc_edac_reg_offset *edac_reg_offset; struct mutex lock; u32 cfg_size; - u32 max_slices; u32 num_banks; - unsigned long *bitmap; int ecc_irq; bool ecc_irq_configured; u32 version; + struct llcc_slice_desc *desc; }; #if IS_ENABLED(CONFIG_QCOM_LLCC) -- cgit v1.2.3 From e4ee7621d732162ea2ec714ae76dac2f70519417 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 10 Mar 2026 00:03:30 +0100 Subject: soc: qcom: qmi: Enumerate the service IDs of QMI The QMI framework proposes a set of services which are defined by an integer identifier. The different QMI client lookup for the services via this identifier. Moreover, the function qmi_add_lookup() and qmi_add_server() must match the service ID but the code in different places set the same value but with a different macro name. These macros are spreaded across the different subsystems implementing the protocols associated with a service. It would make more sense to define them in the QMI header for the sake of consistency and clarity. This change use an unified naming for the services and enumerate the ones implemented in the Linux kernel. More services can come later and put the service ID in this same header. Signed-off-by: Daniel Lezcano Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20260309230346.3584252-2-daniel.lezcano@oss.qualcomm.com [bjorn: Lower case hex constants] Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/qmi.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/qcom/qmi.h b/include/linux/soc/qcom/qmi.h index 291cdc7ef49c..b9dcb437a0be 100644 --- a/include/linux/soc/qcom/qmi.h +++ b/include/linux/soc/qcom/qmi.h @@ -92,6 +92,18 @@ struct qmi_elem_info { #define QMI_ERR_INCOMPATIBLE_STATE_V01 90 #define QMI_ERR_NOT_SUPPORTED_V01 94 +/* + * Enumerate the IDs of the QMI services + */ +#define QMI_SERVICE_ID_TEST 0x0f /* 15 */ +#define QMI_SERVICE_ID_SSCTL 0x2b /* 43 */ +#define QMI_SERVICE_ID_IPA 0x31 /* 49 */ +#define QMI_SERVICE_ID_SERVREG_LOC 0x40 /* 64 */ +#define QMI_SERVICE_ID_SERVREG_NOTIF 0x42 /* 66 */ +#define QMI_SERVICE_ID_WLFW 0x45 /* 69 */ +#define QMI_SERVICE_ID_SLIMBUS 0x301 /* 769 */ +#define QMI_SERVICE_ID_USB_AUDIO_STREAM 0x41d /* 1053 */ + /** * struct qmi_response_type_v01 - common response header (decoded) * @result: result of the transaction -- cgit v1.2.3 From dea046e7f46f2357124a465e058c92cac3e351c5 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 9 Mar 2026 13:42:41 +0100 Subject: gpio: remove machine hogs With no more users, remove legacy machine hog API from the kernel. Reviewed-by: Linus Walleij Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20260309-gpio-hog-fwnode-v2-5-4e61f3dbf06a@oss.qualcomm.com Signed-off-by: Bartosz Golaszewski --- Documentation/driver-api/gpio/board.rst | 16 -------- drivers/gpio/gpiolib.c | 71 --------------------------------- include/linux/gpio/machine.h | 33 --------------- 3 files changed, 120 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/gpio/board.rst b/Documentation/driver-api/gpio/board.rst index 069b54d8591b..0993cac891fb 100644 --- a/Documentation/driver-api/gpio/board.rst +++ b/Documentation/driver-api/gpio/board.rst @@ -239,22 +239,6 @@ mapping and is thus transparent to GPIO consumers. A set of functions such as gpiod_set_value() is available to work with the new descriptor-oriented interface. -Boards using platform data can also hog GPIO lines by defining GPIO hog tables. - -.. code-block:: c - - struct gpiod_hog gpio_hog_table[] = { - GPIO_HOG("gpio.0", 10, "foo", GPIO_ACTIVE_LOW, GPIOD_OUT_HIGH), - { } - }; - -And the table can be added to the board code as follows:: - - gpiod_add_hogs(gpio_hog_table); - -The line will be hogged as soon as the gpiochip is created or - in case the -chip was created earlier - when the hog table is registered. - Arrays of pins -------------- In addition to requesting pins belonging to a function one by one, a device may diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 4a57d9882600..56fda7891d55 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -103,9 +103,6 @@ static DEFINE_MUTEX(gpio_devices_lock); /* Ensures coherence during read-only accesses to the list of GPIO devices. */ DEFINE_STATIC_SRCU(gpio_devices_srcu); -static DEFINE_MUTEX(gpio_machine_hogs_mutex); -static LIST_HEAD(gpio_machine_hogs); - const char *const gpio_suffixes[] = { "gpios", "gpio", NULL }; static void gpiochip_free_hogs(struct gpio_chip *gc); @@ -930,36 +927,6 @@ err_remove_device: return ret; } -static void gpiochip_machine_hog(struct gpio_chip *gc, struct gpiod_hog *hog) -{ - struct gpio_desc *desc; - int rv; - - desc = gpiochip_get_desc(gc, hog->chip_hwnum); - if (IS_ERR(desc)) { - gpiochip_err(gc, "%s: unable to get GPIO desc: %ld\n", - __func__, PTR_ERR(desc)); - return; - } - - rv = gpiod_hog(desc, hog->line_name, hog->lflags, hog->dflags); - if (rv) - gpiod_err(desc, "%s: unable to hog GPIO line (%s:%u): %d\n", - __func__, gc->label, hog->chip_hwnum, rv); -} - -static void gpiochip_machine_hog_lines(struct gpio_chip *gc) -{ - struct gpiod_hog *hog; - - guard(mutex)(&gpio_machine_hogs_mutex); - - list_for_each_entry(hog, &gpio_machine_hogs, list) { - if (!strcmp(gc->label, hog->chip_label)) - gpiochip_machine_hog(gc, hog); - } -} - int gpiochip_add_hog(struct gpio_chip *gc, struct fwnode_handle *fwnode) { struct fwnode_handle *gc_node = dev_fwnode(&gc->gpiodev->dev); @@ -1047,8 +1014,6 @@ static int gpiochip_hog_lines(struct gpio_chip *gc) return ret; } - gpiochip_machine_hog_lines(gc); - return 0; } @@ -4578,42 +4543,6 @@ void gpiod_remove_lookup_table(struct gpiod_lookup_table *table) } EXPORT_SYMBOL_GPL(gpiod_remove_lookup_table); -/** - * gpiod_add_hogs() - register a set of GPIO hogs from machine code - * @hogs: table of gpio hog entries with a zeroed sentinel at the end - */ -void gpiod_add_hogs(struct gpiod_hog *hogs) -{ - struct gpiod_hog *hog; - - guard(mutex)(&gpio_machine_hogs_mutex); - - for (hog = &hogs[0]; hog->chip_label; hog++) { - list_add_tail(&hog->list, &gpio_machine_hogs); - - /* - * The chip may have been registered earlier, so check if it - * exists and, if so, try to hog the line now. - */ - struct gpio_device *gdev __free(gpio_device_put) = - gpio_device_find_by_label(hog->chip_label); - if (gdev) - gpiochip_machine_hog(gpio_device_get_chip(gdev), hog); - } -} -EXPORT_SYMBOL_GPL(gpiod_add_hogs); - -void gpiod_remove_hogs(struct gpiod_hog *hogs) -{ - struct gpiod_hog *hog; - - guard(mutex)(&gpio_machine_hogs_mutex); - - for (hog = &hogs[0]; hog->chip_label; hog++) - list_del(&hog->list); -} -EXPORT_SYMBOL_GPL(gpiod_remove_hogs); - static bool gpiod_match_lookup_table(struct device *dev, const struct gpiod_lookup_table *table) { diff --git a/include/linux/gpio/machine.h b/include/linux/gpio/machine.h index 44e5f162973e..5eb88f5d0630 100644 --- a/include/linux/gpio/machine.h +++ b/include/linux/gpio/machine.h @@ -46,23 +46,6 @@ struct gpiod_lookup_table { struct gpiod_lookup table[]; }; -/** - * struct gpiod_hog - GPIO line hog table - * @chip_label: name of the chip the GPIO belongs to - * @chip_hwnum: hardware number (i.e. relative to the chip) of the GPIO - * @line_name: consumer name for the hogged line - * @lflags: bitmask of gpio_lookup_flags GPIO_* values - * @dflags: GPIO flags used to specify the direction and value - */ -struct gpiod_hog { - struct list_head list; - const char *chip_label; - u16 chip_hwnum; - const char *line_name; - unsigned long lflags; - int dflags; -}; - /* * Helper for lookup tables with just one single lookup for a device. */ @@ -95,24 +78,10 @@ static struct gpiod_lookup_table _name = { \ .flags = _flags, \ } -/* - * Simple definition of a single GPIO hog in an array. - */ -#define GPIO_HOG(_chip_label, _chip_hwnum, _line_name, _lflags, _dflags) \ -(struct gpiod_hog) { \ - .chip_label = _chip_label, \ - .chip_hwnum = _chip_hwnum, \ - .line_name = _line_name, \ - .lflags = _lflags, \ - .dflags = _dflags, \ -} - #ifdef CONFIG_GPIOLIB void gpiod_add_lookup_table(struct gpiod_lookup_table *table); void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n); void gpiod_remove_lookup_table(struct gpiod_lookup_table *table); -void gpiod_add_hogs(struct gpiod_hog *hogs); -void gpiod_remove_hogs(struct gpiod_hog *hogs); #else /* ! CONFIG_GPIOLIB */ static inline void gpiod_add_lookup_table(struct gpiod_lookup_table *table) {} @@ -120,8 +89,6 @@ static inline void gpiod_add_lookup_tables(struct gpiod_lookup_table **tables, size_t n) {} static inline void gpiod_remove_lookup_table(struct gpiod_lookup_table *table) {} -static inline void gpiod_add_hogs(struct gpiod_hog *hogs) {} -static inline void gpiod_remove_hogs(struct gpiod_hog *hogs) {} #endif /* CONFIG_GPIOLIB */ #endif /* __LINUX_GPIO_MACHINE_H */ -- cgit v1.2.3 From a25f48fd920b557e6ad02f692f690520c82f5914 Mon Sep 17 00:00:00 2001 From: Alban Bedel Date: Wed, 11 Mar 2026 15:31:20 +0100 Subject: gpio: kempld: Implement the interrupt controller Add a GPIO IRQ chip implementation for the kempld GPIO controller. Of note is only how the parent IRQ is obtained. The IRQ for the GPIO controller can be configured in the BIOS, along with the IRQ for the I2C controller. These IRQ are returned by ACPI but this information is only usable if both IRQ are configured. When only one is configured, only one is returned making it impossible to know which one it is. Luckily the BIOS will set the configured IRQ in the PLD registers, so it can be read from there instead, and that also work on platforms without ACPI. The vendor driver allowed to override the IRQ using a module parameters, so there are boards in field which used this parameter instead of properly configuring the BIOS. This implementation provides this as well for compatibility. Signed-off-by: Alban Bedel Link: https://patch.msgid.link/20260311143120.2179347-5-alban.bedel@lht.dlh.de Signed-off-by: Bartosz Golaszewski --- drivers/gpio/Kconfig | 1 + drivers/gpio/gpio-kempld.c | 192 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/mfd/kempld.h | 1 + 3 files changed, 194 insertions(+) (limited to 'include/linux') diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index b45fb799e36c..d665afe19709 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -1440,6 +1440,7 @@ config GPIO_JANZ_TTL config GPIO_KEMPLD tristate "Kontron ETX / COMexpress GPIO" depends on MFD_KEMPLD + select GPIOLIB_IRQCHIP help This enables support for the PLD GPIO interface on some Kontron ETX and COMexpress (ETXexpress) modules. diff --git a/drivers/gpio/gpio-kempld.c b/drivers/gpio/gpio-kempld.c index 7dd94ff6f2df..5a63df3ea5fa 100644 --- a/drivers/gpio/gpio-kempld.c +++ b/drivers/gpio/gpio-kempld.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -19,13 +20,26 @@ #define KEMPLD_GPIO_MASK(x) (BIT((x) % 8)) #define KEMPLD_GPIO_DIR 0x40 #define KEMPLD_GPIO_LVL 0x42 +#define KEMPLD_GPIO_STS 0x44 #define KEMPLD_GPIO_EVT_LVL_EDGE 0x46 +#define KEMPLD_GPIO_EVT_LOW_HIGH 0x48 #define KEMPLD_GPIO_IEN 0x4A +#define KEMPLD_GPIO_OUT_LVL 0x4E + +/* The IRQ to use if none was configured in the BIOS */ +static unsigned int gpio_irq; +module_param_hw(gpio_irq, uint, irq, 0444); +MODULE_PARM_DESC(gpio_irq, "Set legacy GPIO IRQ (1-15)"); struct kempld_gpio_data { struct gpio_chip chip; struct kempld_device_data *pld; u8 out_lvl_reg; + + struct mutex irq_lock; + u16 ien; + u16 evt_low_high; + u16 evt_lvl_edge; }; /* @@ -193,6 +207,180 @@ static int kempld_gpio_pincount(struct kempld_device_data *pld) return evt ? __ffs(evt) : 16; } +static void kempld_irq_mask(struct irq_data *data) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct kempld_gpio_data *gpio = gpiochip_get_data(chip); + + gpio->ien &= ~BIT(irqd_to_hwirq(data)); + gpiochip_disable_irq(chip, irqd_to_hwirq(data)); +} + +static void kempld_irq_unmask(struct irq_data *data) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct kempld_gpio_data *gpio = gpiochip_get_data(chip); + + gpiochip_enable_irq(chip, irqd_to_hwirq(data)); + gpio->ien |= BIT(irqd_to_hwirq(data)); +} + +static int kempld_irq_set_type(struct irq_data *data, unsigned int type) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct kempld_gpio_data *gpio = gpiochip_get_data(chip); + + switch (type) { + case IRQ_TYPE_EDGE_RISING: + gpio->evt_low_high |= BIT(data->hwirq); + gpio->evt_lvl_edge |= BIT(data->hwirq); + break; + case IRQ_TYPE_EDGE_FALLING: + gpio->evt_low_high &= ~BIT(data->hwirq); + gpio->evt_lvl_edge |= BIT(data->hwirq); + break; + case IRQ_TYPE_LEVEL_HIGH: + gpio->evt_low_high |= BIT(data->hwirq); + gpio->evt_lvl_edge &= ~BIT(data->hwirq); + break; + case IRQ_TYPE_LEVEL_LOW: + gpio->evt_low_high &= ~BIT(data->hwirq); + gpio->evt_lvl_edge &= ~BIT(data->hwirq); + break; + default: + return -EINVAL; + } + + return 0; +} + +static void kempld_irq_bus_lock(struct irq_data *data) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct kempld_gpio_data *gpio = gpiochip_get_data(chip); + + mutex_lock(&gpio->irq_lock); +} + +static void kempld_irq_bus_sync_unlock(struct irq_data *data) +{ + struct gpio_chip *chip = irq_data_get_irq_chip_data(data); + struct kempld_gpio_data *gpio = gpiochip_get_data(chip); + struct kempld_device_data *pld = gpio->pld; + + kempld_get_mutex(pld); + kempld_write16(pld, KEMPLD_GPIO_EVT_LVL_EDGE, gpio->evt_lvl_edge); + kempld_write16(pld, KEMPLD_GPIO_EVT_LOW_HIGH, gpio->evt_low_high); + kempld_write16(pld, KEMPLD_GPIO_IEN, gpio->ien); + kempld_release_mutex(pld); + + mutex_unlock(&gpio->irq_lock); +} + +static const struct irq_chip kempld_irqchip = { + .name = "kempld-gpio", + .irq_mask = kempld_irq_mask, + .irq_unmask = kempld_irq_unmask, + .irq_set_type = kempld_irq_set_type, + .irq_bus_lock = kempld_irq_bus_lock, + .irq_bus_sync_unlock = kempld_irq_bus_sync_unlock, + .flags = IRQCHIP_IMMUTABLE, + GPIOCHIP_IRQ_RESOURCE_HELPERS, +}; + +static irqreturn_t kempld_gpio_irq_handler(int irq, void *data) +{ + struct kempld_gpio_data *gpio = data; + struct gpio_chip *chip = &gpio->chip; + unsigned int pin, child_irq; + unsigned long status; + + kempld_get_mutex(gpio->pld); + + status = kempld_read16(gpio->pld, KEMPLD_GPIO_STS); + if (status) + kempld_write16(gpio->pld, KEMPLD_GPIO_STS, status); + + kempld_release_mutex(gpio->pld); + + status &= gpio->ien; + if (!status) + return IRQ_NONE; + + for_each_set_bit(pin, &status, chip->ngpio) { + child_irq = irq_find_mapping(chip->irq.domain, pin); + handle_nested_irq(child_irq); + } + + return IRQ_HANDLED; +} + +static int kempld_gpio_irq_init(struct device *dev, + struct kempld_gpio_data *gpio) +{ + struct kempld_device_data *pld = gpio->pld; + struct gpio_chip *chip = &gpio->chip; + struct gpio_irq_chip *girq; + unsigned int irq; + int ret; + + /* Get the IRQ configured by the BIOS in the PLD */ + kempld_get_mutex(pld); + irq = kempld_read8(pld, KEMPLD_IRQ_GPIO); + kempld_release_mutex(pld); + + if (irq == 0xff) { + dev_info(dev, "GPIO controller has no IRQ support\n"); + return 0; + } + + /* Allow overriding the IRQ with the module parameter */ + if (gpio_irq > 0) { + dev_warn(dev, "Forcing IRQ to %d\n", gpio_irq); + irq &= ~KEMPLD_IRQ_GPIO_MASK; + irq |= gpio_irq & KEMPLD_IRQ_GPIO_MASK; + } + + if (!(irq & KEMPLD_IRQ_GPIO_MASK)) { + dev_warn(dev, "No IRQ configured\n"); + return 0; + } + + /* Get the current config, disable all child interrupts, clear them + * and set the parent IRQ + */ + kempld_get_mutex(pld); + gpio->evt_low_high = kempld_read16(pld, KEMPLD_GPIO_EVT_LOW_HIGH); + gpio->evt_lvl_edge = kempld_read16(pld, KEMPLD_GPIO_EVT_LVL_EDGE); + kempld_write16(pld, KEMPLD_GPIO_IEN, 0); + kempld_write16(pld, KEMPLD_GPIO_STS, 0xFFFF); + kempld_write16(pld, KEMPLD_IRQ_GPIO, irq); + kempld_release_mutex(pld); + + girq = &chip->irq; + gpio_irq_chip_set_chip(girq, &kempld_irqchip); + + girq->parent_handler = NULL; + girq->num_parents = 0; + girq->parents = NULL; + girq->default_type = IRQ_TYPE_NONE; + girq->handler = handle_simple_irq; + girq->threaded = true; + + mutex_init(&gpio->irq_lock); + + ret = devm_request_threaded_irq(dev, irq & KEMPLD_IRQ_GPIO_MASK, + NULL, kempld_gpio_irq_handler, + IRQF_ONESHOT, chip->label, + gpio); + if (ret) { + dev_err(dev, "failed to request irq %d\n", irq); + return ret; + } + + return 0; +} + static int kempld_gpio_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -247,6 +435,10 @@ static int kempld_gpio_probe(struct platform_device *pdev) return -ENODEV; } + ret = kempld_gpio_irq_init(dev, gpio); + if (ret) + return ret; + ret = devm_gpiochip_add_data(dev, chip, gpio); if (ret) { dev_err(dev, "Could not register GPIO chip\n"); diff --git a/include/linux/mfd/kempld.h b/include/linux/mfd/kempld.h index 643c096b93ac..2dbd80abfd1d 100644 --- a/include/linux/mfd/kempld.h +++ b/include/linux/mfd/kempld.h @@ -37,6 +37,7 @@ #define KEMPLD_SPEC_GET_MINOR(x) (x & 0x0f) #define KEMPLD_SPEC_GET_MAJOR(x) ((x >> 4) & 0x0f) #define KEMPLD_IRQ_GPIO 0x35 +#define KEMPLD_IRQ_GPIO_MASK 0x0f #define KEMPLD_IRQ_I2C 0x36 #define KEMPLD_CFG 0x37 #define KEMPLD_CFG_GPIO_I2C_MUX (1 << 0) -- cgit v1.2.3 From 428c56525bf5dbc3bd5e30014df1f5213f8bd7c8 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 13 Mar 2026 09:22:18 +0100 Subject: jump_label: use ATOMIC_INIT() for initialization of .enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently ATOMIC_INIT() is not used because in the past that macro was provided by linux/atomic.h which is not usable from linux/jump_label.h. However since commit 7ca8cf5347f7 ("locking/atomic: Move ATOMIC_INIT into linux/types.h") the macro only requires linux/types.h. Remove the now unnecessary workaround and the associated assertions. Signed-off-by: Thomas Weißschuh Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260313-jump_label-cleanup-v2-1-35d3c0bde549@linutronix.de --- include/linux/jump_label.h | 11 ++--------- kernel/jump_label.c | 9 --------- 2 files changed, 2 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index fdb79dd1ebd8..e494b360d36d 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -238,18 +238,11 @@ extern void static_key_enable_cpuslocked(struct static_key *key); extern void static_key_disable_cpuslocked(struct static_key *key); extern enum jump_label_type jump_label_init_type(struct jump_entry *entry); -/* - * We should be using ATOMIC_INIT() for initializing .enabled, but - * the inclusion of atomic.h is problematic for inclusion of jump_label.h - * in 'low-level' headers. Thus, we are initializing .enabled with a - * raw value, but have added a BUILD_BUG_ON() to catch any issues in - * jump_label_init() see: kernel/jump_label.c. - */ #define STATIC_KEY_INIT_TRUE \ - { .enabled = { 1 }, \ + { .enabled = ATOMIC_INIT(1), \ { .type = JUMP_TYPE_TRUE } } #define STATIC_KEY_INIT_FALSE \ - { .enabled = { 0 }, \ + { .enabled = ATOMIC_INIT(0), \ { .type = JUMP_TYPE_FALSE } } #else /* !CONFIG_JUMP_LABEL */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 7cb19e601426..e851e4b37d0e 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -529,15 +529,6 @@ void __init jump_label_init(void) struct static_key *key = NULL; struct jump_entry *iter; - /* - * Since we are initializing the static_key.enabled field with - * with the 'raw' int values (to avoid pulling in atomic.h) in - * jump_label.h, let's make sure that is safe. There are only two - * cases to check since we initialize to 0 or 1. - */ - BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0); - BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1); - if (static_key_initialized) return; -- cgit v1.2.3 From acb38872d4cbec5b6825345d9d757e21d2d9d953 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 13 Mar 2026 09:22:19 +0100 Subject: jump_label: remove workaround for old compilers in initializations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The extra braces for the initialization of the anonymous union members were added in commit cd8d860dcce9 ("jump_label: Fix anonymous union initialization") to compensate for limitations in gcc < 4.6. Versions of gcc this old are not supported anymore, so drop the workaround. Signed-off-by: Thomas Weißschuh Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260313-jump_label-cleanup-v2-2-35d3c0bde549@linutronix.de --- include/linux/jump_label.h | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index e494b360d36d..b9c7b0ebf7b9 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -87,13 +87,6 @@ struct static_key { atomic_t enabled; #ifdef CONFIG_JUMP_LABEL /* - * Note: - * To make anonymous unions work with old compilers, the static - * initialization of them requires brackets. This creates a dependency - * on the order of the struct with the initializers. If any fields - * are added, STATIC_KEY_INIT_TRUE and STATIC_KEY_INIT_FALSE may need - * to be modified. - * * bit 0 => 1 if key is initially true * 0 if initially false * bit 1 => 1 if points to struct static_key_mod @@ -240,10 +233,10 @@ extern enum jump_label_type jump_label_init_type(struct jump_entry *entry); #define STATIC_KEY_INIT_TRUE \ { .enabled = ATOMIC_INIT(1), \ - { .type = JUMP_TYPE_TRUE } } + .type = JUMP_TYPE_TRUE } #define STATIC_KEY_INIT_FALSE \ { .enabled = ATOMIC_INIT(0), \ - { .type = JUMP_TYPE_FALSE } } + .type = JUMP_TYPE_FALSE } #else /* !CONFIG_JUMP_LABEL */ -- cgit v1.2.3 From 2deccd5c862a0337a691bcfaa87919b4216e6103 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 Mar 2026 17:40:42 +0100 Subject: cleanup: Optimize guards Andrew reported that a guard() conversion of zone_lock increased the code size unnecessarily. It turns out the unconditional __GUARD_IS_ERR() is to blame. As explored earlier [1], __GUARD_IS_ERR(), similar to IS_ERR_OR_NULL(), generates somewhat sub-optimal code. However, looking at things again, it is possible to avoid doing the __GUARD_IS_ERR() unconditionally. Revert the normal destructors to a simple NULL test and only add the IS_ERR bit to COND guards. This cures the reported overhead; as compiled by GCC-16: page_alloc.o: pre: Total: Before=45299, After=45371, chg +0.16% post: Total: Before=45299, After=45026, chg -0.60% [1] https://lkml.kernel.org/r/20250513085001.GC25891@noisy.programming.kicks-ass.net Reported-by: Andrew Morton Signed-off-by: Peter Zijlstra (Intel) Tested-by: Dan Williams Link: https://patch.msgid.link/20260309164516.GE606826@noisy.programming.kicks-ass.net --- include/linux/cleanup.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index dbc4162921e9..ea95ca4bc11c 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -286,15 +286,18 @@ static __always_inline _type class_##_name##_constructor(_init_args) \ __no_context_analysis \ { _type t = _init; return t; } -#define EXTEND_CLASS(_name, ext, _init, _init_args...) \ -typedef lock_##_name##_t lock_##_name##ext##_t; \ +#define EXTEND_CLASS_COND(_name, ext, _cond, _init, _init_args...) \ +typedef lock_##_name##_t lock_##_name##ext##_t; \ typedef class_##_name##_t class_##_name##ext##_t; \ -static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *p) \ -{ class_##_name##_destructor(p); } \ +static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *_T) \ +{ if (_cond) return; class_##_name##_destructor(_T); } \ static __always_inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ __no_context_analysis \ { class_##_name##_t t = _init; return t; } +#define EXTEND_CLASS(_name, ext, _init, _init_args...) \ + EXTEND_CLASS_COND(_name, ext, 0, _init, _init_args) + #define CLASS(_name, var) \ class_##_name##_t var __cleanup(class_##_name##_destructor) = \ class_##_name##_constructor @@ -394,12 +397,12 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond __DEFINE_GUARD_LOCK_PTR(_name, _T) #define DEFINE_GUARD(_name, _type, _lock, _unlock) \ - DEFINE_CLASS(_name, _type, if (!__GUARD_IS_ERR(_T)) { _unlock; }, ({ _lock; _T; }), _type _T); \ + DEFINE_CLASS(_name, _type, if (_T) { _unlock; }, ({ _lock; _T; }), _type _T); \ DEFINE_CLASS_IS_GUARD(_name) #define DEFINE_GUARD_COND_4(_name, _ext, _lock, _cond) \ __DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true); \ - EXTEND_CLASS(_name, _ext, \ + EXTEND_CLASS_COND(_name, _ext, __GUARD_IS_ERR(*_T), \ ({ void *_t = _T; int _RET = (_lock); if (_T && !(_cond)) _t = ERR_PTR(_RET); _t; }), \ class_##_name##_t _T) \ static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ @@ -488,7 +491,7 @@ typedef struct { \ static __always_inline void class_##_name##_destructor(class_##_name##_t *_T) \ __no_context_analysis \ { \ - if (!__GUARD_IS_ERR(_T->lock)) { _unlock; } \ + if (_T->lock) { _unlock; } \ } \ \ __DEFINE_GUARD_LOCK_PTR(_name, &_T->lock) @@ -565,7 +568,7 @@ __DEFINE_LOCK_GUARD_0(_name, _lock) #define DEFINE_LOCK_GUARD_1_COND_4(_name, _ext, _lock, _cond) \ __DEFINE_CLASS_IS_CONDITIONAL(_name##_ext, true); \ - EXTEND_CLASS(_name, _ext, \ + EXTEND_CLASS_COND(_name, _ext, __GUARD_IS_ERR(_T->lock), \ ({ class_##_name##_t _t = { .lock = l }, *_T = &_t;\ int _RET = (_lock); \ if (_T->lock && !(_cond)) _T->lock = ERR_PTR(_RET);\ -- cgit v1.2.3 From 756a0e011cfca0b45a48464aa25b05d9a9c2fb0b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 13 Mar 2026 10:15:07 -0700 Subject: locking: Fix rwlock support in Architecture support for rwlocks must be available whether or not CONFIG_DEBUG_SPINLOCK has been defined. Move the definitions of the arch_{read,write}_{lock,trylock,unlock}() macros such that these become visbile if CONFIG_DEBUG_SPINLOCK=n. This patch prepares for converting do_raw_{read,write}_trylock() into inline functions. Without this patch that conversion triggers a build failure for UP architectures, e.g. arm-ep93xx. I used the following kernel configuration to build the kernel for that architecture: CONFIG_ARCH_MULTIPLATFORM=y CONFIG_ARCH_MULTI_V7=n CONFIG_ATAGS=y CONFIG_MMU=y CONFIG_ARCH_MULTI_V4T=y CONFIG_CPU_LITTLE_ENDIAN=y CONFIG_ARCH_EP93XX=y Fixes: fb1c8f93d869 ("[PATCH] spinlock consolidation") Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260313171510.230998-2-bvanassche@acm.org --- include/linux/spinlock_up.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 1e84e71ca495..3a50976471d7 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -48,16 +48,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) lock->slock = 1; } -/* - * Read-write spinlocks. No debug version. - */ -#define arch_read_lock(lock) do { barrier(); (void)(lock); } while (0) -#define arch_write_lock(lock) do { barrier(); (void)(lock); } while (0) -#define arch_read_trylock(lock) ({ barrier(); (void)(lock); 1; }) -#define arch_write_trylock(lock) ({ barrier(); (void)(lock); 1; }) -#define arch_read_unlock(lock) do { barrier(); (void)(lock); } while (0) -#define arch_write_unlock(lock) do { barrier(); (void)(lock); } while (0) - #else /* DEBUG_SPINLOCK */ #define arch_spin_is_locked(lock) ((void)(lock), 0) /* for sched/core.c and kernel_lock.c: */ @@ -68,4 +58,14 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) #define arch_spin_is_contended(lock) (((void)(lock), 0)) +/* + * Read-write spinlocks. No debug version. + */ +#define arch_read_lock(lock) do { barrier(); (void)(lock); } while (0) +#define arch_write_lock(lock) do { barrier(); (void)(lock); } while (0) +#define arch_read_trylock(lock) ({ barrier(); (void)(lock); 1; }) +#define arch_write_trylock(lock) ({ barrier(); (void)(lock); 1; }) +#define arch_read_unlock(lock) do { barrier(); (void)(lock); } while (0) +#define arch_write_unlock(lock) do { barrier(); (void)(lock); } while (0) + #endif /* __LINUX_SPINLOCK_UP_H */ -- cgit v1.2.3 From c4d3b8c77d85082d32250c505beb1d9e46ee47ee Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 13 Mar 2026 10:15:08 -0700 Subject: locking: Add lock context support in do_raw_{read,write}_trylock() Convert do_raw_{read,write}_trylock() from macros into inline functions and annotate these inline functions with __cond_acquires_shared() or __cond_acquires() as appropriate. This change is necessary to build kernel drivers or subsystems that use rwlock synchronization objects with lock context analysis enabled. The return type 'int' matches the return type for CONFIG_DEBUG_SPINLOCK=y. Signed-off-by: Bart Van Assche Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260313171510.230998-3-bvanassche@acm.org --- include/linux/rwlock.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 21ceefc4a49f..4e67cd934d8f 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -37,10 +37,20 @@ extern int do_raw_write_trylock(rwlock_t *lock) __cond_acquires(true, lock); extern void do_raw_write_unlock(rwlock_t *lock) __releases(lock); #else # define do_raw_read_lock(rwlock) do {__acquire_shared(lock); arch_read_lock(&(rwlock)->raw_lock); } while (0) -# define do_raw_read_trylock(rwlock) arch_read_trylock(&(rwlock)->raw_lock) +static inline int do_raw_read_trylock(rwlock_t *rwlock) + __cond_acquires_shared(true, rwlock) + __no_context_analysis +{ + return arch_read_trylock(&(rwlock)->raw_lock); +} # define do_raw_read_unlock(rwlock) do {arch_read_unlock(&(rwlock)->raw_lock); __release_shared(lock); } while (0) # define do_raw_write_lock(rwlock) do {__acquire(lock); arch_write_lock(&(rwlock)->raw_lock); } while (0) -# define do_raw_write_trylock(rwlock) arch_write_trylock(&(rwlock)->raw_lock) +static inline int do_raw_write_trylock(rwlock_t *rwlock) + __cond_acquires(true, rwlock) + __no_context_analysis +{ + return arch_write_trylock(&(rwlock)->raw_lock); +} # define do_raw_write_unlock(rwlock) do {arch_write_unlock(&(rwlock)->raw_lock); __release(lock); } while (0) #endif -- cgit v1.2.3 From cb15d8e6cbe8d085ac585016deb2e1e0107b99e5 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 14 Mar 2026 23:56:49 +0100 Subject: ASoC: codec: arizona: Convert to use GPIO descriptors This converts the Arizona driver to use GPIO descriptors exclusively, deletes the legacy code path an updates the in-tree user of legacy GPIO. The GPIO lines for mic detect polarity and headphone ID detection are made exclusively descriptor-oriented. The headphone ID detection could actually only be used by the legacy GPIO code, but I converted it to use a descriptor if someone would actually need it so we don't just drop useful code. The compatible "wlf,hpdet-id-gpio" is not in the device tree bindings and only intended to be used by software nodes if any. If someone insists I can try to add a binding for it, but I doubt there is any real user so it seems pointless. Signed-off-by: Linus Walleij Reviewed-by: Charles Keepax Reviewed-by: Bartosz Golaszewski Acked-by: Krzysztof Kozlowski Link: https://patch.msgid.link/20260314-asoc-arizona-v1-1-ecc9a165307c@kernel.org Signed-off-by: Mark Brown --- arch/arm/mach-s3c/mach-crag6410-module.c | 6 +- include/linux/mfd/arizona/pdata.h | 10 ---- sound/soc/codecs/arizona-jack.c | 95 ++++++++++---------------------- sound/soc/codecs/arizona.h | 1 + 4 files changed, 34 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-s3c/mach-crag6410-module.c b/arch/arm/mach-s3c/mach-crag6410-module.c index 4ffcf024b09d..14b0f9cc103e 100644 --- a/arch/arm/mach-s3c/mach-crag6410-module.c +++ b/arch/arm/mach-s3c/mach-crag6410-module.c @@ -239,7 +239,6 @@ static struct gpiod_lookup_table wm8994_gpiod_table = { static struct arizona_pdata wm5102_reva_pdata = { .gpio_base = CODEC_GPIO_BASE, .irq_flags = IRQF_TRIGGER_HIGH, - .micd_pol_gpio = CODEC_GPIO_BASE + 4, .micd_rate = 6, .gpio_defaults = { [2] = 0x10000, /* AIF3TXLRCLK */ @@ -265,6 +264,8 @@ static struct gpiod_lookup_table wm5102_reva_gpiod_table = { .table = { GPIO_LOOKUP("GPION", 7, "wlf,ldoena", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("arizona", 4, + "wlf,micd-pol", GPIO_ACTIVE_HIGH), { }, }, }; @@ -272,7 +273,6 @@ static struct gpiod_lookup_table wm5102_reva_gpiod_table = { static struct arizona_pdata wm5102_pdata = { .gpio_base = CODEC_GPIO_BASE, .irq_flags = IRQF_TRIGGER_HIGH, - .micd_pol_gpio = CODEC_GPIO_BASE + 2, .gpio_defaults = { [2] = 0x10000, /* AIF3TXLRCLK */ [3] = 0x4, /* OPCLK */ @@ -297,6 +297,8 @@ static struct gpiod_lookup_table wm5102_gpiod_table = { .table = { GPIO_LOOKUP("GPION", 7, "wlf,ldo1ena", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("arizona", 2, + "wlf,micd-pol", GPIO_ACTIVE_HIGH), { }, }, }; diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index f72e6d4b14a7..d465dcd8c90a 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -117,11 +117,6 @@ struct arizona_pdata { /** Check for line output with HPDET method */ bool hpdet_acc_id_line; -#ifdef CONFIG_GPIOLIB_LEGACY - /** GPIO used for mic isolation with HPDET */ - int hpdet_id_gpio; -#endif - /** Channel to use for headphone detection */ unsigned int hpdet_channel; @@ -131,11 +126,6 @@ struct arizona_pdata { /** Extra debounce timeout used during initial mic detection (ms) */ unsigned int micd_detect_debounce; -#ifdef CONFIG_GPIOLIB_LEGACY - /** GPIO for mic detection polarity */ - int micd_pol_gpio; -#endif - /** Mic detect ramp rate */ unsigned int micd_bias_start_time; diff --git a/sound/soc/codecs/arizona-jack.c b/sound/soc/codecs/arizona-jack.c index 303c1d44ebd8..a9063bac2752 100644 --- a/sound/soc/codecs/arizona-jack.c +++ b/sound/soc/codecs/arizona-jack.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -459,11 +458,6 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, bool *mic) { struct arizona *arizona = info->arizona; -#ifdef CONFIG_GPIOLIB_LEGACY - int id_gpio = arizona->pdata.hpdet_id_gpio; -#else - int id_gpio = 0; -#endif if (!arizona->pdata.hpdet_acc_id) return 0; @@ -474,9 +468,8 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, */ info->hpdet_res[info->num_hpdet_res++] = *reading; -#ifdef CONFIG_GPIOLIB_LEGACY /* Only check the mic directly if we didn't already ID it */ - if (id_gpio && info->num_hpdet_res == 1) { + if (info->hpdet_id_gpio && info->num_hpdet_res == 1) { dev_dbg(arizona->dev, "Measuring mic\n"); regmap_update_bits(arizona->regmap, @@ -486,13 +479,12 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, ARIZONA_ACCDET_MODE_HPR | info->micd_modes[0].src); - gpio_set_value_cansleep(id_gpio, 1); + gpiod_set_value_cansleep(info->hpdet_id_gpio, 1); regmap_update_bits(arizona->regmap, ARIZONA_HEADPHONE_DETECT_1, ARIZONA_HP_POLL, ARIZONA_HP_POLL); return -EAGAIN; } -#endif /* OK, got both. Now, compare... */ dev_dbg(arizona->dev, "HPDET measured %d %d\n", @@ -514,7 +506,7 @@ static int arizona_hpdet_do_id(struct arizona_priv *info, int *reading, /* * If we measure the mic as high impedance */ - if (!id_gpio || info->hpdet_res[1] > 50) { + if (!info->hpdet_id_gpio || info->hpdet_res[1] > 50) { dev_dbg(arizona->dev, "Detected mic\n"); *mic = true; info->detecting = true; @@ -533,9 +525,6 @@ static irqreturn_t arizona_hpdet_irq(int irq, void *data) { struct arizona_priv *info = data; struct arizona *arizona = info->arizona; -#ifdef CONFIG_GPIOLIB_LEGACY - int id_gpio = arizona->pdata.hpdet_id_gpio; -#endif int ret, reading, state, report; bool mic = false; @@ -591,10 +580,8 @@ done: arizona_extcon_hp_clamp(info, false); -#ifdef CONFIG_GPIOLIB_LEGACY - if (id_gpio) - gpio_set_value_cansleep(id_gpio, 0); -#endif + if (info->hpdet_id_gpio) + gpiod_set_value_cansleep(info->hpdet_id_gpio, 0); /* If we have a mic then reenable MICDET */ if (state && (mic || info->mic)) @@ -1325,58 +1312,33 @@ int arizona_jack_codec_dev_probe(struct arizona_priv *info, struct device *dev) regmap_update_bits(arizona->regmap, ARIZONA_GP_SWITCH_1, ARIZONA_SW1_MODE_MASK, arizona->pdata.gpsw); -#ifdef CONFIG_GPIOLIB_LEGACY - if (pdata->micd_pol_gpio > 0) { - if (info->micd_modes[0].gpio) - mode = GPIOF_OUT_INIT_HIGH; - else - mode = GPIOF_OUT_INIT_LOW; - - ret = devm_gpio_request_one(dev, pdata->micd_pol_gpio, - mode, "MICD polarity"); - if (ret != 0) { - dev_err(arizona->dev, "Failed to request GPIO%d: %d\n", - pdata->micd_pol_gpio, ret); - return ret; - } - - info->micd_pol_gpio = gpio_to_desc(pdata->micd_pol_gpio); - } else -#endif - { - if (info->micd_modes[0].gpio) - mode = GPIOD_OUT_HIGH; - else - mode = GPIOD_OUT_LOW; + if (info->micd_modes[0].gpio) + mode = GPIOD_OUT_HIGH; + else + mode = GPIOD_OUT_LOW; - /* We can't use devm here because we need to do the get - * against the MFD device, as that is where the of_node - * will reside, but if we devm against that the GPIO - * will not be freed if the extcon driver is unloaded. - */ - info->micd_pol_gpio = gpiod_get_optional(arizona->dev, - "wlf,micd-pol", - mode); - if (IS_ERR(info->micd_pol_gpio)) { - ret = PTR_ERR(info->micd_pol_gpio); - dev_err_probe(arizona->dev, ret, "getting microphone polarity GPIO\n"); - return ret; - } + /* We can't use devm here because we need to do the get + * against the MFD device, as that is where the of_node + * will reside, but if we devm against that the GPIO + * will not be freed if the extcon driver is unloaded. + */ + info->micd_pol_gpio = gpiod_get_optional(arizona->dev, + "wlf,micd-pol", + mode); + if (IS_ERR(info->micd_pol_gpio)) { + ret = PTR_ERR(info->micd_pol_gpio); + dev_err_probe(arizona->dev, ret, "getting microphone polarity GPIO\n"); + return ret; } -#ifdef CONFIG_GPIOLIB_LEGACY - if (arizona->pdata.hpdet_id_gpio > 0) { - ret = devm_gpio_request_one(dev, arizona->pdata.hpdet_id_gpio, - GPIOF_OUT_INIT_LOW, - "HPDET"); - if (ret != 0) { - dev_err(arizona->dev, "Failed to request GPIO%d: %d\n", - arizona->pdata.hpdet_id_gpio, ret); - gpiod_put(info->micd_pol_gpio); - return ret; - } + info->hpdet_id_gpio = gpiod_get_optional(arizona->dev, + "wlf,hpdet-id-gpio", + mode); + if (IS_ERR(info->hpdet_id_gpio)) { + ret = PTR_ERR(info->hpdet_id_gpio); + dev_err_probe(arizona->dev, ret, "getting headphone detect ID GPIO\n"); + return ret; } -#endif return 0; } @@ -1385,6 +1347,7 @@ EXPORT_SYMBOL_GPL(arizona_jack_codec_dev_probe); int arizona_jack_codec_dev_remove(struct arizona_priv *info) { gpiod_put(info->micd_pol_gpio); + gpiod_put(info->hpdet_id_gpio); return 0; } EXPORT_SYMBOL_GPL(arizona_jack_codec_dev_remove); diff --git a/sound/soc/codecs/arizona.h b/sound/soc/codecs/arizona.h index ecd8890eefc1..0703182d87b3 100644 --- a/sound/soc/codecs/arizona.h +++ b/sound/soc/codecs/arizona.h @@ -100,6 +100,7 @@ struct arizona_priv { struct snd_soc_jack *jack; struct regulator *micvdd; struct gpio_desc *micd_pol_gpio; + struct gpio_desc *hpdet_id_gpio; u16 last_jackdet; -- cgit v1.2.3 From f8e761655997cc0ee434fb5f35570d2e93d3a707 Mon Sep 17 00:00:00 2001 From: Alexei Lazar Date: Mon, 9 Mar 2026 11:34:27 +0200 Subject: net/mlx5: Add IFC bits for shared headroom pool PBMC support Add hardware interface definitions for shared headroom pool (SHP) in port buffer management: - shp_pbmc_pbsr_support: capability bit in PCAM enhanced features indicating device support for shared headroom pool in PBMC/PBSR. - shared_headroom_pool: buffer entry in PBMC register (pbmc_reg_bits) for the shared headroom pool configuration, reusing the bufferx layout; reduce trailing reserved region accordingly. Signed-off-by: Alexei Lazar Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260309093435.1850724-2-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index a3948b36820d..a76c54bf1927 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10845,7 +10845,9 @@ struct mlx5_ifc_pcam_enhanced_features_bits { u8 fec_200G_per_lane_in_pplm[0x1]; u8 reserved_at_1e[0x2a]; u8 fec_100G_per_lane_in_pplm[0x1]; - u8 reserved_at_49[0xa]; + u8 reserved_at_49[0x2]; + u8 shp_pbmc_pbsr_support[0x1]; + u8 reserved_at_4c[0x7]; u8 buffer_ownership[0x1]; u8 resereved_at_54[0x14]; u8 fec_50G_per_lane_in_pplm[0x1]; @@ -12090,8 +12092,9 @@ struct mlx5_ifc_pbmc_reg_bits { u8 port_buffer_size[0x10]; struct mlx5_ifc_bufferx_reg_bits buffer[10]; + struct mlx5_ifc_bufferx_reg_bits shared_headroom_pool; - u8 reserved_at_2e0[0x80]; + u8 reserved_at_320[0x40]; }; struct mlx5_ifc_sbpr_reg_bits { -- cgit v1.2.3 From 691dffc7255e740bc3df1c68b50b36786aadeb3a Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 9 Mar 2026 11:34:28 +0200 Subject: net/mlx5: Add silent mode set/query and VHCA RX IFC bits Update the mlx5 IFC headers with newly defined capability and command-layout bits: - Add silent_mode_query and rename silent_mode to silent_mode_set cap fields. - Add forward_vhca_rx and MLX5_IFC_FLOW_DESTINATION_TYPE_VHCA_RX. - Expose silent mode fields in the L2 table query command structures. Update the SD support check to use the new capability name (silent_mode_set) to match the updated IFC definition. Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260309093435.1850724-3-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c | 2 +- include/linux/mlx5/mlx5_ifc.h | 19 ++++++++++++++----- 3 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index c348ee62cd3a..16b28028609d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -1183,7 +1183,7 @@ int mlx5_fs_cmd_set_l2table_entry_silent(struct mlx5_core_dev *dev, u8 silent_mo { u32 in[MLX5_ST_SZ_DW(set_l2_table_entry_in)] = {}; - if (silent_mode && !MLX5_CAP_GEN(dev, silent_mode)) + if (silent_mode && !MLX5_CAP_GEN(dev, silent_mode_set)) return -EOPNOTSUPP; MLX5_SET(set_l2_table_entry_in, in, opcode, MLX5_CMD_OP_SET_L2_TABLE_ENTRY); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c index 954942ad93c5..762c783156b4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c @@ -107,7 +107,7 @@ static bool mlx5_sd_is_supported(struct mlx5_core_dev *dev, u8 host_buses) /* Disconnect secondaries from the network */ if (!MLX5_CAP_GEN(dev, eswitch_manager)) return false; - if (!MLX5_CAP_GEN(dev, silent_mode)) + if (!MLX5_CAP_GEN(dev, silent_mode_set)) return false; /* RX steering from primary to secondaries */ diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index a76c54bf1927..8fa4fb3d36cf 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -469,7 +469,8 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 table_miss_action_domain[0x1]; u8 termination_table[0x1]; u8 reformat_and_fwd_to_table[0x1]; - u8 reserved_at_1a[0x2]; + u8 forward_vhca_rx[0x1]; + u8 reserved_at_1b[0x1]; u8 ipsec_encrypt[0x1]; u8 ipsec_decrypt[0x1]; u8 sw_owner_v2[0x1]; @@ -2012,12 +2013,14 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 disable_local_lb_mc[0x1]; u8 log_min_hairpin_wq_data_sz[0x5]; u8 reserved_at_3e8[0x1]; - u8 silent_mode[0x1]; + u8 silent_mode_set[0x1]; u8 vhca_state[0x1]; u8 log_max_vlan_list[0x5]; u8 reserved_at_3f0[0x3]; u8 log_max_current_mc_list[0x5]; - u8 reserved_at_3f8[0x3]; + u8 reserved_at_3f8[0x1]; + u8 silent_mode_query[0x1]; + u8 reserved_at_3fa[0x1]; u8 log_max_current_uc_list[0x5]; u8 general_obj_types[0x40]; @@ -2279,6 +2282,7 @@ enum mlx5_ifc_flow_destination_type { MLX5_IFC_FLOW_DESTINATION_TYPE_VPORT = 0x0, MLX5_IFC_FLOW_DESTINATION_TYPE_FLOW_TABLE = 0x1, MLX5_IFC_FLOW_DESTINATION_TYPE_TIR = 0x2, + MLX5_IFC_FLOW_DESTINATION_TYPE_VHCA_RX = 0x4, MLX5_IFC_FLOW_DESTINATION_TYPE_FLOW_SAMPLER = 0x6, MLX5_IFC_FLOW_DESTINATION_TYPE_UPLINK = 0x8, MLX5_IFC_FLOW_DESTINATION_TYPE_TABLE_TYPE = 0xA, @@ -6265,7 +6269,9 @@ struct mlx5_ifc_query_l2_table_entry_out_bits { u8 reserved_at_40[0xa0]; - u8 reserved_at_e0[0x13]; + u8 reserved_at_e0[0x11]; + u8 silent_mode[0x1]; + u8 reserved_at_f2[0x1]; u8 vlan_valid[0x1]; u8 vlan[0xc]; @@ -6281,7 +6287,10 @@ struct mlx5_ifc_query_l2_table_entry_in_bits { u8 reserved_at_20[0x10]; u8 op_mod[0x10]; - u8 reserved_at_40[0x60]; + u8 reserved_at_40[0x40]; + + u8 silent_mode_query[0x1]; + u8 reserved_at_81[0x1f]; u8 reserved_at_a0[0x8]; u8 table_index[0x18]; -- cgit v1.2.3 From 971b28accc09436fe6a6d5afd667dcbfb3ed7e03 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 9 Mar 2026 11:34:32 +0200 Subject: net/mlx5: LAG, replace mlx5_get_dev_index with LAG sequence number Introduce mlx5_lag_get_dev_seq() which returns a device's sequence number within the LAG: master is always 0, remaining devices numbered sequentially. This provides a stable index for peer flow tracking and vport ordering without depending on native_port_num. Replace mlx5_get_dev_index() usage in en_tc.c (peer flow array indexing) and ib_rep.c (vport index ordering) with the new API. Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260309093435.1850724-7-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/ib_rep.c | 4 ++- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 9 +++--- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 34 +++++++++++++++++++++++ include/linux/mlx5/lag.h | 11 ++++++++ 4 files changed, 53 insertions(+), 5 deletions(-) create mode 100644 include/linux/mlx5/lag.h (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index 621834d75205..df8f049c5806 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -3,6 +3,7 @@ * Copyright (c) 2018 Mellanox Technologies. All rights reserved. */ +#include #include #include "ib_rep.h" #include "srq.h" @@ -134,7 +135,8 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) /* Only 1 ib port is the representor for all uplinks */ peer_n_ports--; - if (mlx5_get_dev_index(peer_dev) < mlx5_get_dev_index(dev)) + if (mlx5_lag_get_dev_seq(peer_dev) < + mlx5_lag_get_dev_seq(dev)) vport_index += peer_n_ports; } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index 1434b65d4746..397a93584fd6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -2131,7 +2132,7 @@ static void mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow, mutex_unlock(&esw->offloads.peer_mutex); list_for_each_entry_safe(peer_flow, tmp, &flow->peer_flows, peer_flows) { - if (peer_index != mlx5_get_dev_index(peer_flow->priv->mdev)) + if (peer_index != mlx5_lag_get_dev_seq(peer_flow->priv->mdev)) continue; list_del(&peer_flow->peer_flows); @@ -2154,7 +2155,7 @@ static void mlx5e_tc_del_fdb_peers_flow(struct mlx5e_tc_flow *flow) devcom = flow->priv->mdev->priv.eswitch->devcom; mlx5_devcom_for_each_peer_entry(devcom, peer_esw, pos) { - i = mlx5_get_dev_index(peer_esw->dev); + i = mlx5_lag_get_dev_seq(peer_esw->dev); mlx5e_tc_del_fdb_peer_flow(flow, i); } } @@ -4584,7 +4585,7 @@ static int mlx5e_tc_add_fdb_peer_flow(struct flow_cls_offload *f, struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; struct mlx5_esw_flow_attr *attr = flow->attr->esw_attr; struct mlx5e_tc_flow_parse_attr *parse_attr; - int i = mlx5_get_dev_index(peer_esw->dev); + int i = mlx5_lag_get_dev_seq(peer_esw->dev); struct mlx5e_rep_priv *peer_urpriv; struct mlx5e_tc_flow *peer_flow; struct mlx5_core_dev *in_mdev; @@ -5525,7 +5526,7 @@ void mlx5e_tc_clean_fdb_peer_flows(struct mlx5_eswitch *esw) devcom = esw->devcom; mlx5_devcom_for_each_peer_entry(devcom, peer_esw, pos) { - i = mlx5_get_dev_index(peer_esw->dev); + i = mlx5_lag_get_dev_seq(peer_esw->dev); list_for_each_entry_safe(flow, tmp, &esw->offloads.peer_flows[i], peer[i]) mlx5e_tc_del_fdb_peers_flow(flow); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 4beee64c937a..51ec8f61ecbb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "lib/mlx5.h" #include "lib/devcom.h" #include "mlx5_core.h" @@ -369,6 +370,39 @@ int mlx5_lag_get_dev_index_by_seq(struct mlx5_lag *ldev, int seq) return -ENOENT; } +/* Reverse of mlx5_lag_get_dev_index_by_seq: given a device, return its + * sequence number in the LAG. Master is always 0, others numbered + * sequentially starting from 1. + */ +int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev) +{ + struct mlx5_lag *ldev = mlx5_lag_dev(dev); + int master_idx, i, num = 1; + struct lag_func *pf; + + if (!ldev) + return -ENOENT; + + master_idx = mlx5_lag_get_master_idx(ldev); + if (master_idx < 0) + return -ENOENT; + + pf = mlx5_lag_pf(ldev, master_idx); + if (pf && pf->dev == dev) + return 0; + + mlx5_ldev_for_each(i, 0, ldev) { + if (i == master_idx) + continue; + pf = mlx5_lag_pf(ldev, i); + if (pf->dev == dev) + return num; + num++; + } + return -ENOENT; +} +EXPORT_SYMBOL(mlx5_lag_get_dev_seq); + /* Devcom events for LAG master marking */ #define LAG_DEVCOM_PAIR (0) #define LAG_DEVCOM_UNPAIR (1) diff --git a/include/linux/mlx5/lag.h b/include/linux/mlx5/lag.h new file mode 100644 index 000000000000..d370dfd19055 --- /dev/null +++ b/include/linux/mlx5/lag.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#ifndef __MLX5_LAG_API_H__ +#define __MLX5_LAG_API_H__ + +struct mlx5_core_dev; + +int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev); + +#endif /* __MLX5_LAG_API_H__ */ -- cgit v1.2.3 From 0bc9059fab6365feaf95cc9a796a3d381915a70f Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 9 Mar 2026 11:34:33 +0200 Subject: net/mlx5: Add VHCA RX flow destination support for FW steering Introduce MLX5_FLOW_DESTINATION_TYPE_VHCA_RX as a new flow steering destination type. Wire the new destination through flow steering command setup by mapping it to MLX5_IFC_FLOW_DESTINATION_TYPE_VHCA_RX and passing the vhca id, extend forward-destination validation to accept it, and teach the flow steering tracepoint formatter to print rx_vhca_id. Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260309093435.1850724-8-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c | 3 +++ drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 4 ++++ drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 7 +++++-- include/linux/mlx5/fs.h | 4 ++++ 4 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c index 6d73127b7217..2cf1d3825def 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c @@ -282,6 +282,9 @@ const char *parse_fs_dst(struct trace_seq *p, case MLX5_FLOW_DESTINATION_TYPE_NONE: trace_seq_printf(p, "none\n"); break; + case MLX5_FLOW_DESTINATION_TYPE_VHCA_RX: + trace_seq_printf(p, "rx_vhca_id=%u\n", dst->vhca.id); + break; } trace_seq_putc(p, 0); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 16b28028609d..1cd4cd898ec2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -716,6 +716,10 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, id = dst->dest_attr.ft->id; ifc_type = MLX5_IFC_FLOW_DESTINATION_TYPE_TABLE_TYPE; break; + case MLX5_FLOW_DESTINATION_TYPE_VHCA_RX: + id = dst->dest_attr.vhca.id; + ifc_type = MLX5_IFC_FLOW_DESTINATION_TYPE_VHCA_RX; + break; default: id = dst->dest_attr.tir_num; ifc_type = MLX5_IFC_FLOW_DESTINATION_TYPE_TIR; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 2c3544880a30..003d211306a7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -503,7 +503,8 @@ static bool is_fwd_dest_type(enum mlx5_flow_destination_type type) type == MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER || type == MLX5_FLOW_DESTINATION_TYPE_TIR || type == MLX5_FLOW_DESTINATION_TYPE_RANGE || - type == MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE; + type == MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE || + type == MLX5_FLOW_DESTINATION_TYPE_VHCA_RX; } static bool check_valid_spec(const struct mlx5_flow_spec *spec) @@ -1890,7 +1891,9 @@ static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1, d1->range.hit_ft == d2->range.hit_ft && d1->range.miss_ft == d2->range.miss_ft && d1->range.min == d2->range.min && - d1->range.max == d2->range.max)) + d1->range.max == d2->range.max) || + (d1->type == MLX5_FLOW_DESTINATION_TYPE_VHCA_RX && + d1->vhca.id == d2->vhca.id)) return true; } diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 9cadb1d5e6df..02064424e868 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -55,6 +55,7 @@ enum mlx5_flow_destination_type { MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM, MLX5_FLOW_DESTINATION_TYPE_RANGE, MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE, + MLX5_FLOW_DESTINATION_TYPE_VHCA_RX, }; enum { @@ -189,6 +190,9 @@ struct mlx5_flow_destination { u32 ft_num; struct mlx5_flow_table *ft; struct mlx5_fc *counter; + struct { + u16 id; + } vhca; struct { u16 num; u16 vhca_id; -- cgit v1.2.3 From d6c9b4de8109a3b4ca9c6c6b7c5fbc42cfeff9ae Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Mon, 9 Mar 2026 11:34:34 +0200 Subject: {net/RDMA}/mlx5: Add LAG demux table API and vport demux rules Downstream patches will introduce SW-only LAG (e.g. shared_fdb without HW LAG). In this mode the firmware cannot create the LAG demux table, but vport demuxing is still required. Move LAG demux flow-table ownership to the LAG layer and introduce APIs to init/cleanup the demux table and add/delete per-vport rules. Adjust the RDMA driver to use the new APIs. In this mode, the LAG layer will create a flow group that matches vport metadata. Vports that are not native to the LAG master eswitch add the demux rule during IB representor load and remove it on unload. The demux rule forward traffic from said vports to their native eswitch manager via a new dest type - MLX5_FLOW_DESTINATION_TYPE_VHCA_RX. Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260309093435.1850724-9-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/ib_rep.c | 20 ++- drivers/infiniband/hw/mlx5/main.c | 21 +-- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 - drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 12 ++ .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 83 ++++++++++- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 10 +- drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 152 +++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h | 12 ++ include/linux/mlx5/fs.h | 6 +- include/linux/mlx5/lag.h | 10 ++ 10 files changed, 300 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index df8f049c5806..1709b628702e 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -10,11 +10,13 @@ static int mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, + struct mlx5_core_dev *rep_dev, struct mlx5_eswitch_rep *rep, int vport_index) { struct mlx5_ib_dev *ibdev; struct net_device *ndev; + int ret; ibdev = mlx5_eswitch_uplink_get_proto_dev(dev->priv.eswitch, REP_IB); if (!ibdev) @@ -24,7 +26,17 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, rep->rep_data[REP_IB].priv = ibdev; ndev = mlx5_ib_get_rep_netdev(rep->esw, rep->vport); - return ib_device_set_netdev(&ibdev->ib_dev, ndev, vport_index + 1); + ret = ib_device_set_netdev(&ibdev->ib_dev, ndev, vport_index + 1); + if (ret) + return ret; + + /* Only Vports that are not native to the LAG master eswitch need to add + * demux rule. + */ + if (mlx5_eswitch_get_total_vports(dev) > vport_index) + return 0; + + return mlx5_lag_demux_rule_add(rep_dev, rep->vport, vport_index); } static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev); @@ -131,7 +143,7 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) if (mlx5_lag_is_master(peer_dev)) lag_master = peer_dev; - else if (!mlx5_lag_is_mpesw(dev)) + else if (!mlx5_lag_is_mpesw(peer_dev)) /* Only 1 ib port is the representor for all uplinks */ peer_n_ports--; @@ -145,7 +157,7 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) if (rep->vport == MLX5_VPORT_UPLINK && !new_uplink) profile = &raw_eth_profile; else - return mlx5_ib_set_vport_rep(lag_master, rep, vport_index); + return mlx5_ib_set_vport_rep(lag_master, dev, rep, vport_index); if (mlx5_lag_is_shared_fdb(dev)) { ret = mlx5_ib_take_transport(lag_master); @@ -233,6 +245,8 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep) vport_index = i; } + mlx5_lag_demux_rule_del(mdev, vport_index); + port = &dev->port[vport_index]; ib_device_set_netdev(&dev->ib_dev, NULL, vport_index + 1); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 635002e684a5..9fb0629978bd 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -3678,12 +3679,12 @@ static void mlx5e_lag_event_unregister(struct mlx5_ib_dev *dev) static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) { + struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_core_dev *mdev = dev->mdev; - struct mlx5_flow_namespace *ns = mlx5_get_flow_namespace(mdev, - MLX5_FLOW_NAMESPACE_LAG); - struct mlx5_flow_table *ft; + struct mlx5_flow_namespace *ns; int err; + ns = mlx5_get_flow_namespace(mdev, MLX5_FLOW_NAMESPACE_LAG); if (!ns || !mlx5_lag_is_active(mdev)) return 0; @@ -3691,14 +3692,15 @@ static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) if (err) return err; - ft = mlx5_create_lag_demux_flow_table(ns, 0, 0); - if (IS_ERR(ft)) { - err = PTR_ERR(ft); + ft_attr.level = 0; + ft_attr.prio = 0; + ft_attr.max_fte = dev->num_ports; + + err = mlx5_lag_demux_init(mdev, &ft_attr); + if (err) goto err_destroy_vport_lag; - } mlx5e_lag_event_register(dev); - dev->flow_db->lag_demux_ft = ft; dev->lag_ports = mlx5_lag_get_num_ports(mdev); dev->lag_active = true; return 0; @@ -3716,8 +3718,7 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev) dev->lag_active = false; mlx5e_lag_event_unregister(dev); - mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft); - dev->flow_db->lag_demux_ft = NULL; + mlx5_lag_demux_cleanup(mdev); mlx5_cmd_destroy_vport_lag(mdev); } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 4f4114d95130..3fc31415e107 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -306,7 +306,6 @@ struct mlx5_ib_flow_db { struct mlx5_ib_flow_prio rdma_rx[MLX5_IB_NUM_FLOW_FT]; struct mlx5_ib_flow_prio rdma_tx[MLX5_IB_NUM_FLOW_FT]; struct mlx5_ib_flow_prio opfcs[MLX5_IB_OPCOUNTER_MAX]; - struct mlx5_flow_table *lag_demux_ft; struct mlx5_ib_flow_prio *rdma_transport_rx[MLX5_RDMA_TRANSPORT_BYPASS_PRIO]; struct mlx5_ib_flow_prio *rdma_transport_tx[MLX5_RDMA_TRANSPORT_BYPASS_PRIO]; /* Protect flow steering bypass flow tables diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index 96309a732d50..9b729789537f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -940,6 +940,12 @@ int mlx5_esw_ipsec_vf_packet_offload_supported(struct mlx5_core_dev *dev, u16 vport_num); bool mlx5_esw_host_functions_enabled(const struct mlx5_core_dev *dev); void mlx5_eswitch_safe_aux_devs_remove(struct mlx5_core_dev *dev); +struct mlx5_flow_group * +mlx5_esw_lag_demux_fg_create(struct mlx5_eswitch *esw, + struct mlx5_flow_table *ft); +struct mlx5_flow_handle * +mlx5_esw_lag_demux_rule_create(struct mlx5_eswitch *esw, u16 vport_num, + struct mlx5_flow_table *lag_ft); #else /* CONFIG_MLX5_ESWITCH */ /* eswitch API stubs */ static inline int mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; } @@ -1025,6 +1031,12 @@ mlx5_esw_vport_vhca_id(struct mlx5_eswitch *esw, u16 vportn, u16 *vhca_id) static inline void mlx5_eswitch_safe_aux_devs_remove(struct mlx5_core_dev *dev) {} +static inline struct mlx5_flow_handle * +mlx5_esw_lag_demux_rule_create(struct mlx5_eswitch *esw, u16 vport_num, + struct mlx5_flow_table *lag_ft) +{ + return ERR_PTR(-EOPNOTSUPP); +} #endif /* CONFIG_MLX5_ESWITCH */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 90e6f97bdf4a..f98837470f39 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -1459,6 +1459,83 @@ esw_add_restore_rule(struct mlx5_eswitch *esw, u32 tag) return flow_rule; } +struct mlx5_flow_group * +mlx5_esw_lag_demux_fg_create(struct mlx5_eswitch *esw, + struct mlx5_flow_table *ft) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_flow_group *fg; + void *match_criteria; + void *flow_group_in; + + if (!mlx5_eswitch_vport_match_metadata_enabled(esw)) + return ERR_PTR(-EOPNOTSUPP); + + if (IS_ERR(ft)) + return ERR_CAST(ft); + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return ERR_PTR(-ENOMEM); + + match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, + match_criteria); + MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, + MLX5_MATCH_MISC_PARAMETERS_2); + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, + ft->max_fte - 1); + + MLX5_SET(fte_match_param, match_criteria, + misc_parameters_2.metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); + + fg = mlx5_create_flow_group(ft, flow_group_in); + kvfree(flow_group_in); + if (IS_ERR(fg)) + esw_warn(esw->dev, "Can't create LAG demux flow group\n"); + + return fg; +} + +struct mlx5_flow_handle * +mlx5_esw_lag_demux_rule_create(struct mlx5_eswitch *esw, u16 vport_num, + struct mlx5_flow_table *lag_ft) +{ + struct mlx5_flow_spec *spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + struct mlx5_flow_destination dest = {}; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *ret; + void *misc; + + if (!spec) + return ERR_PTR(-ENOMEM); + + if (!mlx5_eswitch_vport_match_metadata_enabled(esw)) { + kvfree(spec); + return ERR_PTR(-EOPNOTSUPP); + } + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, + misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_mask()); + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2; + + misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters_2); + MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0, + mlx5_eswitch_get_vport_metadata_for_match(esw, vport_num)); + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + dest.type = MLX5_FLOW_DESTINATION_TYPE_VHCA_RX; + dest.vhca.id = MLX5_CAP_GEN(esw->dev, vhca_id); + + ret = mlx5_add_flow_rules(lag_ft, spec, &flow_act, &dest, 1); + kvfree(spec); + return ret; +} + #define MAX_PF_SQ 256 #define MAX_SQ_NVPORTS 32 @@ -2047,7 +2124,8 @@ static int esw_create_vport_rx_group(struct mlx5_eswitch *esw) if (IS_ERR(g)) { err = PTR_ERR(g); - mlx5_core_warn(esw->dev, "Failed to create vport rx group err %d\n", err); + esw_warn(esw->dev, "Failed to create vport rx group err %d\n", + err); goto out; } @@ -2092,7 +2170,8 @@ static int esw_create_vport_rx_drop_group(struct mlx5_eswitch *esw) if (IS_ERR(g)) { err = PTR_ERR(g); - mlx5_core_warn(esw->dev, "Failed to create vport rx drop group err %d\n", err); + esw_warn(esw->dev, + "Failed to create vport rx drop group err %d\n", err); goto out; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 003d211306a7..61a6ba1e49dd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1438,15 +1438,9 @@ mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, struct mlx5_flow_table* mlx5_create_lag_demux_flow_table(struct mlx5_flow_namespace *ns, - int prio, u32 level) + struct mlx5_flow_table_attr *ft_attr) { - struct mlx5_flow_table_attr ft_attr = {}; - - ft_attr.level = level; - ft_attr.prio = prio; - ft_attr.max_fte = 1; - - return __mlx5_create_flow_table(ns, &ft_attr, FS_FT_OP_MOD_LAG_DEMUX, 0); + return __mlx5_create_flow_table(ns, ft_attr, FS_FT_OP_MOD_LAG_DEMUX, 0); } EXPORT_SYMBOL(mlx5_create_lag_demux_flow_table); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index 51ec8f61ecbb..449e4bd86c06 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -1471,6 +1471,158 @@ struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev) return devcom; } +static int mlx5_lag_demux_ft_fg_init(struct mlx5_core_dev *dev, + struct mlx5_flow_table_attr *ft_attr, + struct mlx5_lag *ldev) +{ +#ifdef CONFIG_MLX5_ESWITCH + struct mlx5_flow_namespace *ns; + struct mlx5_flow_group *fg; + int err; + + ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_LAG); + if (!ns) + return 0; + + ldev->lag_demux_ft = mlx5_create_flow_table(ns, ft_attr); + if (IS_ERR(ldev->lag_demux_ft)) + return PTR_ERR(ldev->lag_demux_ft); + + fg = mlx5_esw_lag_demux_fg_create(dev->priv.eswitch, + ldev->lag_demux_ft); + if (IS_ERR(fg)) { + err = PTR_ERR(fg); + mlx5_destroy_flow_table(ldev->lag_demux_ft); + ldev->lag_demux_ft = NULL; + return err; + } + + ldev->lag_demux_fg = fg; + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int mlx5_lag_demux_fw_init(struct mlx5_core_dev *dev, + struct mlx5_flow_table_attr *ft_attr, + struct mlx5_lag *ldev) +{ + struct mlx5_flow_namespace *ns; + int err; + + ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_LAG); + if (!ns) + return 0; + + ldev->lag_demux_fg = NULL; + ft_attr->max_fte = 1; + ldev->lag_demux_ft = mlx5_create_lag_demux_flow_table(ns, ft_attr); + if (IS_ERR(ldev->lag_demux_ft)) { + err = PTR_ERR(ldev->lag_demux_ft); + ldev->lag_demux_ft = NULL; + return err; + } + + return 0; +} + +int mlx5_lag_demux_init(struct mlx5_core_dev *dev, + struct mlx5_flow_table_attr *ft_attr) +{ + struct mlx5_lag *ldev; + + if (!ft_attr) + return -EINVAL; + + ldev = mlx5_lag_dev(dev); + if (!ldev) + return -ENODEV; + + xa_init(&ldev->lag_demux_rules); + + if (mlx5_get_sd(dev)) + return mlx5_lag_demux_ft_fg_init(dev, ft_attr, ldev); + + return mlx5_lag_demux_fw_init(dev, ft_attr, ldev); +} +EXPORT_SYMBOL(mlx5_lag_demux_init); + +void mlx5_lag_demux_cleanup(struct mlx5_core_dev *dev) +{ + struct mlx5_flow_handle *rule; + struct mlx5_lag *ldev; + unsigned long vport_num; + + ldev = mlx5_lag_dev(dev); + if (!ldev) + return; + + xa_for_each(&ldev->lag_demux_rules, vport_num, rule) + mlx5_del_flow_rules(rule); + xa_destroy(&ldev->lag_demux_rules); + + if (ldev->lag_demux_fg) + mlx5_destroy_flow_group(ldev->lag_demux_fg); + if (ldev->lag_demux_ft) + mlx5_destroy_flow_table(ldev->lag_demux_ft); + ldev->lag_demux_fg = NULL; + ldev->lag_demux_ft = NULL; +} +EXPORT_SYMBOL(mlx5_lag_demux_cleanup); + +int mlx5_lag_demux_rule_add(struct mlx5_core_dev *vport_dev, u16 vport_num, + int index) +{ + struct mlx5_flow_handle *rule; + struct mlx5_lag *ldev; + int err; + + ldev = mlx5_lag_dev(vport_dev); + if (!ldev || !ldev->lag_demux_fg) + return 0; + + if (xa_load(&ldev->lag_demux_rules, index)) + return 0; + + rule = mlx5_esw_lag_demux_rule_create(vport_dev->priv.eswitch, + vport_num, ldev->lag_demux_ft); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_warn(vport_dev, + "Failed to create LAG demux rule for vport %u, err %d\n", + vport_num, err); + return err; + } + + err = xa_err(xa_store(&ldev->lag_demux_rules, index, rule, + GFP_KERNEL)); + if (err) { + mlx5_del_flow_rules(rule); + mlx5_core_warn(vport_dev, + "Failed to store LAG demux rule for vport %u, err %d\n", + vport_num, err); + } + + return err; +} +EXPORT_SYMBOL(mlx5_lag_demux_rule_add); + +void mlx5_lag_demux_rule_del(struct mlx5_core_dev *dev, int index) +{ + struct mlx5_flow_handle *rule; + struct mlx5_lag *ldev; + + ldev = mlx5_lag_dev(dev); + if (!ldev || !ldev->lag_demux_fg) + return; + + rule = xa_erase(&ldev->lag_demux_rules, index); + if (rule) + mlx5_del_flow_rules(rule); +} +EXPORT_SYMBOL(mlx5_lag_demux_rule_del); + static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay) { queue_delayed_work(ldev->wq, &ldev->bond_work, delay); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h index 30cbd61768f8..6c911374f409 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h @@ -5,6 +5,9 @@ #define __MLX5_LAG_H__ #include +#include +#include +#include #define MLX5_LAG_MAX_HASH_BUCKETS 16 /* XArray mark for the LAG master device @@ -83,6 +86,9 @@ struct mlx5_lag { /* Protect lag fields/state changes */ struct mutex lock; struct lag_mpesw lag_mpesw; + struct mlx5_flow_table *lag_demux_ft; + struct mlx5_flow_group *lag_demux_fg; + struct xarray lag_demux_rules; }; static inline struct mlx5_lag * @@ -133,6 +139,12 @@ mlx5_lag_is_ready(struct mlx5_lag *ldev) bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev); bool mlx5_lag_check_prereq(struct mlx5_lag *ldev); +int mlx5_lag_demux_init(struct mlx5_core_dev *dev, + struct mlx5_flow_table_attr *ft_attr); +void mlx5_lag_demux_cleanup(struct mlx5_core_dev *dev); +int mlx5_lag_demux_rule_add(struct mlx5_core_dev *dev, u16 vport_num, + int vport_index); +void mlx5_lag_demux_rule_del(struct mlx5_core_dev *dev, int vport_index); void mlx5_modify_lag(struct mlx5_lag *ldev, struct lag_tracker *tracker); int mlx5_activate_lag(struct mlx5_lag *ldev, diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 02064424e868..d8f3b7ef319e 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -252,9 +252,9 @@ mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns, struct mlx5_flow_table * mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, struct mlx5_flow_table_attr *ft_attr, u16 vport); -struct mlx5_flow_table *mlx5_create_lag_demux_flow_table( - struct mlx5_flow_namespace *ns, - int prio, u32 level); +struct mlx5_flow_table * +mlx5_create_lag_demux_flow_table(struct mlx5_flow_namespace *ns, + struct mlx5_flow_table_attr *ft_attr); int mlx5_destroy_flow_table(struct mlx5_flow_table *ft); /* inbox should be set with the following values: diff --git a/include/linux/mlx5/lag.h b/include/linux/mlx5/lag.h index d370dfd19055..ab9f754664e5 100644 --- a/include/linux/mlx5/lag.h +++ b/include/linux/mlx5/lag.h @@ -4,8 +4,18 @@ #ifndef __MLX5_LAG_API_H__ #define __MLX5_LAG_API_H__ +#include + struct mlx5_core_dev; +struct mlx5_flow_table; +struct mlx5_flow_table_attr; +int mlx5_lag_demux_init(struct mlx5_core_dev *dev, + struct mlx5_flow_table_attr *ft_attr); +void mlx5_lag_demux_cleanup(struct mlx5_core_dev *dev); +int mlx5_lag_demux_rule_add(struct mlx5_core_dev *dev, u16 vport_num, + int vport_index); +void mlx5_lag_demux_rule_del(struct mlx5_core_dev *dev, int vport_index); int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev); #endif /* __MLX5_LAG_API_H__ */ -- cgit v1.2.3 From 4dd2115f43594da5271a1aa34fde6719b4259047 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 9 Mar 2026 11:34:35 +0200 Subject: net/mlx5: Expose MLX5_UMR_ALIGN definition Expose HW constant value in a shared header, to be used by core/EN drivers. Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20260309093435.1850724-10-tariqt@nvidia.com Reviewed-by: Dragos Tatulea Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mr.c | 1 - include/linux/mlx5/device.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 665323b90b64..ff56948597dd 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -51,7 +51,6 @@ enum { }; #define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4 -#define MLX5_UMR_ALIGN 2048 static void create_mkey_callback(int status, struct mlx5_async_work *context); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 25c6b42140b2..07a25f264292 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -293,6 +293,7 @@ enum { MLX5_UMR_INLINE = (1 << 7), }; +#define MLX5_UMR_ALIGN (2048) #define MLX5_UMR_FLEX_ALIGNMENT 0x40 #define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_mtt)) #define MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_klm)) -- cgit v1.2.3 From f1a424e21c15993db0f9594cda17ef5d516ab3e9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 14 Mar 2026 08:41:04 -0600 Subject: io_uring: switch struct io_ring_ctx internal bitfields to flags Bitfields cannot be set and checked atomically, and this makes it more clear that these are indeed in shared storage and must be checked and set in a sane fashion. This is in preparation for annotating a few of the known racy, but harmless, flags checking. No intended functional changes in this patch. Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 34 ++++++++++-------- io_uring/eventfd.c | 4 +-- io_uring/io_uring.c | 82 +++++++++++++++++++++--------------------- io_uring/io_uring.h | 9 ++--- io_uring/msg_ring.c | 2 +- io_uring/register.c | 8 ++--- io_uring/rsrc.c | 8 ++--- io_uring/tctx.c | 2 +- io_uring/timeout.c | 4 +-- io_uring/tw.c | 2 +- 10 files changed, 82 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index dd1420bfcb73..0b3f08adc217 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -268,24 +268,30 @@ struct io_alloc_cache { unsigned int init_clear; }; +enum { + IO_RING_F_DRAIN_NEXT = BIT(0), + IO_RING_F_OP_RESTRICTED = BIT(1), + IO_RING_F_REG_RESTRICTED = BIT(2), + IO_RING_F_OFF_TIMEOUT_USED = BIT(3), + IO_RING_F_DRAIN_ACTIVE = BIT(4), + IO_RING_F_HAS_EVFD = BIT(5), + /* all CQEs should be posted only by the submitter task */ + IO_RING_F_TASK_COMPLETE = BIT(6), + IO_RING_F_LOCKLESS_CQ = BIT(7), + IO_RING_F_SYSCALL_IOPOLL = BIT(8), + IO_RING_F_POLL_ACTIVATED = BIT(9), + IO_RING_F_DRAIN_DISABLED = BIT(10), + IO_RING_F_COMPAT = BIT(11), + IO_RING_F_IOWQ_LIMITS_SET = BIT(12), +}; + struct io_ring_ctx { /* const or read-mostly hot data */ struct { + /* ring setup flags */ unsigned int flags; - unsigned int drain_next: 1; - unsigned int op_restricted: 1; - unsigned int reg_restricted: 1; - unsigned int off_timeout_used: 1; - unsigned int drain_active: 1; - unsigned int has_evfd: 1; - /* all CQEs should be posted only by the submitter task */ - unsigned int task_complete: 1; - unsigned int lockless_cq: 1; - unsigned int syscall_iopoll: 1; - unsigned int poll_activated: 1; - unsigned int drain_disabled: 1; - unsigned int compat: 1; - unsigned int iowq_limits_set : 1; + /* internal state flags IO_RING_F_* flags , mostly read-only */ + unsigned int int_flags; struct task_struct *submitter_task; struct io_rings *rings; diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index 7482a7dc6b38..3da028500f76 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -148,7 +148,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, spin_unlock(&ctx->completion_lock); ev_fd->eventfd_async = eventfd_async; - ctx->has_evfd = true; + ctx->int_flags |= IO_RING_F_HAS_EVFD; refcount_set(&ev_fd->refs, 1); atomic_set(&ev_fd->ops, 0); rcu_assign_pointer(ctx->io_ev_fd, ev_fd); @@ -162,7 +162,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx) ev_fd = rcu_dereference_protected(ctx->io_ev_fd, lockdep_is_held(&ctx->uring_lock)); if (ev_fd) { - ctx->has_evfd = false; + ctx->int_flags &= ~IO_RING_F_HAS_EVFD; rcu_assign_pointer(ctx->io_ev_fd, NULL); io_eventfd_put(ev_fd); return 0; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9a37035e76c0..bfeb3bc3849d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -477,17 +477,17 @@ static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (ctx->poll_activated) + if (ctx->int_flags & IO_RING_F_POLL_ACTIVATED) io_poll_wq_wake(ctx); - if (ctx->off_timeout_used) + if (ctx->int_flags & IO_RING_F_OFF_TIMEOUT_USED) io_flush_timeouts(ctx); - if (ctx->has_evfd) + if (ctx->int_flags & IO_RING_F_HAS_EVFD) io_eventfd_signal(ctx, true); } static inline void __io_cq_lock(struct io_ring_ctx *ctx) { - if (!ctx->lockless_cq) + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) spin_lock(&ctx->completion_lock); } @@ -500,11 +500,11 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx) static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) { io_commit_cqring(ctx); - if (!ctx->task_complete) { - if (!ctx->lockless_cq) + if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) { + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) spin_unlock(&ctx->completion_lock); /* IOPOLL rings only need to wake up if it's also SQPOLL */ - if (!ctx->syscall_iopoll) + if (!(ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL)) io_cqring_wake(ctx); } io_commit_cqring_flush(ctx); @@ -830,7 +830,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { lockdep_assert_held(&ctx->uring_lock); - lockdep_assert(ctx->lockless_cq); + lockdep_assert(ctx->int_flags & IO_RING_F_LOCKLESS_CQ); if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { struct io_cqe cqe = io_init_cqe(user_data, res, cflags); @@ -860,7 +860,7 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) lockdep_assert(!io_wq_current_is_worker()); lockdep_assert_held(&ctx->uring_lock); - if (!ctx->lockless_cq) { + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) { spin_lock(&ctx->completion_lock); posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); spin_unlock(&ctx->completion_lock); @@ -885,7 +885,7 @@ bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe cqe[2]) lockdep_assert_held(&ctx->uring_lock); cqe[0].user_data = req->cqe.user_data; - if (!ctx->lockless_cq) { + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) { spin_lock(&ctx->completion_lock); posted = io_fill_cqe_aux32(ctx, cqe); spin_unlock(&ctx->completion_lock); @@ -913,7 +913,7 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires * the submitter task context, IOPOLL protects with uring_lock. */ - if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) { + if ((ctx->int_flags & IO_RING_F_LOCKLESS_CQ) || (req->flags & REQ_F_REISSUE)) { defer_complete: req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); @@ -1135,7 +1135,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) */ if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { - if (ctx->lockless_cq) + if (ctx->int_flags & IO_RING_F_LOCKLESS_CQ) io_cqe_overflow(ctx, &req->cqe, &req->big_cqe); else io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe); @@ -1148,7 +1148,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) INIT_WQ_LIST(&state->compl_reqs); } - if (unlikely(ctx->drain_active)) + if (unlikely(ctx->int_flags & IO_RING_F_DRAIN_ACTIVE)) io_queue_deferred(ctx); ctx->submit_state.cq_flush = false; @@ -1344,7 +1344,7 @@ static __cold void io_drain_req(struct io_kiocb *req) list_add_tail(&de->list, &ctx->defer_list); io_queue_deferred(ctx); if (!drain && list_empty(&ctx->defer_list)) - ctx->drain_active = false; + ctx->int_flags &= ~IO_RING_F_DRAIN_ACTIVE; } static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, @@ -1655,7 +1655,7 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) } else { /* can't fail with IO_URING_F_INLINE */ io_req_sqe_copy(req, IO_URING_F_INLINE); - if (unlikely(req->ctx->drain_active)) + if (unlikely(req->ctx->int_flags & IO_RING_F_DRAIN_ACTIVE)) io_drain_req(req); else io_queue_iowq(req); @@ -1671,7 +1671,7 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags) { - if (!ctx->op_restricted) + if (!(ctx->int_flags & IO_RING_F_OP_RESTRICTED)) return true; if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) return false; @@ -1691,7 +1691,7 @@ static void io_init_drain(struct io_ring_ctx *ctx) { struct io_kiocb *head = ctx->submit_state.link.head; - ctx->drain_active = true; + ctx->int_flags |= IO_RING_F_DRAIN_ACTIVE; if (head) { /* * If we need to drain a request in the middle of a link, drain @@ -1701,7 +1701,7 @@ static void io_init_drain(struct io_ring_ctx *ctx) * link. */ head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; - ctx->drain_next = true; + ctx->int_flags |= IO_RING_F_DRAIN_NEXT; } } @@ -1767,23 +1767,23 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->buf_index = READ_ONCE(sqe->buf_group); } if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) - ctx->drain_disabled = true; + ctx->int_flags |= IO_RING_F_DRAIN_DISABLED; if (sqe_flags & IOSQE_IO_DRAIN) { - if (ctx->drain_disabled) + if (ctx->int_flags & IO_RING_F_DRAIN_DISABLED) return io_init_fail_req(req, -EOPNOTSUPP); io_init_drain(ctx); } } - if (unlikely(ctx->op_restricted || ctx->drain_active || ctx->drain_next)) { + if (unlikely(ctx->int_flags & (IO_RING_F_OP_RESTRICTED | IO_RING_F_DRAIN_ACTIVE | IO_RING_F_DRAIN_NEXT))) { if (!io_check_restriction(ctx, req, sqe_flags)) return io_init_fail_req(req, -EACCES); /* knock it to the slow queue path, will be drained there */ - if (ctx->drain_active) + if (ctx->int_flags & IO_RING_F_DRAIN_ACTIVE) req->flags |= REQ_F_FORCE_ASYNC; /* if there is no link, we're at "next" request and need to drain */ - if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { - ctx->drain_next = false; - ctx->drain_active = true; + if (unlikely(ctx->int_flags & IO_RING_F_DRAIN_NEXT) && !ctx->submit_state.link.head) { + ctx->int_flags &= ~IO_RING_F_DRAIN_NEXT; + ctx->int_flags |= IO_RING_F_DRAIN_ACTIVE; req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; } } @@ -2204,7 +2204,7 @@ static __cold void io_activate_pollwq_cb(struct callback_head *cb) poll_wq_task_work); mutex_lock(&ctx->uring_lock); - ctx->poll_activated = true; + ctx->int_flags |= IO_RING_F_POLL_ACTIVATED; mutex_unlock(&ctx->uring_lock); /* @@ -2219,9 +2219,9 @@ __cold void io_activate_pollwq(struct io_ring_ctx *ctx) { spin_lock(&ctx->completion_lock); /* already activated or in progress */ - if (ctx->poll_activated || ctx->poll_wq_task_work.func) + if ((ctx->int_flags & IO_RING_F_POLL_ACTIVATED) || ctx->poll_wq_task_work.func) goto out; - if (WARN_ON_ONCE(!ctx->task_complete)) + if (WARN_ON_ONCE(!(ctx->int_flags & IO_RING_F_TASK_COMPLETE))) goto out; if (!ctx->submitter_task) goto out; @@ -2242,7 +2242,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; - if (unlikely(!ctx->poll_activated)) + if (unlikely(!(ctx->int_flags & IO_RING_F_POLL_ACTIVATED))) io_activate_pollwq(ctx); /* * provides mb() which pairs with barrier from wq_has_sleeper @@ -2607,7 +2607,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto out; } if (flags & IORING_ENTER_GETEVENTS) { - if (ctx->syscall_iopoll) + if (ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL) goto iopoll_locked; /* * Ignore errors, we'll soon call io_cqring_wait() and @@ -2622,7 +2622,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (flags & IORING_ENTER_GETEVENTS) { int ret2; - if (ctx->syscall_iopoll) { + if (ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL) { /* * We disallow the app entering submit/complete with * polling, but we still need to lock the ring to @@ -2923,9 +2923,9 @@ static void io_ctx_restriction_clone(struct io_ring_ctx *ctx, if (dst->bpf_filters) WRITE_ONCE(ctx->bpf_filters, dst->bpf_filters->filters); if (dst->op_registered) - ctx->op_restricted = 1; + ctx->int_flags |= IO_RING_F_OP_RESTRICTED; if (dst->reg_registered) - ctx->reg_restricted = 1; + ctx->int_flags |= IO_RING_F_REG_RESTRICTED; } static __cold int io_uring_create(struct io_ctx_config *config) @@ -2952,17 +2952,18 @@ static __cold int io_uring_create(struct io_ctx_config *config) if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && !(ctx->flags & IORING_SETUP_IOPOLL)) - ctx->task_complete = true; + ctx->int_flags |= IO_RING_F_TASK_COMPLETE; - if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) - ctx->lockless_cq = true; + if ((ctx->int_flags & IO_RING_F_TASK_COMPLETE) || + (ctx->flags & IORING_SETUP_IOPOLL)) + ctx->int_flags |= IO_RING_F_LOCKLESS_CQ; /* * lazy poll_wq activation relies on ->task_complete for synchronisation * purposes, see io_activate_pollwq() */ - if (!ctx->task_complete) - ctx->poll_activated = true; + if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) + ctx->int_flags |= IO_RING_F_POLL_ACTIVATED; /* * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user @@ -2972,9 +2973,10 @@ static __cold int io_uring_create(struct io_ctx_config *config) */ if (ctx->flags & IORING_SETUP_IOPOLL && !(ctx->flags & IORING_SETUP_SQPOLL)) - ctx->syscall_iopoll = 1; + ctx->int_flags |= IO_RING_F_SYSCALL_IOPOLL; - ctx->compat = in_compat_syscall(); + if (in_compat_syscall()) + ctx->int_flags |= IO_RING_F_COMPAT; if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK)) ctx->user = get_uid(current_user()); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 0fa844faf287..5cb1983043cd 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -211,7 +211,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) if (ctx->flags & IORING_SETUP_IOPOLL) { lockdep_assert_held(&ctx->uring_lock); - } else if (!ctx->task_complete) { + } else if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) { lockdep_assert_held(&ctx->completion_lock); } else if (ctx->submitter_task) { /* @@ -228,7 +228,7 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) static inline bool io_is_compat(struct io_ring_ctx *ctx) { - return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->compat); + return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->int_flags & IO_RING_F_COMPAT); } static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) @@ -472,8 +472,9 @@ static inline void io_req_complete_defer(struct io_kiocb *req) static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || - ctx->has_evfd || ctx->poll_activated)) + if (unlikely(ctx->int_flags & (IO_RING_F_OFF_TIMEOUT_USED | + IO_RING_F_HAS_EVFD | + IO_RING_F_POLL_ACTIVATED))) __io_commit_cqring_flush(ctx); } diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 57ad0085869a..3ff9098573db 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -67,7 +67,7 @@ void io_msg_ring_cleanup(struct io_kiocb *req) static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) { - return target_ctx->task_complete; + return target_ctx->int_flags & IO_RING_F_TASK_COMPLETE; } static void io_msg_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw) diff --git a/io_uring/register.c b/io_uring/register.c index 0148735f7711..489a6feaf228 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -184,9 +184,9 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx, return ret; } if (ctx->restrictions.op_registered) - ctx->op_restricted = 1; + ctx->int_flags |= IO_RING_F_OP_RESTRICTED; if (ctx->restrictions.reg_registered) - ctx->reg_restricted = 1; + ctx->int_flags |= IO_RING_F_REG_RESTRICTED; return 0; } @@ -384,7 +384,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, for (i = 0; i < ARRAY_SIZE(new_count); i++) if (new_count[i]) ctx->iowq_limits[i] = new_count[i]; - ctx->iowq_limits_set = true; + ctx->int_flags |= IO_RING_F_IOWQ_LIMITS_SET; if (tctx && tctx->io_wq) { ret = io_wq_max_workers(tctx->io_wq, new_count); @@ -725,7 +725,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, if (ctx->submitter_task && ctx->submitter_task != current) return -EEXIST; - if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) { + if ((ctx->int_flags & IO_RING_F_REG_RESTRICTED) && !(ctx->flags & IORING_SETUP_R_DISABLED)) { opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); if (!test_bit(opcode, ctx->restrictions.register_op)) return -EACCES; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 4fa59bf89bba..52554ed89b11 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -295,7 +295,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, u64 tag = 0; uvec = u64_to_user_ptr(user_data); - iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); + iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx)); if (IS_ERR(iov)) { err = PTR_ERR(iov); break; @@ -319,7 +319,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, i = array_index_nospec(up->offset + done, ctx->buf_table.nr); io_reset_rsrc_node(ctx, &ctx->buf_table, i); ctx->buf_table.nodes[i] = node; - if (ctx->compat) + if (io_is_compat(ctx)) user_data += sizeof(struct compat_iovec); else user_data += sizeof(struct iovec); @@ -883,12 +883,12 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, if (arg) { uvec = (struct iovec __user *) arg; - iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); + iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx)); if (IS_ERR(iov)) { ret = PTR_ERR(iov); break; } - if (ctx->compat) + if (io_is_compat(ctx)) arg += sizeof(struct compat_iovec); else arg += sizeof(struct iovec); diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 7cbcb82aedfb..143de8e990eb 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -121,7 +121,7 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) return ret; tctx = current->io_uring; - if (ctx->iowq_limits_set) { + if (ctx->int_flags & IO_RING_F_IOWQ_LIMITS_SET) { unsigned int limits[2] = { ctx->iowq_limits[0], ctx->iowq_limits[1], }; diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 8eddf8add7a2..579fdddac71a 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -566,8 +566,8 @@ static int __io_timeout_prep(struct io_kiocb *req, INIT_LIST_HEAD(&timeout->list); timeout->off = off; - if (unlikely(off && !req->ctx->off_timeout_used)) - req->ctx->off_timeout_used = true; + if (unlikely(off && !(req->ctx->int_flags & IO_RING_F_OFF_TIMEOUT_USED))) + req->ctx->int_flags |= IO_RING_F_OFF_TIMEOUT_USED; /* * for multishot reqs w/ fixed nr of repeats, repeats tracks the * remaining nr diff --git a/io_uring/tw.c b/io_uring/tw.c index 2f2b4ac4b126..022fe8753c19 100644 --- a/io_uring/tw.c +++ b/io_uring/tw.c @@ -222,7 +222,7 @@ void io_req_local_work_add(struct io_kiocb *req, unsigned flags) if (!head) { io_ctx_mark_taskrun(ctx); - if (ctx->has_evfd) + if (ctx->int_flags & IO_RING_F_HAS_EVFD) io_eventfd_signal(ctx, false); } -- cgit v1.2.3 From 9165dc4fa969b64c2d4396ee4e1546a719978dd1 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Mon, 2 Mar 2026 10:29:10 -0700 Subject: io_uring: add REQ_F_IOPOLL A subsequent commit will allow uring_cmds to files that don't implement ->uring_cmd_iopoll() to be issued to IORING_SETUP_IOPOLL io_urings. This means the ctx's IORING_SETUP_IOPOLL flag isn't sufficient to determine whether a given request needs to be iopolled. Introduce a request flag REQ_F_IOPOLL set in ->issue() if a request needs to be iopolled to completion. Set the flag in io_rw_init_file() and io_uring_cmd() for requests issued to IORING_SETUP_IOPOLL ctxs. Use the request flag instead of IORING_SETUP_IOPOLL in places dealing with a specific request. A future possibility would be to add an option to enable/disable iopoll in the io_uring SQE instead of determining it from IORING_SETUP_IOPOLL. Signed-off-by: Caleb Sander Mateos Reviewed-by: Kanchan Joshi Reviewed-by: Anuj Gupta Link: https://patch.msgid.link/20260302172914.2488599-2-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ io_uring/io_uring.c | 9 ++++----- io_uring/rw.c | 11 ++++++----- io_uring/uring_cmd.c | 5 +++-- 4 files changed, 16 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 0b3f08adc217..4dbd7083dd54 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -550,6 +550,7 @@ enum { REQ_F_HAS_METADATA_BIT, REQ_F_IMPORT_BUFFER_BIT, REQ_F_SQE_COPIED_BIT, + REQ_F_IOPOLL_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -641,6 +642,8 @@ enum { REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT), /* ->sqe_copy() has been called, if necessary */ REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT), + /* request must be iopolled to completion (set in ->issue()) */ + REQ_F_IOPOLL = IO_REQ_FLAG(REQ_F_IOPOLL_BIT), }; struct io_tw_req { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fb5a263706be..a610eaa5fd7c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -356,7 +356,6 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) static void io_prep_async_work(struct io_kiocb *req) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; - struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_CREDS)) { req->flags |= REQ_F_CREDS; @@ -378,7 +377,7 @@ static void io_prep_async_work(struct io_kiocb *req) if (should_hash && (req->file->f_flags & O_DIRECT) && (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE)) should_hash = false; - if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL)) + if (should_hash || (req->flags & REQ_F_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) @@ -1419,7 +1418,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) ret = 0; /* If the op doesn't have a file, we're not polling for it */ - if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) + if ((req->flags & REQ_F_IOPOLL) && def->iopoll_queue) io_iopoll_req_issued(req, issue_flags); } return ret; @@ -1435,7 +1434,7 @@ int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw) io_tw_lock(req->ctx, tw); WARN_ON_ONCE(!req->file); - if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (WARN_ON_ONCE(req->flags & REQ_F_IOPOLL)) return -EFAULT; ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]); @@ -1533,7 +1532,7 @@ fail: * wait for request slots on the block side. */ if (!needs_poll) { - if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (!(req->flags & REQ_F_IOPOLL)) break; if (io_wq_worker_stopped()) break; diff --git a/io_uring/rw.c b/io_uring/rw.c index 1a5f262734e8..3bdb9914e673 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -504,7 +504,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req) if (!S_ISBLK(mode) && !S_ISREG(mode)) return false; if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && - !(ctx->flags & IORING_SETUP_IOPOLL))) + !(req->flags & REQ_F_IOPOLL))) return false; /* * If ref is dying, we might be running poll reap from the exit work. @@ -640,7 +640,7 @@ static inline void io_rw_done(struct io_kiocb *req, ssize_t ret) } } - if (req->ctx->flags & IORING_SETUP_IOPOLL) + if (req->flags & REQ_F_IOPOLL) io_complete_rw_iopoll(&rw->kiocb, ret); else io_complete_rw(&rw->kiocb, ret); @@ -654,7 +654,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, if (ret >= 0 && req->flags & REQ_F_CUR_POS) req->file->f_pos = rw->kiocb.ki_pos; - if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) { + if (ret >= 0 && !(req->flags & REQ_F_IOPOLL)) { u32 cflags = 0; __io_complete_rw_common(req, ret); @@ -876,6 +876,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) return -EOPNOTSUPP; + req->flags |= REQ_F_IOPOLL; kiocb->private = NULL; kiocb->ki_flags |= IOCB_HIPRI; req->iopoll_completed = 0; @@ -963,7 +964,7 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel, if (io_file_can_poll(req)) return -EAGAIN; /* IOPOLL retry should happen for io-wq threads */ - if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (!force_nonblock && !(req->flags & REQ_F_IOPOLL)) goto done; /* no retry on NONBLOCK nor RWF_NOWAIT */ if (req->flags & REQ_F_NOWAIT) @@ -1188,7 +1189,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) goto done; if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ - if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) + if (ret2 == -EAGAIN && (req->flags & REQ_F_IOPOLL)) goto ret_eagain; if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) { diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index ee7b49f47cb5..b651c63f6e20 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -110,7 +110,7 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, * because iopoll completion data overlaps with the hash_node used * for tracking. */ - if (ctx->flags & IORING_SETUP_IOPOLL) + if (req->flags & REQ_F_IOPOLL) return; if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) { @@ -167,7 +167,7 @@ void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, io_req_set_cqe32_extra(req, res2, 0); } io_req_uring_cleanup(req, issue_flags); - if (req->ctx->flags & IORING_SETUP_IOPOLL) { + if (req->flags & REQ_F_IOPOLL) { /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); } else if (issue_flags & IO_URING_F_COMPLETE_DEFER) { @@ -260,6 +260,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) if (ctx->flags & IORING_SETUP_IOPOLL) { if (!file->f_op->uring_cmd_iopoll) return -EOPNOTSUPP; + req->flags |= REQ_F_IOPOLL; issue_flags |= IO_URING_F_IOPOLL; req->iopoll_completed = 0; if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) { -- cgit v1.2.3 From 033af2b3eb19c5ed96825572105bca3611635ada Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 26 Feb 2026 12:48:38 +0000 Subject: io_uring: introduce callback driven main loop The io_uring_enter() has a fixed order of execution: it submits requests, waits for completions, and returns to the user. Allow to optionally replace it with a custom loop driven by a callback called loop_step. The basic requirements to the callback is that it should be able to submit requests, wait for completions, parse them and repeat. Most of the communication including parameter passing can be implemented via shared memory. The callback should return IOU_LOOP_CONTINUE to continue execution or IOU_LOOP_STOP to return to the user space. Note that the kernel may decide to prematurely terminate it as well, e.g. in case the process was signalled or killed. The hook takes a structure with parameters. It can be used to ask the kernel to wait for CQEs by setting cq_wait_idx to the CQE index it wants to wait for. Spurious wake ups are possible and even likely, the callback is expected to handle it. There will be more parameters in the future like timeout. It can be used with kernel callbacks, for example, as a slow path deprecation mechanism overwiting SQEs and emulating the wanted behaviour, however it's more useful together with BPF programs implemented in following patches. Note that keeping it separately from the normal io_uring wait loop makes things much simpler and cleaner. It keeps it in one place instead of spreading a bunch of checks in different places including disabling the submission path. It holds the lock by default, which is a better fit for BPF synchronisation and the loop execution model. It nicely avoids existing quirks like forced wake ups on timeout request completion. And it should be easier to implement new features. Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/a2d369aa1c9dd23ad7edac9220cffc563abcaed6.1772109579.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +++ io_uring/Makefile | 2 +- io_uring/io_uring.c | 11 +++++ io_uring/loop.c | 91 ++++++++++++++++++++++++++++++++++++++++++ io_uring/loop.h | 27 +++++++++++++ io_uring/wait.h | 1 + 6 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 io_uring/loop.c create mode 100644 io_uring/loop.h (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 4dbd7083dd54..344b634b8989 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -41,6 +41,8 @@ enum io_uring_cmd_flags { IO_URING_F_COMPAT = (1 << 12), }; +struct iou_loop_params; + struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -361,6 +363,9 @@ struct io_ring_ctx { struct io_alloc_cache rw_cache; struct io_alloc_cache cmd_cache; + int (*loop_step)(struct io_ring_ctx *ctx, + struct iou_loop_params *); + /* * Any cancelable uring_cmd is added to this list in * ->uring_cmd() by io_uring_cmd_insert_cancelable() diff --git a/io_uring/Makefile b/io_uring/Makefile index 931f9156132a..1c1f47de32a4 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ advise.o openclose.o statx.o timeout.o \ cancel.o waitid.o register.o \ truncate.o memmap.o alloc_cache.o \ - query.o + query.o loop.o obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 74cd62b44d94..960d36c49ffe 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -95,6 +95,7 @@ #include "eventfd.h" #include "wait.h" #include "bpf_filter.h" +#include "loop.h" #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) @@ -588,6 +589,11 @@ void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } +void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx) +{ + __io_cqring_overflow_flush(ctx, false); +} + /* must to be called somewhat shortly after putting a request */ static inline void io_put_task(struct io_kiocb *req) { @@ -2571,6 +2577,11 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (unlikely(smp_load_acquire(&ctx->flags) & IORING_SETUP_R_DISABLED)) goto out; + if (io_has_loop_ops(ctx)) { + ret = io_run_loop(ctx); + goto out; + } + /* * For SQ polling, the thread will do all submissions and completions. * Just return the requested submit count, and wake the thread if diff --git a/io_uring/loop.c b/io_uring/loop.c new file mode 100644 index 000000000000..31843cc3e451 --- /dev/null +++ b/io_uring/loop.c @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include "io_uring.h" +#include "wait.h" +#include "loop.h" + +static inline int io_loop_nr_cqes(const struct io_ring_ctx *ctx, + const struct iou_loop_params *lp) +{ + return lp->cq_wait_idx - READ_ONCE(ctx->rings->cq.tail); +} + +static inline void io_loop_wait_start(struct io_ring_ctx *ctx, unsigned nr_wait) +{ + atomic_set(&ctx->cq_wait_nr, nr_wait); + set_current_state(TASK_INTERRUPTIBLE); +} + +static inline void io_loop_wait_finish(struct io_ring_ctx *ctx) +{ + __set_current_state(TASK_RUNNING); + atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); +} + +static void io_loop_wait(struct io_ring_ctx *ctx, struct iou_loop_params *lp, + unsigned nr_wait) +{ + io_loop_wait_start(ctx, nr_wait); + + if (unlikely(io_local_work_pending(ctx) || + io_loop_nr_cqes(ctx, lp) <= 0) || + READ_ONCE(ctx->check_cq)) { + io_loop_wait_finish(ctx); + return; + } + + mutex_unlock(&ctx->uring_lock); + schedule(); + io_loop_wait_finish(ctx); + mutex_lock(&ctx->uring_lock); +} + +static int __io_run_loop(struct io_ring_ctx *ctx) +{ + struct iou_loop_params lp = {}; + + while (true) { + int nr_wait, step_res; + + if (unlikely(!ctx->loop_step)) + return -EFAULT; + + step_res = ctx->loop_step(ctx, &lp); + if (step_res == IOU_LOOP_STOP) + break; + if (step_res != IOU_LOOP_CONTINUE) + return -EINVAL; + + nr_wait = io_loop_nr_cqes(ctx, &lp); + if (nr_wait > 0) + io_loop_wait(ctx, &lp, nr_wait); + else + nr_wait = 0; + + if (task_work_pending(current)) { + mutex_unlock(&ctx->uring_lock); + io_run_task_work(); + mutex_lock(&ctx->uring_lock); + } + if (unlikely(task_sigpending(current))) + return -EINTR; + io_run_local_work_locked(ctx, nr_wait); + + if (READ_ONCE(ctx->check_cq) & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) + io_cqring_overflow_flush_locked(ctx); + } + + return 0; +} + +int io_run_loop(struct io_ring_ctx *ctx) +{ + int ret; + + if (!io_allowed_run_tw(ctx)) + return -EEXIST; + + mutex_lock(&ctx->uring_lock); + ret = __io_run_loop(ctx); + mutex_unlock(&ctx->uring_lock); + return ret; +} diff --git a/io_uring/loop.h b/io_uring/loop.h new file mode 100644 index 000000000000..d7718b9ce61e --- /dev/null +++ b/io_uring/loop.h @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_LOOP_H +#define IOU_LOOP_H + +#include + +struct iou_loop_params { + /* + * The CQE index to wait for. Only serves as a hint and can still be + * woken up earlier. + */ + __u32 cq_wait_idx; +}; + +enum { + IOU_LOOP_CONTINUE = 0, + IOU_LOOP_STOP, +}; + +static inline bool io_has_loop_ops(struct io_ring_ctx *ctx) +{ + return data_race(ctx->loop_step); +} + +int io_run_loop(struct io_ring_ctx *ctx); + +#endif diff --git a/io_uring/wait.h b/io_uring/wait.h index 5e236f74e1af..037e512dd80c 100644 --- a/io_uring/wait.h +++ b/io_uring/wait.h @@ -25,6 +25,7 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, struct ext_arg *ext_arg); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx); +void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx); static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) { -- cgit v1.2.3 From 98f37634b12b17ad5c56db8fb63cf9d7dc55d74c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 26 Feb 2026 12:48:41 +0000 Subject: io_uring/bpf-ops: implement bpf ops registration Implement BPF struct ops registration. It's registered off the BPF path, and can be removed by BPF as well as io_uring. To protect it, introduce a global lock synchronising registration. ctx->uring_lock can be nested under it. ctx->bpf_ops is write protected by both locks and so it's safe to read it under either of them. Signed-off-by: Pavel Begunkov Link: https://patch.msgid.link/1f46bffd76008de49cbafa2ad77d348810a4f69e.1772109579.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +++ io_uring/bpf-ops.c | 92 +++++++++++++++++++++++++++++++++++++++++- io_uring/bpf-ops.h | 8 ++++ io_uring/io_uring.c | 1 + 4 files changed, 104 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 344b634b8989..28e5dbdac55b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -8,6 +8,9 @@ #include #include +struct iou_loop_params; +struct io_uring_bpf_ops; + enum { /* * A hint to not wake right away but delay until there are enough of @@ -488,6 +491,8 @@ struct io_ring_ctx { DECLARE_HASHTABLE(napi_ht, 4); #endif + struct io_uring_bpf_ops *bpf_ops; + /* * Protection for resize vs mmap races - both the mmap and resize * side will need to grab this lock, to prevent either side from diff --git a/io_uring/bpf-ops.c b/io_uring/bpf-ops.c index 17518f4ecca9..e4b244337aa9 100644 --- a/io_uring/bpf-ops.c +++ b/io_uring/bpf-ops.c @@ -5,10 +5,11 @@ #include "io_uring.h" #include "register.h" +#include "loop.h" #include "memmap.h" #include "bpf-ops.h" -#include "loop.h" +static DEFINE_MUTEX(io_bpf_ctrl_mutex); static const struct btf_type *loop_params_type; __bpf_kfunc_start_defs(); @@ -143,16 +144,103 @@ static int bpf_io_init_member(const struct btf_type *t, const struct btf_member *member, void *kdata, const void *udata) { + u32 moff = __btf_member_bit_offset(t, member) / 8; + const struct io_uring_bpf_ops *uops = udata; + struct io_uring_bpf_ops *ops = kdata; + + switch (moff) { + case offsetof(struct io_uring_bpf_ops, ring_fd): + ops->ring_fd = uops->ring_fd; + return 1; + } + return 0; +} + +static int io_install_bpf(struct io_ring_ctx *ctx, struct io_uring_bpf_ops *ops) +{ + if (ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_IOPOLL)) + return -EOPNOTSUPP; + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + return -EOPNOTSUPP; + + if (ctx->bpf_ops) + return -EBUSY; + if (WARN_ON_ONCE(!ops->loop_step)) + return -EINVAL; + + ops->priv = ctx; + ctx->bpf_ops = ops; + ctx->loop_step = ops->loop_step; return 0; } static int bpf_io_reg(void *kdata, struct bpf_link *link) { - return -EOPNOTSUPP; + struct io_uring_bpf_ops *ops = kdata; + struct io_ring_ctx *ctx; + struct file *file; + int ret = -EBUSY; + + file = io_uring_register_get_file(ops->ring_fd, false); + if (IS_ERR(file)) + return PTR_ERR(file); + ctx = file->private_data; + + scoped_guard(mutex, &io_bpf_ctrl_mutex) { + guard(mutex)(&ctx->uring_lock); + ret = io_install_bpf(ctx, ops); + } + + fput(file); + return ret; +} + +static void io_eject_bpf(struct io_ring_ctx *ctx) +{ + struct io_uring_bpf_ops *ops = ctx->bpf_ops; + + if (WARN_ON_ONCE(!ops)) + return; + if (WARN_ON_ONCE(ops->priv != ctx)) + return; + + ops->priv = NULL; + ctx->bpf_ops = NULL; + ctx->loop_step = NULL; } static void bpf_io_unreg(void *kdata, struct bpf_link *link) { + struct io_uring_bpf_ops *ops = kdata; + struct io_ring_ctx *ctx; + + guard(mutex)(&io_bpf_ctrl_mutex); + ctx = ops->priv; + if (ctx) { + guard(mutex)(&ctx->uring_lock); + if (WARN_ON_ONCE(ctx->bpf_ops != ops)) + return; + + io_eject_bpf(ctx); + } +} + +void io_unregister_bpf_ops(struct io_ring_ctx *ctx) +{ + /* + * ->bpf_ops is write protected by io_bpf_ctrl_mutex and uring_lock, + * and read protected by either. Try to avoid taking the global lock + * for rings that never had any bpf installed. + */ + scoped_guard(mutex, &ctx->uring_lock) { + if (!ctx->bpf_ops) + return; + } + + guard(mutex)(&io_bpf_ctrl_mutex); + guard(mutex)(&ctx->uring_lock); + if (ctx->bpf_ops) + io_eject_bpf(ctx); } static struct bpf_struct_ops bpf_ring_ops = { diff --git a/io_uring/bpf-ops.h b/io_uring/bpf-ops.h index b9e589ad519a..b39b3fd3acda 100644 --- a/io_uring/bpf-ops.h +++ b/io_uring/bpf-ops.h @@ -17,4 +17,12 @@ struct io_uring_bpf_ops { void *priv; }; +#ifdef CONFIG_IO_URING_BPF_OPS +void io_unregister_bpf_ops(struct io_ring_ctx *ctx); +#else +static inline void io_unregister_bpf_ops(struct io_ring_ctx *ctx) +{ +} +#endif + #endif /* IOU_BPF_OPS_H */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0a80c8e6e633..d703f0a8b315 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2148,6 +2148,7 @@ static __cold void io_req_caches_free(struct io_ring_ctx *ctx) static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { + io_unregister_bpf_ops(ctx); io_sq_thread_finish(ctx); mutex_lock(&ctx->uring_lock); -- cgit v1.2.3 From 37a23d6f11938cd59927e3307b9b301624df8e8f Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Wed, 11 Mar 2026 21:59:21 -0700 Subject: bus: mhi: host: Use kzalloc_flex Change kzalloc + kzalloc to just kzalloc with a flexible array member. Add __counted_by for extra runtime analysis when requested. Move counting assignment immediately after allocation as required by __counted_by. Move mhi_buf definition as a complete definition as needed for flex arrays. It's not a pointer anymore. Signed-off-by: Rosen Penev [mani: squashed https://lore.kernel.org/mhi/20260317-mhi-invalid-free-mhi-buffers-v1-1-8418a3ad604f@oss.qualcomm.com] Signed-off-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20260312045921.7663-1-rosenp@gmail.com --- drivers/bus/mhi/host/boot.c | 22 +++------------------- include/linux/mhi.h | 34 +++++++++++++++++----------------- 2 files changed, 20 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/bus/mhi/host/boot.c b/drivers/bus/mhi/host/boot.c index f16a1e67a667..19c84913cfb9 100644 --- a/drivers/bus/mhi/host/boot.c +++ b/drivers/bus/mhi/host/boot.c @@ -308,7 +308,6 @@ static void mhi_free_bhi_buffer(struct mhi_controller *mhi_cntrl, struct mhi_buf *mhi_buf = image_info->mhi_buf; dma_free_coherent(mhi_cntrl->cntrl_dev, mhi_buf->len, mhi_buf->buf, mhi_buf->dma_addr); - kfree(image_info->mhi_buf); kfree(image_info); } @@ -322,7 +321,6 @@ void mhi_free_bhie_table(struct mhi_controller *mhi_cntrl, dma_free_coherent(mhi_cntrl->cntrl_dev, mhi_buf->len, mhi_buf->buf, mhi_buf->dma_addr); - kfree(image_info->mhi_buf); kfree(image_info); } @@ -333,15 +331,10 @@ static int mhi_alloc_bhi_buffer(struct mhi_controller *mhi_cntrl, struct image_info *img_info; struct mhi_buf *mhi_buf; - img_info = kzalloc_obj(*img_info); + img_info = kzalloc_flex(*img_info, mhi_buf, 1); if (!img_info) return -ENOMEM; - /* Allocate memory for entry */ - img_info->mhi_buf = kzalloc_obj(*img_info->mhi_buf); - if (!img_info->mhi_buf) - goto error_alloc_mhi_buf; - /* Allocate and populate vector table */ mhi_buf = img_info->mhi_buf; @@ -358,8 +351,6 @@ static int mhi_alloc_bhi_buffer(struct mhi_controller *mhi_cntrl, return 0; error_alloc_segment: - kfree(mhi_buf); -error_alloc_mhi_buf: kfree(img_info); return -ENOMEM; @@ -375,14 +366,11 @@ int mhi_alloc_bhie_table(struct mhi_controller *mhi_cntrl, struct image_info *img_info; struct mhi_buf *mhi_buf; - img_info = kzalloc_obj(*img_info); + img_info = kzalloc_flex(*img_info, mhi_buf, segments); if (!img_info) return -ENOMEM; - /* Allocate memory for entries */ - img_info->mhi_buf = kzalloc_objs(*img_info->mhi_buf, segments); - if (!img_info->mhi_buf) - goto error_alloc_mhi_buf; + img_info->entries = segments; /* Allocate and populate vector table */ mhi_buf = img_info->mhi_buf; @@ -402,7 +390,6 @@ int mhi_alloc_bhie_table(struct mhi_controller *mhi_cntrl, } img_info->bhi_vec = img_info->mhi_buf[segments - 1].buf; - img_info->entries = segments; *image_info = img_info; return 0; @@ -411,9 +398,6 @@ error_alloc_segment: for (--i, --mhi_buf; i >= 0; i--, mhi_buf--) dma_free_coherent(mhi_cntrl->cntrl_dev, mhi_buf->len, mhi_buf->buf, mhi_buf->dma_addr); - kfree(img_info->mhi_buf); - -error_alloc_mhi_buf: kfree(img_info); return -ENOMEM; diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 88ccb3e14f48..fb3ba639f4f8 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -85,17 +85,33 @@ enum mhi_ch_type { MHI_CH_TYPE_INBOUND_COALESCED = 3, }; +/** + * struct mhi_buf - MHI Buffer description + * @buf: Virtual address of the buffer + * @name: Buffer label. For offload channel, configurations name must be: + * ECA - Event context array data + * CCA - Channel context array data + * @dma_addr: IOMMU address of the buffer + * @len: # of bytes + */ +struct mhi_buf { + void *buf; + const char *name; + dma_addr_t dma_addr; + size_t len; +}; + /** * struct image_info - Firmware and RDDM table * @mhi_buf: Buffer for firmware and RDDM table * @entries: # of entries in table */ struct image_info { - struct mhi_buf *mhi_buf; /* private: from internal.h */ struct bhi_vec_entry *bhi_vec; /* public: */ u32 entries; + struct mhi_buf mhi_buf[] __counted_by(entries); }; /** @@ -488,22 +504,6 @@ struct mhi_result { int transaction_status; }; -/** - * struct mhi_buf - MHI Buffer description - * @buf: Virtual address of the buffer - * @name: Buffer label. For offload channel, configurations name must be: - * ECA - Event context array data - * CCA - Channel context array data - * @dma_addr: IOMMU address of the buffer - * @len: # of bytes - */ -struct mhi_buf { - void *buf; - const char *name; - dma_addr_t dma_addr; - size_t len; -}; - /** * struct mhi_driver - Structure representing a MHI client driver * @probe: CB function for client driver probe function -- cgit v1.2.3 From e71e00127110dedc6a9e746178282b4dac97ed96 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Feb 2026 11:25:36 -0400 Subject: iommupt: Add the RISC-V page table format The RISC-V format is a fairly simple 5 level page table not unlike the x86 one. It has optional support for a single contiguous page size of 64k (16 x 4k). The specification describes a 32-bit format, the general code can support it via a #define but the iommu side implementation has been left off until a user comes. Tested-by: Vincent Chen Acked-by: Paul Walmsley # arch/riscv Reviewed-by: Tomasz Jeznach Tested-by: Tomasz Jeznach Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/.kunitconfig | 1 + drivers/iommu/generic_pt/Kconfig | 11 + drivers/iommu/generic_pt/fmt/Makefile | 2 + drivers/iommu/generic_pt/fmt/defs_riscv.h | 29 +++ drivers/iommu/generic_pt/fmt/iommu_riscv64.c | 11 + drivers/iommu/generic_pt/fmt/riscv.h | 313 +++++++++++++++++++++++++++ include/linux/generic_pt/common.h | 16 ++ include/linux/generic_pt/iommu.h | 11 + 8 files changed, 394 insertions(+) create mode 100644 drivers/iommu/generic_pt/fmt/defs_riscv.h create mode 100644 drivers/iommu/generic_pt/fmt/iommu_riscv64.c create mode 100644 drivers/iommu/generic_pt/fmt/riscv.h (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/.kunitconfig b/drivers/iommu/generic_pt/.kunitconfig index a78b295f264d..0bb98fe581fe 100644 --- a/drivers/iommu/generic_pt/.kunitconfig +++ b/drivers/iommu/generic_pt/.kunitconfig @@ -5,6 +5,7 @@ CONFIG_DEBUG_GENERIC_PT=y CONFIG_IOMMU_PT=y CONFIG_IOMMU_PT_AMDV1=y CONFIG_IOMMU_PT_VTDSS=y +CONFIG_IOMMU_PT_RISCV64=y CONFIG_IOMMU_PT_X86_64=y CONFIG_IOMMU_PT_KUNIT_TEST=y diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig index ce4fb4786914..f4ed1add58b7 100644 --- a/drivers/iommu/generic_pt/Kconfig +++ b/drivers/iommu/generic_pt/Kconfig @@ -52,6 +52,16 @@ config IOMMU_PT_VTDSS Selected automatically by an IOMMU driver that uses this format. +config IOMMU_PT_RISCV64 + tristate "IOMMU page table for RISC-V 64 bit Sv57/Sv48/Sv39" + depends on !GENERIC_ATOMIC64 # for cmpxchg64 + help + iommu_domain implementation for RISC-V 64 bit 3/4/5 level page table. + It supports 4K/2M/1G/512G/256T page sizes and can decode a sign + extended portion of the 64 bit IOVA space. + + Selected automatically by an IOMMU driver that uses this format. + config IOMMU_PT_X86_64 tristate "IOMMU page table for x86 64-bit, 4/5 levels" depends on !GENERIC_ATOMIC64 # for cmpxchg64 @@ -66,6 +76,7 @@ config IOMMU_PT_KUNIT_TEST tristate "IOMMU Page Table KUnit Test" if !KUNIT_ALL_TESTS depends on KUNIT depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1 + depends on IOMMU_PT_RISCV64 || !IOMMU_PT_RISCV64 depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64 depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS default KUNIT_ALL_TESTS diff --git a/drivers/iommu/generic_pt/fmt/Makefile b/drivers/iommu/generic_pt/fmt/Makefile index 976b49ec97dc..ea024d582594 100644 --- a/drivers/iommu/generic_pt/fmt/Makefile +++ b/drivers/iommu/generic_pt/fmt/Makefile @@ -5,6 +5,8 @@ iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss +iommu_pt_fmt-$(CONFIG_IOMMU_PT_RISCV64) += riscv64 + iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64 IOMMU_PT_KUNIT_TEST := diff --git a/drivers/iommu/generic_pt/fmt/defs_riscv.h b/drivers/iommu/generic_pt/fmt/defs_riscv.h new file mode 100644 index 000000000000..cf67474d5eba --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/defs_riscv.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES + * + */ +#ifndef __GENERIC_PT_FMT_DEFS_RISCV_H +#define __GENERIC_PT_FMT_DEFS_RISCV_H + +#include +#include + +#ifdef PT_RISCV_32BIT +typedef u32 pt_riscv_entry_t; +#define riscvpt_write_attrs riscv32pt_write_attrs +#else +typedef u64 pt_riscv_entry_t; +#define riscvpt_write_attrs riscv64pt_write_attrs +#endif + +typedef pt_riscv_entry_t pt_vaddr_t; +typedef u64 pt_oaddr_t; + +struct riscvpt_write_attrs { + pt_riscv_entry_t descriptor_bits; + gfp_t gfp; +}; +#define pt_write_attrs riscvpt_write_attrs + +#endif diff --git a/drivers/iommu/generic_pt/fmt/iommu_riscv64.c b/drivers/iommu/generic_pt/fmt/iommu_riscv64.c new file mode 100644 index 000000000000..cbf60fffa9bf --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/iommu_riscv64.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES + */ +#define PT_FMT riscv +#define PT_FMT_VARIANT 64 +#define PT_SUPPORTED_FEATURES \ + (BIT(PT_FEAT_SIGN_EXTEND) | BIT(PT_FEAT_FLUSH_RANGE) | \ + BIT(PT_FEAT_RISCV_SVNAPOT_64K)) + +#include "iommu_template.h" diff --git a/drivers/iommu/generic_pt/fmt/riscv.h b/drivers/iommu/generic_pt/fmt/riscv.h new file mode 100644 index 000000000000..a7fef6266a36 --- /dev/null +++ b/drivers/iommu/generic_pt/fmt/riscv.h @@ -0,0 +1,313 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES + * + * RISC-V page table + * + * This is described in Sections: + * 12.3. Sv32: Page-Based 32-bit Virtual-Memory Systems + * 12.4. Sv39: Page-Based 39-bit Virtual-Memory System + * 12.5. Sv48: Page-Based 48-bit Virtual-Memory System + * 12.6. Sv57: Page-Based 57-bit Virtual-Memory System + * of the "The RISC-V Instruction Set Manual: Volume II" + * + * This includes the contiguous page extension from: + * Chapter 13. "Svnapot" Extension for NAPOT Translation Contiguity, + * Version 1.0 + * + * The table format is sign extended and supports leafs in every level. The spec + * doesn't talk a lot about levels, but level here is the same as i=LEVELS-1 in + * the spec. + */ +#ifndef __GENERIC_PT_FMT_RISCV_H +#define __GENERIC_PT_FMT_RISCV_H + +#include "defs_riscv.h" +#include "../pt_defs.h" + +#include +#include +#include +#include + +enum { + PT_ITEM_WORD_SIZE = sizeof(pt_riscv_entry_t), +#ifdef PT_RISCV_32BIT + PT_MAX_VA_ADDRESS_LG2 = 32, + PT_MAX_OUTPUT_ADDRESS_LG2 = 34, + PT_MAX_TOP_LEVEL = 1, +#else + PT_MAX_VA_ADDRESS_LG2 = 57, + PT_MAX_OUTPUT_ADDRESS_LG2 = 56, + PT_MAX_TOP_LEVEL = 4, +#endif + PT_GRANULE_LG2SZ = 12, + PT_TABLEMEM_LG2SZ = 12, + + /* fsc.PPN is 44 bits wide, all PPNs are 4k aligned */ + PT_TOP_PHYS_MASK = GENMASK_ULL(55, 12), +}; + +/* PTE bits */ +enum { + RISCVPT_V = BIT(0), + RISCVPT_R = BIT(1), + RISCVPT_W = BIT(2), + RISCVPT_X = BIT(3), + RISCVPT_U = BIT(4), + RISCVPT_G = BIT(5), + RISCVPT_A = BIT(6), + RISCVPT_D = BIT(7), + RISCVPT_RSW = GENMASK(9, 8), + RISCVPT_PPN32 = GENMASK(31, 10), + + RISCVPT_PPN64 = GENMASK_ULL(53, 10), + RISCVPT_PPN64_64K = GENMASK_ULL(53, 14), + RISCVPT_PBMT = GENMASK_ULL(62, 61), + RISCVPT_N = BIT_ULL(63), + + /* Svnapot encodings for ppn[0] */ + RISCVPT_PPN64_64K_SZ = BIT(13), +}; + +#ifdef PT_RISCV_32BIT +#define RISCVPT_PPN RISCVPT_PPN32 +#define pt_riscv pt_riscv_32 +#else +#define RISCVPT_PPN RISCVPT_PPN64 +#define pt_riscv pt_riscv_64 +#endif + +#define common_to_riscvpt(common_ptr) \ + container_of_const(common_ptr, struct pt_riscv, common) +#define to_riscvpt(pts) common_to_riscvpt((pts)->range->common) + +static inline pt_oaddr_t riscvpt_table_pa(const struct pt_state *pts) +{ + return oalog2_mul(FIELD_GET(RISCVPT_PPN, pts->entry), PT_GRANULE_LG2SZ); +} +#define pt_table_pa riscvpt_table_pa + +static inline pt_oaddr_t riscvpt_entry_oa(const struct pt_state *pts) +{ + if (pts_feature(pts, PT_FEAT_RISCV_SVNAPOT_64K) && + pts->entry & RISCVPT_N) { + PT_WARN_ON(pts->level != 0); + return oalog2_mul(FIELD_GET(RISCVPT_PPN64_64K, pts->entry), + ilog2(SZ_64K)); + } + return oalog2_mul(FIELD_GET(RISCVPT_PPN, pts->entry), PT_GRANULE_LG2SZ); +} +#define pt_entry_oa riscvpt_entry_oa + +static inline bool riscvpt_can_have_leaf(const struct pt_state *pts) +{ + return true; +} +#define pt_can_have_leaf riscvpt_can_have_leaf + +/* Body in pt_fmt_defaults.h */ +static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts); + +static inline unsigned int +riscvpt_entry_num_contig_lg2(const struct pt_state *pts) +{ + if (PT_SUPPORTED_FEATURE(PT_FEAT_RISCV_SVNAPOT_64K) && + pts->entry & RISCVPT_N) { + PT_WARN_ON(!pts_feature(pts, PT_FEAT_RISCV_SVNAPOT_64K)); + PT_WARN_ON(pts->level); + return ilog2(16); + } + return ilog2(1); +} +#define pt_entry_num_contig_lg2 riscvpt_entry_num_contig_lg2 + +static inline unsigned int riscvpt_num_items_lg2(const struct pt_state *pts) +{ + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); +} +#define pt_num_items_lg2 riscvpt_num_items_lg2 + +static inline unsigned short +riscvpt_contig_count_lg2(const struct pt_state *pts) +{ + if (pts->level == 0 && pts_feature(pts, PT_FEAT_RISCV_SVNAPOT_64K)) + return ilog2(16); + return ilog2(1); +} +#define pt_contig_count_lg2 riscvpt_contig_count_lg2 + +static inline enum pt_entry_type riscvpt_load_entry_raw(struct pt_state *pts) +{ + const pt_riscv_entry_t *tablep = pt_cur_table(pts, pt_riscv_entry_t); + pt_riscv_entry_t entry; + + pts->entry = entry = READ_ONCE(tablep[pts->index]); + if (!(entry & RISCVPT_V)) + return PT_ENTRY_EMPTY; + if (pts->level == 0 || + ((entry & (RISCVPT_X | RISCVPT_W | RISCVPT_R)) != 0)) + return PT_ENTRY_OA; + return PT_ENTRY_TABLE; +} +#define pt_load_entry_raw riscvpt_load_entry_raw + +static inline void +riscvpt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, + unsigned int oasz_lg2, + const struct pt_write_attrs *attrs) +{ + pt_riscv_entry_t *tablep = pt_cur_table(pts, pt_riscv_entry_t); + pt_riscv_entry_t entry; + + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) + return; + + entry = RISCVPT_V | + FIELD_PREP(RISCVPT_PPN, log2_div(oa, PT_GRANULE_LG2SZ)) | + attrs->descriptor_bits; + + if (pts_feature(pts, PT_FEAT_RISCV_SVNAPOT_64K) && pts->level == 0 && + oasz_lg2 != PT_GRANULE_LG2SZ) { + u64 *end; + + entry |= RISCVPT_N | RISCVPT_PPN64_64K_SZ; + tablep += pts->index; + end = tablep + log2_div(SZ_64K, PT_GRANULE_LG2SZ); + for (; tablep != end; tablep++) + WRITE_ONCE(*tablep, entry); + } else { + /* FIXME does riscv need this to be cmpxchg? */ + WRITE_ONCE(tablep[pts->index], entry); + } + pts->entry = entry; +} +#define pt_install_leaf_entry riscvpt_install_leaf_entry + +static inline bool riscvpt_install_table(struct pt_state *pts, + pt_oaddr_t table_pa, + const struct pt_write_attrs *attrs) +{ + pt_riscv_entry_t entry; + + entry = RISCVPT_V | + FIELD_PREP(RISCVPT_PPN, log2_div(table_pa, PT_GRANULE_LG2SZ)); + return pt_table_install64(pts, entry); +} +#define pt_install_table riscvpt_install_table + +static inline void riscvpt_attr_from_entry(const struct pt_state *pts, + struct pt_write_attrs *attrs) +{ + attrs->descriptor_bits = + pts->entry & (RISCVPT_R | RISCVPT_W | RISCVPT_X | RISCVPT_U | + RISCVPT_G | RISCVPT_A | RISCVPT_D); +} +#define pt_attr_from_entry riscvpt_attr_from_entry + +/* --- iommu */ +#include +#include + +#define pt_iommu_table pt_iommu_riscv_64 + +/* The common struct is in the per-format common struct */ +static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) +{ + return &container_of(iommu_table, struct pt_iommu_table, iommu) + ->riscv_64pt.common; +} + +static inline struct pt_iommu *iommu_from_common(struct pt_common *common) +{ + return &container_of(common, struct pt_iommu_table, riscv_64pt.common) + ->iommu; +} + +static inline int riscvpt_iommu_set_prot(struct pt_common *common, + struct pt_write_attrs *attrs, + unsigned int iommu_prot) +{ + u64 pte; + + pte = RISCVPT_A | RISCVPT_U; + if (iommu_prot & IOMMU_WRITE) + pte |= RISCVPT_W | RISCVPT_R | RISCVPT_D; + if (iommu_prot & IOMMU_READ) + pte |= RISCVPT_R; + if (!(iommu_prot & IOMMU_NOEXEC)) + pte |= RISCVPT_X; + + /* Caller must specify a supported combination of flags */ + if (unlikely((pte & (RISCVPT_X | RISCVPT_W | RISCVPT_R)) == 0)) + return -EOPNOTSUPP; + + attrs->descriptor_bits = pte; + return 0; +} +#define pt_iommu_set_prot riscvpt_iommu_set_prot + +static inline int +riscvpt_iommu_fmt_init(struct pt_iommu_riscv_64 *iommu_table, + const struct pt_iommu_riscv_64_cfg *cfg) +{ + struct pt_riscv *table = &iommu_table->riscv_64pt; + + switch (cfg->common.hw_max_vasz_lg2) { + case 39: + pt_top_set_level(&table->common, 2); + break; + case 48: + pt_top_set_level(&table->common, 3); + break; + case 57: + pt_top_set_level(&table->common, 4); + break; + default: + return -EINVAL; + } + table->common.max_oasz_lg2 = + min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2); + return 0; +} +#define pt_iommu_fmt_init riscvpt_iommu_fmt_init + +static inline void +riscvpt_iommu_fmt_hw_info(struct pt_iommu_riscv_64 *table, + const struct pt_range *top_range, + struct pt_iommu_riscv_64_hw_info *info) +{ + phys_addr_t top_phys = virt_to_phys(top_range->top_table); + + info->ppn = oalog2_div(top_phys, PT_GRANULE_LG2SZ); + PT_WARN_ON(top_phys & ~PT_TOP_PHYS_MASK); + + /* + * See Table 3. Encodings of iosatp.MODE field" for DC.tx.SXL = 0: + * 8 = Sv39 = top level 2 + * 9 = Sv38 = top level 3 + * 10 = Sv57 = top level 4 + */ + info->fsc_iosatp_mode = top_range->top_level + 6; +} +#define pt_iommu_fmt_hw_info riscvpt_iommu_fmt_hw_info + +#if defined(GENERIC_PT_KUNIT) +static const struct pt_iommu_riscv_64_cfg riscv_64_kunit_fmt_cfgs[] = { + [0] = { .common.features = BIT(PT_FEAT_RISCV_SVNAPOT_64K), + .common.hw_max_oasz_lg2 = 56, + .common.hw_max_vasz_lg2 = 39 }, + [1] = { .common.features = 0, + .common.hw_max_oasz_lg2 = 56, + .common.hw_max_vasz_lg2 = 48 }, + [2] = { .common.features = BIT(PT_FEAT_RISCV_SVNAPOT_64K), + .common.hw_max_oasz_lg2 = 56, + .common.hw_max_vasz_lg2 = 57 }, +}; +#define kunit_fmt_cfgs riscv_64_kunit_fmt_cfgs +enum { + KUNIT_FMT_FEATURES = BIT(PT_FEAT_RISCV_SVNAPOT_64K), +}; +#endif + +#endif diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index 6a9a1acb5aad..fc5d0b5edadc 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -175,6 +175,22 @@ enum { PT_FEAT_VTDSS_FORCE_WRITEABLE, }; +struct pt_riscv_32 { + struct pt_common common; +}; + +struct pt_riscv_64 { + struct pt_common common; +}; + +enum { + /* + * Support the 64k contiguous page size following the Svnapot extension. + */ + PT_FEAT_RISCV_SVNAPOT_64K = PT_FEAT_FMT_START, + +}; + struct pt_x86_64 { struct pt_common common; }; diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 9eefbb74efd0..49d9addb98c5 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -275,6 +275,17 @@ struct pt_iommu_vtdss_hw_info { IOMMU_FORMAT(vtdss, vtdss_pt); +struct pt_iommu_riscv_64_cfg { + struct pt_iommu_cfg common; +}; + +struct pt_iommu_riscv_64_hw_info { + u64 ppn; + u8 fsc_iosatp_mode; +}; + +IOMMU_FORMAT(riscv_64, riscv_64pt); + struct pt_iommu_x86_64_cfg { struct pt_iommu_cfg common; /* 4 is a 57 bit 5 level table */ -- cgit v1.2.3 From 99fb8afa16add85ed016baee9735231bca0c32b4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Feb 2026 15:30:10 -0400 Subject: iommupt: Directly call iommupt's unmap_range() The common algorithm in iommupt does not require the iommu_pgsize() calculations, it can directly unmap any arbitrary range. Add a new function pointer to directly call an iommupt unmap_range op and make __iommu_unmap() call it directly. Gives about a 5% gain on single page unmappings. The function pointer is run through pt_iommu_ops instead of iommu_domain_ops to discourage using it outside iommupt. All drivers with their own page tables should continue to use the simplified map/unmap_pages() style interfaces. Reviewed-by: Samiullah Khawaja Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 29 ++++------------------------- drivers/iommu/iommu.c | 27 +++++++++++++++++++++------ include/linux/generic_pt/iommu.h | 37 +++++++++++++++++++++++++++++++------ include/linux/iommu.h | 1 + 4 files changed, 57 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 9c08bb594e41..a627c26fa62d 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -1031,34 +1031,12 @@ start_oa: return ret; } -/** - * unmap_pages() - Make a range of IOVA empty/not present - * @domain: Domain to manipulate - * @iova: IO virtual address to start - * @pgsize: Length of each page - * @pgcount: Length of the range in pgsize units starting from @iova - * @iotlb_gather: Gather struct that must be flushed on return - * - * unmap_pages() will remove a translation created by map_pages(). It cannot - * subdivide a mapping created by map_pages(), so it should be called with IOVA - * ranges that match those passed to map_pages(). The IOVA range can aggregate - * contiguous map_pages() calls so long as no individual range is split. - * - * Context: The caller must hold a write range lock that includes - * the whole range. - * - * Returns: Number of bytes of VA unmapped. iova + res will be the point - * unmapping stopped. - */ -size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova, - size_t pgsize, size_t pgcount, +static size_t NS(unmap_range)(struct pt_iommu *iommu_table, dma_addr_t iova, + dma_addr_t len, struct iommu_iotlb_gather *iotlb_gather) { - struct pt_iommu *iommu_table = - container_of(domain, struct pt_iommu, domain); struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT( unmap.free_list) }; - pt_vaddr_t len = pgsize * pgcount; struct pt_range range; int ret; @@ -1073,7 +1051,6 @@ size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova, return unmap.unmapped; } -EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU"); static void NS(get_info)(struct pt_iommu *iommu_table, struct pt_iommu_info *info) @@ -1121,6 +1098,7 @@ static void NS(deinit)(struct pt_iommu *iommu_table) } static const struct pt_iommu_ops NS(ops) = { + .unmap_range = NS(unmap_range), #if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \ IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty) .set_dirty = NS(set_dirty), @@ -1183,6 +1161,7 @@ static int pt_iommu_init_domain(struct pt_iommu *iommu_table, domain->type = __IOMMU_DOMAIN_PAGING; domain->pgsize_bitmap = info.pgsize_bitmap; + domain->is_iommupt = true; if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) range = _pt_top_range(common, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 35db51780954..f68269707101 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "dma-iommu.h" #include "iommu-priv.h" @@ -2666,13 +2667,12 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova, } EXPORT_SYMBOL_GPL(iommu_map); -static size_t __iommu_unmap(struct iommu_domain *domain, - unsigned long iova, size_t size, - struct iommu_iotlb_gather *iotlb_gather) +static size_t +__iommu_unmap_domain_pgtbl(struct iommu_domain *domain, unsigned long iova, + size_t size, struct iommu_iotlb_gather *iotlb_gather) { const struct iommu_domain_ops *ops = domain->ops; size_t unmapped_page, unmapped = 0; - unsigned long orig_iova = iova; unsigned int min_pagesz; if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) @@ -2718,8 +2718,23 @@ static size_t __iommu_unmap(struct iommu_domain *domain, unmapped += unmapped_page; } - trace_unmap(orig_iova, size, unmapped); - iommu_debug_unmap_end(domain, orig_iova, size, unmapped); + return unmapped; +} + +static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long iova, + size_t size, + struct iommu_iotlb_gather *iotlb_gather) +{ + struct pt_iommu *pt = iommupt_from_domain(domain); + size_t unmapped; + + if (pt) + unmapped = pt->ops->unmap_range(pt, iova, size, iotlb_gather); + else + unmapped = __iommu_unmap_domain_pgtbl(domain, iova, size, + iotlb_gather); + trace_unmap(iova, size, unmapped); + iommu_debug_unmap_end(domain, iova, size, unmapped); return unmapped; } diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 49d9addb98c5..0da971134a37 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -66,6 +66,13 @@ struct pt_iommu { struct device *iommu_device; }; +static inline struct pt_iommu *iommupt_from_domain(struct iommu_domain *domain) +{ + if (!IS_ENABLED(CONFIG_IOMMU_PT) || !domain->is_iommupt) + return NULL; + return container_of(domain, struct pt_iommu, domain); +} + /** * struct pt_iommu_info - Details about the IOMMU page table * @@ -80,6 +87,29 @@ struct pt_iommu_info { }; struct pt_iommu_ops { + /** + * @unmap_range: Make a range of IOVA empty/not present + * @iommu_table: Table to manipulate + * @iova: IO virtual address to start + * @len: Length of the range starting from @iova + * @iotlb_gather: Gather struct that must be flushed on return + * + * unmap_range() will remove a translation created by map_range(). It + * cannot subdivide a mapping created by map_range(), so it should be + * called with IOVA ranges that match those passed to map_pages. The + * IOVA range can aggregate contiguous map_range() calls so long as no + * individual range is split. + * + * Context: The caller must hold a write range lock that includes + * the whole range. + * + * Returns: Number of bytes of VA unmapped. iova + res will be the + * point unmapping stopped. + */ + size_t (*unmap_range)(struct pt_iommu *iommu_table, dma_addr_t iova, + dma_addr_t len, + struct iommu_iotlb_gather *iotlb_gather); + /** * @set_dirty: Make the iova write dirty * @iommu_table: Table to manipulate @@ -198,10 +228,6 @@ struct pt_iommu_cfg { unsigned long iova, phys_addr_t paddr, \ size_t pgsize, size_t pgcount, \ int prot, gfp_t gfp, size_t *mapped); \ - size_t pt_iommu_##fmt##_unmap_pages( \ - struct iommu_domain *domain, unsigned long iova, \ - size_t pgsize, size_t pgcount, \ - struct iommu_iotlb_gather *iotlb_gather); \ int pt_iommu_##fmt##_read_and_clear_dirty( \ struct iommu_domain *domain, unsigned long iova, size_t size, \ unsigned long flags, struct iommu_dirty_bitmap *dirty); \ @@ -223,8 +249,7 @@ struct pt_iommu_cfg { */ #define IOMMU_PT_DOMAIN_OPS(fmt) \ .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ - .map_pages = &pt_iommu_##fmt##_map_pages, \ - .unmap_pages = &pt_iommu_##fmt##_unmap_pages + .map_pages = &pt_iommu_##fmt##_map_pages #define IOMMU_PT_DIRTY_OPS(fmt) \ .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 54b8b48c762e..7ca648c01336 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -223,6 +223,7 @@ enum iommu_domain_cookie_type { struct iommu_domain { unsigned type; enum iommu_domain_cookie_type cookie_type; + bool is_iommupt; const struct iommu_domain_ops *ops; const struct iommu_dirty_ops *dirty_ops; const struct iommu_ops *owner; /* Whose domain_alloc we came from */ -- cgit v1.2.3 From d6c65b0fd6218bd21ed0be7a8d3218e8f6dc91de Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Feb 2026 15:30:11 -0400 Subject: iommupt: Avoid rewalking during map Currently the core code provides a simplified interface to drivers where it fragments a requested multi-page map into single page size steps after doing all the calculations to figure out what page size is appropriate. Each step rewalks the page tables from the start. Since iommupt has a single implementation of the mapping algorithm it can internally compute each step as it goes while retaining its current position in the walk. Add a new function pt_pgsz_count() which computes the same page size fragement of a large mapping operations. Compute the next fragment when all the leaf entries of the current fragement have been written, then continue walking from the current point. The function pointer is run through pt_iommu_ops instead of iommu_domain_ops to discourage using it outside iommupt. All drivers with their own page tables should continue to use the simplified map_pages() style interfaces. Reviewed-by: Samiullah Khawaja Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 133 +++++++++++++++++----------- drivers/iommu/generic_pt/kunit_generic_pt.h | 12 +++ drivers/iommu/generic_pt/pt_iter.h | 22 +++++ drivers/iommu/iommu.c | 39 ++++++-- include/linux/generic_pt/iommu.h | 34 +++++-- 5 files changed, 175 insertions(+), 65 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index a627c26fa62d..17b72dbd7d51 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -477,6 +477,7 @@ struct pt_iommu_map_args { pt_oaddr_t oa; unsigned int leaf_pgsize_lg2; unsigned int leaf_level; + pt_vaddr_t num_leaves; }; /* @@ -529,11 +530,15 @@ static int clear_contig(const struct pt_state *start_pts, static int __map_range_leaf(struct pt_range *range, void *arg, unsigned int level, struct pt_table_p *table) { + struct pt_iommu *iommu_table = iommu_from_common(range->common); struct pt_state pts = pt_init(range, level, table); struct pt_iommu_map_args *map = arg; unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2; unsigned int start_index; pt_oaddr_t oa = map->oa; + unsigned int num_leaves; + unsigned int orig_end; + pt_vaddr_t last_va; unsigned int step; bool need_contig; int ret = 0; @@ -547,6 +552,15 @@ static int __map_range_leaf(struct pt_range *range, void *arg, _pt_iter_first(&pts); start_index = pts.index; + orig_end = pts.end_index; + if (pts.index + map->num_leaves < pts.end_index) { + /* Need to stop in the middle of the table to change sizes */ + pts.end_index = pts.index + map->num_leaves; + num_leaves = 0; + } else { + num_leaves = map->num_leaves - (pts.end_index - pts.index); + } + do { pts.type = pt_load_entry_raw(&pts); if (pts.type != PT_ENTRY_EMPTY || need_contig) { @@ -572,7 +586,40 @@ static int __map_range_leaf(struct pt_range *range, void *arg, flush_writes_range(&pts, start_index, pts.index); map->oa = oa; - return ret; + map->num_leaves = num_leaves; + if (ret || num_leaves) + return ret; + + /* range->va is not valid if we reached the end of the table */ + pts.index -= step; + pt_index_to_va(&pts); + pts.index += step; + last_va = range->va + log2_to_int(leaf_pgsize_lg2); + + if (last_va - 1 == range->last_va) { + PT_WARN_ON(pts.index != orig_end); + return 0; + } + + /* + * Reached a point where the page size changed, compute the new + * parameters. + */ + map->leaf_pgsize_lg2 = pt_compute_best_pgsize( + iommu_table->domain.pgsize_bitmap, last_va, range->last_va, oa); + map->leaf_level = + pt_pgsz_lg2_to_level(range->common, map->leaf_pgsize_lg2); + map->num_leaves = pt_pgsz_count(iommu_table->domain.pgsize_bitmap, + last_va, range->last_va, oa, + map->leaf_pgsize_lg2); + + /* Didn't finish this table level, caller will repeat it */ + if (pts.index != orig_end) { + if (pts.index != start_index) + pt_index_to_va(&pts); + return -EAGAIN; + } + return 0; } static int __map_range(struct pt_range *range, void *arg, unsigned int level, @@ -595,14 +642,9 @@ static int __map_range(struct pt_range *range, void *arg, unsigned int level, if (pts.type != PT_ENTRY_EMPTY) return -EADDRINUSE; ret = pt_iommu_new_table(&pts, &map->attrs); - if (ret) { - /* - * Racing with another thread installing a table - */ - if (ret == -EAGAIN) - continue; + /* EAGAIN on a race will loop again */ + if (ret) return ret; - } } else { pts.table_lower = pt_table_ptr(&pts); /* @@ -626,10 +668,12 @@ static int __map_range(struct pt_range *range, void *arg, unsigned int level, * The already present table can possibly be shared with another * concurrent map. */ - if (map->leaf_level == level - 1) - ret = pt_descend(&pts, arg, __map_range_leaf); - else - ret = pt_descend(&pts, arg, __map_range); + do { + if (map->leaf_level == level - 1) + ret = pt_descend(&pts, arg, __map_range_leaf); + else + ret = pt_descend(&pts, arg, __map_range); + } while (ret == -EAGAIN); if (ret) return ret; @@ -637,6 +681,14 @@ static int __map_range(struct pt_range *range, void *arg, unsigned int level, pt_index_to_va(&pts); if (pts.index >= pts.end_index) break; + + /* + * This level is currently running __map_range_leaf() which is + * not correct if the target level has been updated to this + * level. Have the caller invoke __map_range_leaf. + */ + if (map->leaf_level == level) + return -EAGAIN; } while (true); return 0; } @@ -808,12 +860,13 @@ static int check_map_range(struct pt_iommu *iommu_table, struct pt_range *range, static int do_map(struct pt_range *range, struct pt_common *common, bool single_page, struct pt_iommu_map_args *map) { + int ret; + /* * The __map_single_page() fast path does not support DMA_INCOHERENT * flushing to keep its .text small. */ if (single_page && !pt_feature(common, PT_FEAT_DMA_INCOHERENT)) { - int ret; ret = pt_walk_range(range, __map_single_page, map); if (ret != -EAGAIN) @@ -821,50 +874,25 @@ static int do_map(struct pt_range *range, struct pt_common *common, /* EAGAIN falls through to the full path */ } - if (map->leaf_level == range->top_level) - return pt_walk_range(range, __map_range_leaf, map); - return pt_walk_range(range, __map_range, map); + do { + if (map->leaf_level == range->top_level) + ret = pt_walk_range(range, __map_range_leaf, map); + else + ret = pt_walk_range(range, __map_range, map); + } while (ret == -EAGAIN); + return ret; } -/** - * map_pages() - Install translation for an IOVA range - * @domain: Domain to manipulate - * @iova: IO virtual address to start - * @paddr: Physical/Output address to start - * @pgsize: Length of each page - * @pgcount: Length of the range in pgsize units starting from @iova - * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO - * @gfp: GFP flags for any memory allocations - * @mapped: Total bytes successfully mapped - * - * The range starting at IOVA will have paddr installed into it. The caller - * must specify a valid pgsize and pgcount to segment the range into compatible - * blocks. - * - * On error the caller will probably want to invoke unmap on the range from iova - * up to the amount indicated by @mapped to return the table back to an - * unchanged state. - * - * Context: The caller must hold a write range lock that includes the whole - * range. - * - * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that were - * mapped are added to @mapped, @mapped is not zerod first. - */ -int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) +static int NS(map_range)(struct pt_iommu *iommu_table, dma_addr_t iova, + phys_addr_t paddr, dma_addr_t len, unsigned int prot, + gfp_t gfp, size_t *mapped) { - struct pt_iommu *iommu_table = - container_of(domain, struct pt_iommu, domain); pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap; struct pt_common *common = common_from_iommu(iommu_table); struct iommu_iotlb_gather iotlb_gather; - pt_vaddr_t len = pgsize * pgcount; struct pt_iommu_map_args map = { .iotlb_gather = &iotlb_gather, .oa = paddr, - .leaf_pgsize_lg2 = vaffs(pgsize), }; bool single_page = false; struct pt_range range; @@ -892,13 +920,13 @@ int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, return ret; /* Calculate target page size and level for the leaves */ - if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE && - pgcount == 1) { + if (pt_has_system_page_size(common) && len == PAGE_SIZE) { PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE)); if (log2_mod(iova | paddr, PAGE_SHIFT)) return -ENXIO; map.leaf_pgsize_lg2 = PAGE_SHIFT; map.leaf_level = 0; + map.num_leaves = 1; single_page = true; } else { map.leaf_pgsize_lg2 = pt_compute_best_pgsize( @@ -907,6 +935,9 @@ int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, return -ENXIO; map.leaf_level = pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2); + map.num_leaves = pt_pgsz_count(pgsize_bitmap, range.va, + range.last_va, paddr, + map.leaf_pgsize_lg2); } ret = check_map_range(iommu_table, &range, &map); @@ -929,7 +960,6 @@ int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, *mapped += map.oa - paddr; return ret; } -EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU"); struct pt_unmap_args { struct iommu_pages_list free_list; @@ -1098,6 +1128,7 @@ static void NS(deinit)(struct pt_iommu *iommu_table) } static const struct pt_iommu_ops NS(ops) = { + .map_range = NS(map_range), .unmap_range = NS(unmap_range), #if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \ IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty) diff --git a/drivers/iommu/generic_pt/kunit_generic_pt.h b/drivers/iommu/generic_pt/kunit_generic_pt.h index 68278bf15cfe..374e475f591e 100644 --- a/drivers/iommu/generic_pt/kunit_generic_pt.h +++ b/drivers/iommu/generic_pt/kunit_generic_pt.h @@ -312,6 +312,17 @@ static void test_best_pgsize(struct kunit *test) } } +static void test_pgsz_count(struct kunit *test) +{ + KUNIT_EXPECT_EQ(test, + pt_pgsz_count(SZ_4K, 0, SZ_1G - 1, 0, ilog2(SZ_4K)), + SZ_1G / SZ_4K); + KUNIT_EXPECT_EQ(test, + pt_pgsz_count(SZ_2M | SZ_4K, SZ_4K, SZ_1G - 1, SZ_4K, + ilog2(SZ_4K)), + (SZ_2M - SZ_4K) / SZ_4K); +} + /* * Check that pt_install_table() and pt_table_pa() match */ @@ -770,6 +781,7 @@ static struct kunit_case generic_pt_test_cases[] = { KUNIT_CASE_FMT(test_init), KUNIT_CASE_FMT(test_bitops), KUNIT_CASE_FMT(test_best_pgsize), + KUNIT_CASE_FMT(test_pgsz_count), KUNIT_CASE_FMT(test_table_ptr), KUNIT_CASE_FMT(test_max_va), KUNIT_CASE_FMT(test_table_radix), diff --git a/drivers/iommu/generic_pt/pt_iter.h b/drivers/iommu/generic_pt/pt_iter.h index c0d8617cce29..3e45dbde6b83 100644 --- a/drivers/iommu/generic_pt/pt_iter.h +++ b/drivers/iommu/generic_pt/pt_iter.h @@ -569,6 +569,28 @@ static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap, return pgsz_lg2; } +/* + * Return the number of pgsize_lg2 leaf entries that can be mapped for + * va to oa. This accounts for any requirement to reduce or increase the page + * size across the VA range. + */ +static inline pt_vaddr_t pt_pgsz_count(pt_vaddr_t pgsz_bitmap, pt_vaddr_t va, + pt_vaddr_t last_va, pt_oaddr_t oa, + unsigned int pgsize_lg2) +{ + pt_vaddr_t len = last_va - va + 1; + pt_vaddr_t next_pgsizes = log2_set_mod(pgsz_bitmap, 0, pgsize_lg2 + 1); + + if (next_pgsizes) { + unsigned int next_pgsize_lg2 = vaffs(next_pgsizes); + + if (log2_mod(va ^ oa, next_pgsize_lg2) == 0) + len = min(len, log2_set_mod_max(va, next_pgsize_lg2) - + va + 1); + } + return log2_div(len, pgsize_lg2); +} + #define _PT_MAKE_CALL_LEVEL(fn) \ static __always_inline int fn(struct pt_range *range, void *arg, \ unsigned int level, \ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index f68269707101..33cee64686e3 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2569,14 +2569,14 @@ out_set_count: return pgsize; } -int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +static int __iommu_map_domain_pgtbl(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t size, int prot, gfp_t gfp) { const struct iommu_domain_ops *ops = domain->ops; unsigned long orig_iova = iova; unsigned int min_pagesz; size_t orig_size = size; - phys_addr_t orig_paddr = paddr; int ret = 0; might_sleep_if(gfpflags_allow_blocking(gfp)); @@ -2633,12 +2633,9 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, /* unroll mapping in case something went wrong */ if (ret) { iommu_unmap(domain, orig_iova, orig_size - size); - } else { - trace_map(orig_iova, orig_paddr, orig_size); - iommu_debug_map(domain, orig_paddr, orig_size); + return ret; } - - return ret; + return 0; } int iommu_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) @@ -2650,6 +2647,32 @@ int iommu_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) return ops->iotlb_sync_map(domain, iova, size); } +int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +{ + struct pt_iommu *pt = iommupt_from_domain(domain); + int ret; + + if (pt) { + size_t mapped = 0; + + ret = pt->ops->map_range(pt, iova, paddr, size, prot, gfp, + &mapped); + if (ret) { + iommu_unmap(domain, iova, mapped); + return ret; + } + return 0; + } + ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, gfp); + if (!ret) + return ret; + + trace_map(iova, paddr, size); + iommu_debug_map(domain, paddr, size); + return 0; +} + int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 0da971134a37..dd0edd02a48a 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -87,6 +87,33 @@ struct pt_iommu_info { }; struct pt_iommu_ops { + /** + * @map_range: Install translation for an IOVA range + * @iommu_table: Table to manipulate + * @iova: IO virtual address to start + * @paddr: Physical/Output address to start + * @len: Length of the range starting from @iova + * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO + * @gfp: GFP flags for any memory allocations + * + * The range starting at IOVA will have paddr installed into it. The + * rage is automatically segmented into optimally sized table entries, + * and can have any valid alignment. + * + * On error the caller will probably want to invoke unmap on the range + * from iova up to the amount indicated by @mapped to return the table + * back to an unchanged state. + * + * Context: The caller must hold a write range lock that includes + * the whole range. + * + * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA + * that were mapped are added to @mapped, @mapped is not zerod first. + */ + int (*map_range)(struct pt_iommu *iommu_table, dma_addr_t iova, + phys_addr_t paddr, dma_addr_t len, unsigned int prot, + gfp_t gfp, size_t *mapped); + /** * @unmap_range: Make a range of IOVA empty/not present * @iommu_table: Table to manipulate @@ -224,10 +251,6 @@ struct pt_iommu_cfg { #define IOMMU_PROTOTYPES(fmt) \ phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \ dma_addr_t iova); \ - int pt_iommu_##fmt##_map_pages(struct iommu_domain *domain, \ - unsigned long iova, phys_addr_t paddr, \ - size_t pgsize, size_t pgcount, \ - int prot, gfp_t gfp, size_t *mapped); \ int pt_iommu_##fmt##_read_and_clear_dirty( \ struct iommu_domain *domain, unsigned long iova, size_t size, \ unsigned long flags, struct iommu_dirty_bitmap *dirty); \ @@ -248,8 +271,7 @@ struct pt_iommu_cfg { * iommu_pt */ #define IOMMU_PT_DOMAIN_OPS(fmt) \ - .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ - .map_pages = &pt_iommu_##fmt##_map_pages + .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys #define IOMMU_PT_DIRTY_OPS(fmt) \ .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty -- cgit v1.2.3 From a82efb8747d1b8a7c0a377dc79c2aac204eae788 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Tue, 17 Mar 2026 11:16:02 +0000 Subject: iommu: Add device ATS supported capability PCIe ATS may be disabled by platform firmware, root complex limitations, or kernel policy even when a device advertises the ATS capability in its PCI configuration space. Add a new IOMMU_CAP_PCI_ATS_SUPPORTED capability to allow IOMMU drivers to report the effective ATS decision for a device. When this capability is true for a device, ATS may be enabled for that device, but it does not imply that ATS is currently enabled. A subsequent patch will extend iommufd to expose the effective ATS status to userspace. Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Signed-off-by: Shameer Kolothum Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 6 ++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 +++ drivers/iommu/intel/iommu.c | 2 ++ include/linux/iommu.h | 2 ++ 4 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 81c4d7733872..f1814fee5182 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2985,6 +2985,12 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) return amd_iommu_hd_support(iommu); } + case IOMMU_CAP_PCI_ATS_SUPPORTED: { + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); + + return amd_iommu_iotlb_sup && + (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP); + } default: break; } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 4d00d796f078..dec5cac98f7c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -107,6 +107,7 @@ static const char * const event_class_str[] = { }; static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master); +static bool arm_smmu_ats_supported(struct arm_smmu_master *master); static void parse_driver_options(struct arm_smmu_device *smmu) { @@ -2494,6 +2495,8 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) return true; case IOMMU_CAP_DIRTY_TRACKING: return arm_smmu_dbm_capable(master->smmu); + case IOMMU_CAP_PCI_ATS_SUPPORTED: + return arm_smmu_ats_supported(master); default: return false; } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index ef7613b177b9..5dca8e525c73 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3220,6 +3220,8 @@ static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) return ecap_sc_support(info->iommu->ecap); case IOMMU_CAP_DIRTY_TRACKING: return ssads_supported(info->iommu); + case IOMMU_CAP_PCI_ATS_SUPPORTED: + return info->ats_supported; default: return false; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7ca648c01336..a904821ed169 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -272,6 +272,8 @@ enum iommu_cap { */ IOMMU_CAP_DEFERRED_FLUSH, IOMMU_CAP_DIRTY_TRACKING, /* IOMMU supports dirty tracking */ + /* ATS is supported and may be enabled for this device */ + IOMMU_CAP_PCI_ATS_SUPPORTED, }; /* These are the possible reserved region types */ -- cgit v1.2.3 From 2727d44f5d5bc3f8e55a6a0ccf24d8105a5a400e Mon Sep 17 00:00:00 2001 From: Kit Dallege Date: Sun, 15 Mar 2026 18:09:31 +0100 Subject: writeback: fix kernel-doc function name mismatch for wb_put_many() The kernel-doc comment says wb_put but the actual function is wb_put_many. Fix the name to match. Assisted-by: Claude:claude-opus-4-6 Signed-off-by: Kit Dallege Link: https://patch.msgid.link/20260315170931.65852-1-xaum.io@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/backing-dev-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c88fd4d37d1f..a06b93446d10 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -237,7 +237,7 @@ static inline void wb_get(struct bdi_writeback *wb) } /** - * wb_put - decrement a wb's refcount + * wb_put_many - decrement a wb's refcount * @wb: bdi_writeback to put * @nr: number of references to put */ -- cgit v1.2.3 From 9577c74c96f88d807d1ba005adbf5952e7127e55 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Thu, 12 Mar 2026 18:51:41 -0700 Subject: platform/x86/intel/vsec: Make driver_data info const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Treat PCI id->driver_data (intel_vsec_platform_info) as read-only by making vsec_priv->info a const pointer and updating all function signatures to accept const intel_vsec_platform_info *. This improves const-correctness and clarifies that the platform info data from the driver_data table is not meant to be modified at runtime. No functional changes intended. Signed-off-by: David E. Box Reviewed-by: Michael J. Ruhl Link: https://patch.msgid.link/20260313015202.3660072-3-david.e.box@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vsec.c | 20 ++++++++++---------- include/linux/intel_vsec.h | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index 46966edca03b..e0096be605d9 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -42,7 +42,7 @@ enum vsec_device_state { }; struct vsec_priv { - struct intel_vsec_platform_info *info; + const struct intel_vsec_platform_info *info; struct device *suppliers[VSEC_FEATURE_COUNT]; struct oobmsm_plat_info plat_info; enum vsec_device_state state[VSEC_FEATURE_COUNT]; @@ -270,7 +270,7 @@ cleanup_aux: EXPORT_SYMBOL_NS_GPL(intel_vsec_add_aux, "INTEL_VSEC"); static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *header, - struct intel_vsec_platform_info *info, + const struct intel_vsec_platform_info *info, unsigned long cap_id, u64 base_addr) { struct intel_vsec_device __free(kfree) *intel_vsec_dev = NULL; @@ -406,7 +406,7 @@ static int get_cap_id(u32 header_id, unsigned long *cap_id) static int intel_vsec_register_device(struct pci_dev *pdev, struct intel_vsec_header *header, - struct intel_vsec_platform_info *info, + const struct intel_vsec_platform_info *info, u64 base_addr) { const struct vsec_feature_dependency *consumer_deps; @@ -452,7 +452,7 @@ static int intel_vsec_register_device(struct pci_dev *pdev, } static bool intel_vsec_walk_header(struct pci_dev *pdev, - struct intel_vsec_platform_info *info) + const struct intel_vsec_platform_info *info) { struct intel_vsec_header **header = info->headers; bool have_devices = false; @@ -468,7 +468,7 @@ static bool intel_vsec_walk_header(struct pci_dev *pdev, } static bool intel_vsec_walk_dvsec(struct pci_dev *pdev, - struct intel_vsec_platform_info *info) + const struct intel_vsec_platform_info *info) { bool have_devices = false; int pos = 0; @@ -519,7 +519,7 @@ static bool intel_vsec_walk_dvsec(struct pci_dev *pdev, } static bool intel_vsec_walk_vsec(struct pci_dev *pdev, - struct intel_vsec_platform_info *info) + const struct intel_vsec_platform_info *info) { bool have_devices = false; int pos = 0; @@ -565,7 +565,7 @@ static bool intel_vsec_walk_vsec(struct pci_dev *pdev, } int intel_vsec_register(struct pci_dev *pdev, - struct intel_vsec_platform_info *info) + const struct intel_vsec_platform_info *info) { if (!pdev || !info || !info->headers) return -EINVAL; @@ -578,7 +578,7 @@ int intel_vsec_register(struct pci_dev *pdev, EXPORT_SYMBOL_NS_GPL(intel_vsec_register, "INTEL_VSEC"); static bool intel_vsec_get_features(struct pci_dev *pdev, - struct intel_vsec_platform_info *info) + const struct intel_vsec_platform_info *info) { bool found = false; @@ -622,7 +622,7 @@ static void intel_vsec_skip_missing_dependencies(struct pci_dev *pdev) static int intel_vsec_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - struct intel_vsec_platform_info *info; + const struct intel_vsec_platform_info *info; struct vsec_priv *priv; int num_caps, ret; int run_once = 0; @@ -633,7 +633,7 @@ static int intel_vsec_pci_probe(struct pci_dev *pdev, const struct pci_device_id return ret; pci_save_state(pdev); - info = (struct intel_vsec_platform_info *)id->driver_data; + info = (const struct intel_vsec_platform_info *)id->driver_data; if (!info) return -EINVAL; diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h index 1a0f357c2427..d551174b0049 100644 --- a/include/linux/intel_vsec.h +++ b/include/linux/intel_vsec.h @@ -200,13 +200,13 @@ static inline struct intel_vsec_device *auxdev_to_ivdev(struct auxiliary_device #if IS_ENABLED(CONFIG_INTEL_VSEC) int intel_vsec_register(struct pci_dev *pdev, - struct intel_vsec_platform_info *info); + const struct intel_vsec_platform_info *info); int intel_vsec_set_mapping(struct oobmsm_plat_info *plat_info, struct intel_vsec_device *vsec_dev); struct oobmsm_plat_info *intel_vsec_get_mapping(struct pci_dev *pdev); #else static inline int intel_vsec_register(struct pci_dev *pdev, - struct intel_vsec_platform_info *info) + const struct intel_vsec_platform_info *info) { return -ENODEV; } -- cgit v1.2.3 From c62fd96a04e4a7b847448f97ecfe9f3fe706e7b3 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Thu, 12 Mar 2026 18:51:42 -0700 Subject: platform/x86/intel/vsec: Decouple add/link helpers from PCI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This refactor prepares for adding ACPI-enumerated PMT endpoints. While intel_vsec is bound to PCI today, some helpers are used by code that will also register PMT endpoints from non-PCI (ACPI) paths. Clean up PCI-specific plumbing where it isn’t strictly required and rely on generic struct device where possible. Signed-off-by: David E. Box Reviewed-by: Ilpo Järvinen Reviewed-by: Michael J. Ruhl Link: https://patch.msgid.link/20260313015202.3660072-4-david.e.box@linux.intel.com Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vsec.c | 13 +++++++++---- drivers/platform/x86/intel/vsec_tpmi.c | 2 +- include/linux/intel_vsec.h | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index e0096be605d9..938648b9ef09 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -158,18 +158,23 @@ static bool vsec_driver_present(int cap_id) */ static const struct pci_device_id intel_vsec_pci_ids[]; -static int intel_vsec_link_devices(struct pci_dev *pdev, struct device *dev, +static int intel_vsec_link_devices(struct device *parent, struct device *dev, int consumer_id) { const struct vsec_feature_dependency *deps; enum vsec_device_state *state; struct device **suppliers; struct vsec_priv *priv; + struct pci_dev *pdev; int supplier_id; if (!consumer_id) return 0; + if (!dev_is_pci(parent)) + return 0; + + pdev = to_pci_dev(parent); if (!pci_match_id(intel_vsec_pci_ids, pdev)) return 0; @@ -204,7 +209,7 @@ static int intel_vsec_link_devices(struct pci_dev *pdev, struct device *dev, return 0; } -int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent, +int intel_vsec_add_aux(struct device *parent, struct intel_vsec_device *intel_vsec_dev, const char *name) { @@ -252,7 +257,7 @@ int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent, if (ret) goto cleanup_aux; - ret = intel_vsec_link_devices(pdev, &auxdev->dev, intel_vsec_dev->cap_id); + ret = intel_vsec_link_devices(parent, &auxdev->dev, intel_vsec_dev->cap_id); if (ret) goto cleanup_aux; @@ -343,7 +348,7 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he * Pass the ownership of intel_vsec_dev and resource within it to * intel_vsec_add_aux() */ - return intel_vsec_add_aux(pdev, parent, no_free_ptr(intel_vsec_dev), + return intel_vsec_add_aux(parent, no_free_ptr(intel_vsec_dev), intel_vsec_name(header->id)); } diff --git a/drivers/platform/x86/intel/vsec_tpmi.c b/drivers/platform/x86/intel/vsec_tpmi.c index 98846e88d3d0..2298b6361094 100644 --- a/drivers/platform/x86/intel/vsec_tpmi.c +++ b/drivers/platform/x86/intel/vsec_tpmi.c @@ -655,7 +655,7 @@ static int tpmi_create_device(struct intel_tpmi_info *tpmi_info, * feature_vsec_dev and res memory are also freed as part of * device deletion. */ - return intel_vsec_add_aux(vsec_dev->pcidev, &vsec_dev->auxdev.dev, + return intel_vsec_add_aux(&vsec_dev->auxdev.dev, feature_vsec_dev, feature_id_name); } diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h index d551174b0049..49a746ec0128 100644 --- a/include/linux/intel_vsec.h +++ b/include/linux/intel_vsec.h @@ -184,7 +184,7 @@ struct pmt_feature_group { struct telemetry_region regions[]; }; -int intel_vsec_add_aux(struct pci_dev *pdev, struct device *parent, +int intel_vsec_add_aux(struct device *parent, struct intel_vsec_device *intel_vsec_dev, const char *name); -- cgit v1.2.3 From 353042d54d82f6c46449f0ee38c244b5a13c1fe4 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Thu, 12 Mar 2026 18:51:43 -0700 Subject: platform/x86/intel/vsec: Switch exported helpers from pci_dev to device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Preparatory refactor for ACPI-enumerated PMT endpoints. Several exported PMT/VSEC interfaces and structs carried struct pci_dev * even though callers only need a generic struct device. Move those to struct device * so the same APIs work for PCI and ACPI parents. Acked-by: Rodrigo Vivi Signed-off-by: David E. Box Link: https://patch.msgid.link/20260313015202.3660072-5-david.e.box@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/gpu/drm/xe/xe_debugfs.c | 2 +- drivers/gpu/drm/xe/xe_hwmon.c | 2 +- drivers/gpu/drm/xe/xe_vsec.c | 7 ++-- drivers/gpu/drm/xe/xe_vsec.h | 4 +-- drivers/platform/x86/intel/pmc/core.c | 4 +-- drivers/platform/x86/intel/pmc/ssram_telemetry.c | 2 +- drivers/platform/x86/intel/pmt/class.c | 8 ++--- drivers/platform/x86/intel/pmt/class.h | 5 +-- drivers/platform/x86/intel/pmt/discovery.c | 4 +-- drivers/platform/x86/intel/pmt/telemetry.c | 13 +++---- drivers/platform/x86/intel/pmt/telemetry.h | 12 +++---- drivers/platform/x86/intel/sdsi.c | 5 +-- drivers/platform/x86/intel/vsec.c | 44 ++++++++++++++---------- drivers/platform/x86/intel/vsec_tpmi.c | 6 ++-- include/linux/intel_vsec.h | 13 +++---- 15 files changed, 71 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c index 844cfafe1ec7..ad2d8f179eb6 100644 --- a/drivers/gpu/drm/xe/xe_debugfs.c +++ b/drivers/gpu/drm/xe/xe_debugfs.c @@ -45,7 +45,7 @@ static void read_residency_counter(struct xe_device *xe, struct xe_mmio *mmio, u64 residency = 0; int ret; - ret = xe_pmt_telem_read(to_pci_dev(xe->drm.dev), + ret = xe_pmt_telem_read(xe->drm.dev, xe_mmio_read32(mmio, PUNIT_TELEMETRY_GUID), &residency, offset, sizeof(residency)); if (ret != sizeof(residency)) { diff --git a/drivers/gpu/drm/xe/xe_hwmon.c b/drivers/gpu/drm/xe/xe_hwmon.c index 0fd4d4f1014a..92e423a339f1 100644 --- a/drivers/gpu/drm/xe/xe_hwmon.c +++ b/drivers/gpu/drm/xe/xe_hwmon.c @@ -506,7 +506,7 @@ xe_hwmon_energy_get(struct xe_hwmon *hwmon, int channel, long *energy) if (hwmon->xe->info.platform == XE_BATTLEMAGE) { u64 pmt_val; - ret = xe_pmt_telem_read(to_pci_dev(hwmon->xe->drm.dev), + ret = xe_pmt_telem_read(hwmon->xe->drm.dev, xe_mmio_read32(mmio, PUNIT_TELEMETRY_GUID), &pmt_val, BMG_ENERGY_STATUS_PMT_OFFSET, sizeof(pmt_val)); if (ret != sizeof(pmt_val)) { diff --git a/drivers/gpu/drm/xe/xe_vsec.c b/drivers/gpu/drm/xe/xe_vsec.c index 4ebb4dbe1c9b..a9baf0bfe572 100644 --- a/drivers/gpu/drm/xe/xe_vsec.c +++ b/drivers/gpu/drm/xe/xe_vsec.c @@ -140,10 +140,10 @@ static int xe_guid_decode(u32 guid, int *index, u32 *offset) return 0; } -int xe_pmt_telem_read(struct pci_dev *pdev, u32 guid, u64 *data, loff_t user_offset, +int xe_pmt_telem_read(struct device *dev, u32 guid, u64 *data, loff_t user_offset, u32 count) { - struct xe_device *xe = pdev_to_xe_device(pdev); + struct xe_device *xe = kdev_to_xe_device(dev); void __iomem *telem_addr = xe->mmio.regs + BMG_TELEMETRY_OFFSET; u32 mem_region; u32 offset; @@ -198,7 +198,6 @@ void xe_vsec_init(struct xe_device *xe) { struct intel_vsec_platform_info *info; struct device *dev = xe->drm.dev; - struct pci_dev *pdev = to_pci_dev(dev); enum xe_vsec platform; platform = get_platform_info(xe); @@ -221,6 +220,6 @@ void xe_vsec_init(struct xe_device *xe) * Register a VSEC. Cleanup is handled using device managed * resources. */ - intel_vsec_register(pdev, info); + intel_vsec_register(dev, info); } MODULE_IMPORT_NS("INTEL_VSEC"); diff --git a/drivers/gpu/drm/xe/xe_vsec.h b/drivers/gpu/drm/xe/xe_vsec.h index dabfb4e02d70..a25b4e6e681b 100644 --- a/drivers/gpu/drm/xe/xe_vsec.h +++ b/drivers/gpu/drm/xe/xe_vsec.h @@ -6,10 +6,10 @@ #include -struct pci_dev; +struct device; struct xe_device; void xe_vsec_init(struct xe_device *xe); -int xe_pmt_telem_read(struct pci_dev *pdev, u32 guid, u64 *data, loff_t user_offset, u32 count); +int xe_pmt_telem_read(struct device *dev, u32 guid, u64 *data, loff_t user_offset, u32 count); #endif diff --git a/drivers/platform/x86/intel/pmc/core.c b/drivers/platform/x86/intel/pmc/core.c index 02b303418d18..d91e1ab842d6 100644 --- a/drivers/platform/x86/intel/pmc/core.c +++ b/drivers/platform/x86/intel/pmc/core.c @@ -1315,7 +1315,7 @@ static struct telem_endpoint *pmc_core_register_endpoint(struct pci_dev *pcidev, unsigned int i; for (i = 0; guids[i]; i++) { - ep = pmt_telem_find_and_register_endpoint(pcidev, guids[i], 0); + ep = pmt_telem_find_and_register_endpoint(&pcidev->dev, guids[i], 0); if (!IS_ERR(ep)) return ep; } @@ -1600,7 +1600,7 @@ static int pmc_core_get_telem_info(struct pmc_dev *pmcdev, struct pmc_dev_info * if (!pmc->map->lpm_req_guid) return -ENXIO; - ep = pmt_telem_find_and_register_endpoint(pcidev, pmc->map->lpm_req_guid, 0); + ep = pmt_telem_find_and_register_endpoint(&pcidev->dev, pmc->map->lpm_req_guid, 0); if (IS_ERR(ep)) { dev_dbg(&pmcdev->pdev->dev, "couldn't get telem endpoint %pe", ep); return -EPROBE_DEFER; diff --git a/drivers/platform/x86/intel/pmc/ssram_telemetry.c b/drivers/platform/x86/intel/pmc/ssram_telemetry.c index 03fad9331fc0..6f6e83e70fc5 100644 --- a/drivers/platform/x86/intel/pmc/ssram_telemetry.c +++ b/drivers/platform/x86/intel/pmc/ssram_telemetry.c @@ -60,7 +60,7 @@ pmc_ssram_telemetry_add_pmt(struct pci_dev *pcidev, u64 ssram_base, void __iomem info.base_addr = ssram_base; info.parent = &pcidev->dev; - return intel_vsec_register(pcidev, &info); + return intel_vsec_register(&pcidev->dev, &info); } static inline u64 get_base(void __iomem *addr, u32 offset) diff --git a/drivers/platform/x86/intel/pmt/class.c b/drivers/platform/x86/intel/pmt/class.c index be3c8d9e4fff..b4c9964df807 100644 --- a/drivers/platform/x86/intel/pmt/class.c +++ b/drivers/platform/x86/intel/pmt/class.c @@ -60,11 +60,11 @@ pmt_memcpy64_fromio(void *to, const u64 __iomem *from, size_t count) return count; } -int pmt_telem_read_mmio(struct pci_dev *pdev, struct pmt_callbacks *cb, u32 guid, void *buf, +int pmt_telem_read_mmio(struct device *dev, struct pmt_callbacks *cb, u32 guid, void *buf, void __iomem *addr, loff_t off, u32 count) { if (cb && cb->read_telem) - return cb->read_telem(pdev, guid, buf, off, count); + return cb->read_telem(dev, guid, buf, off, count); addr += off; @@ -99,7 +99,7 @@ intel_pmt_read(struct file *filp, struct kobject *kobj, if (count > entry->size - off) count = entry->size - off; - count = pmt_telem_read_mmio(entry->pcidev, entry->cb, entry->header.guid, buf, + count = pmt_telem_read_mmio(entry->ep->dev, entry->cb, entry->header.guid, buf, entry->base, off, count); return count; @@ -208,7 +208,7 @@ static int intel_pmt_populate_entry(struct intel_pmt_entry *entry, struct intel_vsec_device *ivdev, struct resource *disc_res) { - struct pci_dev *pci_dev = ivdev->pcidev; + struct pci_dev *pci_dev = to_pci_dev(ivdev->dev); struct device *dev = &ivdev->auxdev.dev; struct intel_pmt_header *header = &entry->header; u8 bir; diff --git a/drivers/platform/x86/intel/pmt/class.h b/drivers/platform/x86/intel/pmt/class.h index 3c5ad5f52bca..1ae56a5baad2 100644 --- a/drivers/platform/x86/intel/pmt/class.h +++ b/drivers/platform/x86/intel/pmt/class.h @@ -19,11 +19,12 @@ #define GET_BIR(v) ((v) & GENMASK(2, 0)) #define GET_ADDRESS(v) ((v) & GENMASK(31, 3)) +struct device; struct pci_dev; extern struct class intel_pmt_class; struct telem_endpoint { - struct pci_dev *pcidev; + struct device *dev; struct telem_header header; struct pmt_callbacks *cb; void __iomem *base; @@ -65,7 +66,7 @@ struct intel_pmt_namespace { struct intel_pmt_entry *entry); }; -int pmt_telem_read_mmio(struct pci_dev *pdev, struct pmt_callbacks *cb, u32 guid, void *buf, +int pmt_telem_read_mmio(struct device *dev, struct pmt_callbacks *cb, u32 guid, void *buf, void __iomem *addr, loff_t off, u32 count); bool intel_pmt_is_early_client_hw(struct device *dev); int intel_pmt_dev_create(struct intel_pmt_entry *entry, diff --git a/drivers/platform/x86/intel/pmt/discovery.c b/drivers/platform/x86/intel/pmt/discovery.c index e500aa327d23..c482368bfaae 100644 --- a/drivers/platform/x86/intel/pmt/discovery.c +++ b/drivers/platform/x86/intel/pmt/discovery.c @@ -542,7 +542,7 @@ static int pmt_features_probe(struct auxiliary_device *auxdev, const struct auxi if (!priv) return -ENOMEM; - priv->parent = &ivdev->pcidev->dev; + priv->parent = ivdev->dev; auxiliary_set_drvdata(auxdev, priv); priv->dev = device_create(&intel_pmt_class, &auxdev->dev, MKDEV(0, 0), priv, @@ -609,7 +609,7 @@ void intel_pmt_get_features(struct intel_pmt_entry *entry) mutex_lock(&feature_list_lock); list_for_each_entry(feature, &pmt_feature_list, list) { - if (feature->priv->parent != &entry->ep->pcidev->dev) + if (feature->priv->parent != entry->ep->dev) continue; pmt_get_features(entry, feature); diff --git a/drivers/platform/x86/intel/pmt/telemetry.c b/drivers/platform/x86/intel/pmt/telemetry.c index a52803bfe124..bdc7c24a3678 100644 --- a/drivers/platform/x86/intel/pmt/telemetry.c +++ b/drivers/platform/x86/intel/pmt/telemetry.c @@ -112,7 +112,7 @@ static int pmt_telem_add_endpoint(struct intel_vsec_device *ivdev, return -ENOMEM; ep = entry->ep; - ep->pcidev = ivdev->pcidev; + ep->dev = ivdev->dev; ep->header.access_type = entry->header.access_type; ep->header.guid = entry->header.guid; ep->header.base_offset = entry->header.base_offset; @@ -204,7 +204,7 @@ int pmt_telem_get_endpoint_info(int devid, struct telem_endpoint_info *info) goto unlock; } - info->pdev = entry->ep->pcidev; + info->dev = entry->ep->dev; info->header = entry->ep->header; unlock: @@ -218,9 +218,10 @@ static int pmt_copy_region(struct telemetry_region *region, struct intel_pmt_entry *entry) { + struct pci_dev *pdev = to_pci_dev(entry->ep->dev); struct oobmsm_plat_info *plat_info; - plat_info = intel_vsec_get_mapping(entry->ep->pcidev); + plat_info = intel_vsec_get_mapping(pdev); if (IS_ERR(plat_info)) return PTR_ERR(plat_info); @@ -308,7 +309,7 @@ int pmt_telem_read(struct telem_endpoint *ep, u32 id, u64 *data, u32 count) if (offset + NUM_BYTES_QWORD(count) > size) return -EINVAL; - pmt_telem_read_mmio(ep->pcidev, ep->cb, ep->header.guid, data, ep->base, offset, + pmt_telem_read_mmio(ep->dev, ep->cb, ep->header.guid, data, ep->base, offset, NUM_BYTES_QWORD(count)); return ep->present ? 0 : -EPIPE; @@ -335,7 +336,7 @@ int pmt_telem_read32(struct telem_endpoint *ep, u32 id, u32 *data, u32 count) EXPORT_SYMBOL_NS_GPL(pmt_telem_read32, "INTEL_PMT_TELEMETRY"); struct telem_endpoint * -pmt_telem_find_and_register_endpoint(struct pci_dev *pcidev, u32 guid, u16 pos) +pmt_telem_find_and_register_endpoint(struct device *dev, u32 guid, u16 pos) { int devid = 0; int inst = 0; @@ -348,7 +349,7 @@ pmt_telem_find_and_register_endpoint(struct pci_dev *pcidev, u32 guid, u16 pos) if (err) return ERR_PTR(err); - if (ep_info.header.guid == guid && ep_info.pdev == pcidev) { + if (ep_info.header.guid == guid && ep_info.dev == dev) { if (inst == pos) return pmt_telem_register_endpoint(devid); ++inst; diff --git a/drivers/platform/x86/intel/pmt/telemetry.h b/drivers/platform/x86/intel/pmt/telemetry.h index d45af5512b4e..0f88c5e7d90e 100644 --- a/drivers/platform/x86/intel/pmt/telemetry.h +++ b/drivers/platform/x86/intel/pmt/telemetry.h @@ -6,8 +6,8 @@ #define PMT_TELEM_TELEMETRY 0 #define PMT_TELEM_CRASHLOG 1 +struct device; struct telem_endpoint; -struct pci_dev; struct telem_header { u8 access_type; @@ -17,7 +17,7 @@ struct telem_header { }; struct telem_endpoint_info { - struct pci_dev *pdev; + struct device *dev; struct telem_header header; }; @@ -71,8 +71,8 @@ int pmt_telem_get_endpoint_info(int devid, struct telem_endpoint_info *info); /** * pmt_telem_find_and_register_endpoint() - Get a telemetry endpoint from - * pci_dev device, guid and pos - * @pdev: PCI device inside the Intel vsec + * device, guid and pos + * @dev: device inside the Intel vsec * @guid: GUID of the telemetry space * @pos: Instance of the guid * @@ -80,8 +80,8 @@ int pmt_telem_get_endpoint_info(int devid, struct telem_endpoint_info *info); * * endpoint - On success returns pointer to the telemetry endpoint * * -ENXIO - telemetry endpoint not found */ -struct telem_endpoint *pmt_telem_find_and_register_endpoint(struct pci_dev *pcidev, - u32 guid, u16 pos); +struct telem_endpoint * +pmt_telem_find_and_register_endpoint(struct device *dev, u32 guid, u16 pos); /** * pmt_telem_read() - Read qwords from counter sram using sample id diff --git a/drivers/platform/x86/intel/sdsi.c b/drivers/platform/x86/intel/sdsi.c index da75f53d0bcc..d7e37d4ace23 100644 --- a/drivers/platform/x86/intel/sdsi.c +++ b/drivers/platform/x86/intel/sdsi.c @@ -599,13 +599,14 @@ static int sdsi_get_layout(struct sdsi_priv *priv, struct disc_table *table) return 0; } -static int sdsi_map_mbox_registers(struct sdsi_priv *priv, struct pci_dev *parent, +static int sdsi_map_mbox_registers(struct sdsi_priv *priv, struct device *dev, struct disc_table *disc_table, struct resource *disc_res) { u32 access_type = FIELD_GET(DT_ACCESS_TYPE, disc_table->access_info); u32 size = FIELD_GET(DT_SIZE, disc_table->access_info); u32 tbir = FIELD_GET(DT_TBIR, disc_table->offset); u32 offset = DT_OFFSET(disc_table->offset); + struct pci_dev *parent = to_pci_dev(dev); struct resource res = {}; /* Starting location of SDSi MMIO region based on access type */ @@ -681,7 +682,7 @@ static int sdsi_probe(struct auxiliary_device *auxdev, const struct auxiliary_de return ret; /* Map the SDSi mailbox registers */ - ret = sdsi_map_mbox_registers(priv, intel_cap_dev->pcidev, &disc_table, disc_res); + ret = sdsi_map_mbox_registers(priv, intel_cap_dev->dev, &disc_table, disc_res); if (ret) return ret; diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index 938648b9ef09..a547e4b98245 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -274,7 +274,7 @@ cleanup_aux: } EXPORT_SYMBOL_NS_GPL(intel_vsec_add_aux, "INTEL_VSEC"); -static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *header, +static int intel_vsec_add_dev(struct device *dev, struct intel_vsec_header *header, const struct intel_vsec_platform_info *info, unsigned long cap_id, u64 base_addr) { @@ -288,18 +288,18 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he if (info->parent) parent = info->parent; else - parent = &pdev->dev; + parent = dev; if (!intel_vsec_supported(header->id, info->caps)) return -EINVAL; if (!header->num_entries) { - dev_dbg(&pdev->dev, "Invalid 0 entry count for header id %d\n", header->id); + dev_dbg(dev, "Invalid 0 entry count for header id %d\n", header->id); return -EINVAL; } if (!header->entry_size) { - dev_dbg(&pdev->dev, "Invalid 0 entry size for header id %d\n", header->id); + dev_dbg(dev, "Invalid 0 entry size for header id %d\n", header->id); return -EINVAL; } @@ -331,7 +331,7 @@ static int intel_vsec_add_dev(struct pci_dev *pdev, struct intel_vsec_header *he release_mem_region(tmp->start, resource_size(tmp)); } - intel_vsec_dev->pcidev = pdev; + intel_vsec_dev->dev = dev; intel_vsec_dev->resource = no_free_ptr(res); intel_vsec_dev->num_resources = header->num_entries; intel_vsec_dev->quirks = info->quirks; @@ -409,13 +409,14 @@ static int get_cap_id(u32 header_id, unsigned long *cap_id) return 0; } -static int intel_vsec_register_device(struct pci_dev *pdev, +static int intel_vsec_register_device(struct device *dev, struct intel_vsec_header *header, const struct intel_vsec_platform_info *info, u64 base_addr) { const struct vsec_feature_dependency *consumer_deps; struct vsec_priv *priv; + struct pci_dev *pdev; unsigned long cap_id; int ret; @@ -427,8 +428,12 @@ static int intel_vsec_register_device(struct pci_dev *pdev, * Only track dependencies for devices probed by the VSEC driver. * For others using the exported APIs, add the device directly. */ + if (!dev_is_pci(dev)) + return intel_vsec_add_dev(dev, header, info, cap_id, base_addr); + + pdev = to_pci_dev(dev); if (!pci_match_id(intel_vsec_pci_ids, pdev)) - return intel_vsec_add_dev(pdev, header, info, cap_id, base_addr); + return intel_vsec_add_dev(dev, header, info, cap_id, base_addr); priv = pci_get_drvdata(pdev); if (priv->state[cap_id] == STATE_REGISTERED || @@ -444,7 +449,7 @@ static int intel_vsec_register_device(struct pci_dev *pdev, consumer_deps = get_consumer_dependencies(priv, cap_id); if (!consumer_deps || suppliers_ready(priv, consumer_deps, cap_id)) { - ret = intel_vsec_add_dev(pdev, header, info, cap_id, base_addr); + ret = intel_vsec_add_dev(dev, header, info, cap_id, base_addr); if (ret) priv->state[cap_id] = STATE_SKIP; else @@ -456,7 +461,7 @@ static int intel_vsec_register_device(struct pci_dev *pdev, return -EAGAIN; } -static bool intel_vsec_walk_header(struct pci_dev *pdev, +static bool intel_vsec_walk_header(struct device *dev, const struct intel_vsec_platform_info *info) { struct intel_vsec_header **header = info->headers; @@ -464,7 +469,7 @@ static bool intel_vsec_walk_header(struct pci_dev *pdev, int ret; for ( ; *header; header++) { - ret = intel_vsec_register_device(pdev, *header, info, info->base_addr); + ret = intel_vsec_register_device(dev, *header, info, info->base_addr); if (!ret) have_devices = true; } @@ -512,7 +517,7 @@ static bool intel_vsec_walk_dvsec(struct pci_dev *pdev, pci_read_config_dword(pdev, pos + PCI_DVSEC_HEADER2, &hdr); header.id = PCI_DVSEC_HEADER2_ID(hdr); - ret = intel_vsec_register_device(pdev, &header, info, + ret = intel_vsec_register_device(&pdev->dev, &header, info, pci_resource_start(pdev, header.tbir)); if (ret) continue; @@ -558,7 +563,7 @@ static bool intel_vsec_walk_vsec(struct pci_dev *pdev, header.tbir = INTEL_DVSEC_TABLE_BAR(table); header.offset = INTEL_DVSEC_TABLE_OFFSET(table); - ret = intel_vsec_register_device(pdev, &header, info, + ret = intel_vsec_register_device(&pdev->dev, &header, info, pci_resource_start(pdev, header.tbir)); if (ret) continue; @@ -569,13 +574,13 @@ static bool intel_vsec_walk_vsec(struct pci_dev *pdev, return have_devices; } -int intel_vsec_register(struct pci_dev *pdev, +int intel_vsec_register(struct device *dev, const struct intel_vsec_platform_info *info) { - if (!pdev || !info || !info->headers) + if (!dev || !info || !info->headers) return -EINVAL; - if (!intel_vsec_walk_header(pdev, info)) + if (!intel_vsec_walk_header(dev, info)) return -ENODEV; else return 0; @@ -601,7 +606,7 @@ static bool intel_vsec_get_features(struct pci_dev *pdev, found = true; if (info && (info->quirks & VSEC_QUIRK_NO_DVSEC) && - intel_vsec_walk_header(pdev, info)) + intel_vsec_walk_header(&pdev->dev, info)) found = true; return found; @@ -673,7 +678,10 @@ int intel_vsec_set_mapping(struct oobmsm_plat_info *plat_info, { struct vsec_priv *priv; - priv = pci_get_drvdata(vsec_dev->pcidev); + if (!dev_is_pci(vsec_dev->dev)) + return -ENODEV; + + priv = pci_get_drvdata(to_pci_dev(vsec_dev->dev)); if (!priv) return -EINVAL; @@ -821,7 +829,7 @@ static pci_ers_result_t intel_vsec_pci_slot_reset(struct pci_dev *pdev) xa_for_each(&auxdev_array, index, intel_vsec_dev) { /* check if pdev doesn't match */ - if (pdev != intel_vsec_dev->pcidev) + if (&pdev->dev != intel_vsec_dev->dev) continue; devm_release_action(&pdev->dev, intel_vsec_remove_aux, &intel_vsec_dev->auxdev); diff --git a/drivers/platform/x86/intel/vsec_tpmi.c b/drivers/platform/x86/intel/vsec_tpmi.c index 2298b6361094..9dddf4e5863e 100644 --- a/drivers/platform/x86/intel/vsec_tpmi.c +++ b/drivers/platform/x86/intel/vsec_tpmi.c @@ -530,7 +530,7 @@ static const struct file_operations mem_write_ops = { .release = single_release, }; -#define tpmi_to_dev(info) (&info->vsec_dev->pcidev->dev) +#define tpmi_to_dev(info) ((info)->vsec_dev->dev) static void tpmi_dbgfs_register(struct intel_tpmi_info *tpmi_info) { @@ -642,7 +642,7 @@ static int tpmi_create_device(struct intel_tpmi_info *tpmi_info, tmp->flags = IORESOURCE_MEM; } - feature_vsec_dev->pcidev = vsec_dev->pcidev; + feature_vsec_dev->dev = vsec_dev->dev; feature_vsec_dev->resource = res; feature_vsec_dev->num_resources = pfs->pfs_header.num_entries; feature_vsec_dev->priv_data = &tpmi_info->plat_info; @@ -742,7 +742,7 @@ static int tpmi_fetch_pfs_header(struct intel_tpmi_pm_feature *pfs, u64 start, i static int intel_vsec_tpmi_init(struct auxiliary_device *auxdev) { struct intel_vsec_device *vsec_dev = auxdev_to_ivdev(auxdev); - struct pci_dev *pci_dev = vsec_dev->pcidev; + struct pci_dev *pci_dev = to_pci_dev(vsec_dev->dev); struct intel_tpmi_info *tpmi_info; u64 pfs_start = 0; int ret, i; diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h index 49a746ec0128..4eecb2a6bac4 100644 --- a/include/linux/intel_vsec.h +++ b/include/linux/intel_vsec.h @@ -29,6 +29,7 @@ #define INTEL_DVSEC_TABLE_OFFSET(x) ((x) & GENMASK(31, 3)) #define TABLE_OFFSET_SHIFT 3 +struct device; struct pci_dev; struct resource; @@ -82,14 +83,14 @@ enum intel_vsec_quirks { * struct pmt_callbacks - Callback infrastructure for PMT devices * @read_telem: when specified, called by client driver to access PMT * data (instead of direct copy). - * * pdev: PCI device reference for the callback's use + * * dev: device reference for the callback's use * * guid: ID of data to acccss * * data: buffer for the data to be copied * * off: offset into the requested buffer * * count: size of buffer */ struct pmt_callbacks { - int (*read_telem)(struct pci_dev *pdev, u32 guid, u64 *data, loff_t off, u32 count); + int (*read_telem)(struct device *dev, u32 guid, u64 *data, loff_t off, u32 count); }; struct vsec_feature_dependency { @@ -122,7 +123,7 @@ struct intel_vsec_platform_info { /** * struct intel_vsec_device - Auxbus specific device information * @auxdev: auxbus device struct for auxbus access - * @pcidev: pci device associated with the device + * @dev: struct device associated with the device * @resource: any resources shared by the parent * @ida: id reference * @num_resources: number of resources @@ -135,7 +136,7 @@ struct intel_vsec_platform_info { */ struct intel_vsec_device { struct auxiliary_device auxdev; - struct pci_dev *pcidev; + struct device *dev; struct resource *resource; struct ida *ida; int num_resources; @@ -199,13 +200,13 @@ static inline struct intel_vsec_device *auxdev_to_ivdev(struct auxiliary_device } #if IS_ENABLED(CONFIG_INTEL_VSEC) -int intel_vsec_register(struct pci_dev *pdev, +int intel_vsec_register(struct device *dev, const struct intel_vsec_platform_info *info); int intel_vsec_set_mapping(struct oobmsm_plat_info *plat_info, struct intel_vsec_device *vsec_dev); struct oobmsm_plat_info *intel_vsec_get_mapping(struct pci_dev *pdev); #else -static inline int intel_vsec_register(struct pci_dev *pdev, +static inline int intel_vsec_register(struct device *dev, const struct intel_vsec_platform_info *info) { return -ENODEV; -- cgit v1.2.3 From 22fa2ebc11a164e1ea529da6c356e3e01aef8ac8 Mon Sep 17 00:00:00 2001 From: "David E. Box" Date: Thu, 12 Mar 2026 18:51:45 -0700 Subject: platform/x86/intel/vsec: Plumb ACPI PMT discovery tables through vsec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some platforms expose PMT discovery via ACPI instead of PCI BARs. Add a generic discovery source flag and carry ACPI discovery entries alongside the existing PCI resource path so PMT clients can consume either. Changes: - Add enum intel_vsec_disc_source { _PCI, _ACPI }. - Extend intel_vsec_platform_info and intel_vsec_device with source enum and ACPI discovery table pointer/ - When src==ACPI, skip BAR resource setup and copy the ACPI discovery entries into the aux device. No user-visible behavior change yet; this only wires ACPI data through vsec in preparation for ACPI-enumerated PMT clients. Signed-off-by: David E. Box Link: https://patch.msgid.link/20260313015202.3660072-7-david.e.box@linux.intel.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- drivers/platform/x86/intel/vsec.c | 23 +++++++++++++++++++++++ include/linux/intel_vsec.h | 20 +++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/intel/vsec.c b/drivers/platform/x86/intel/vsec.c index 34b2c19ecff0..7d5dbc1c1d05 100644 --- a/drivers/platform/x86/intel/vsec.c +++ b/drivers/platform/x86/intel/vsec.c @@ -24,7 +24,9 @@ #include #include #include +#include #include +#include #include #define PMT_XA_START 0 @@ -109,6 +111,7 @@ static void intel_vsec_dev_release(struct device *dev) ida_free(intel_vsec_dev->ida, intel_vsec_dev->auxdev.id); + kfree(intel_vsec_dev->acpi_disc); kfree(intel_vsec_dev->resource); kfree(intel_vsec_dev); } @@ -320,6 +323,13 @@ static int intel_vsec_add_dev(struct device *dev, struct intel_vsec_header *head * auxiliary device driver. */ for (i = 0, tmp = res; i < header->num_entries; i++, tmp++) { + /* + * Skip resource mapping check for ACPI-based discovery + * since those tables are read from _DSD, not MMIO. + */ + if (info->src == INTEL_VSEC_DISC_ACPI) + break; + tmp->start = base_addr + header->offset + i * (header->entry_size * sizeof(u32)); tmp->end = tmp->start + (header->entry_size * sizeof(u32)) - 1; tmp->flags = IORESOURCE_MEM; @@ -338,6 +348,19 @@ static int intel_vsec_add_dev(struct device *dev, struct intel_vsec_header *head intel_vsec_dev->base_addr = info->base_addr; intel_vsec_dev->priv_data = info->priv_data; intel_vsec_dev->cap_id = cap_id; + intel_vsec_dev->src = info->src; + + if (info->src == INTEL_VSEC_DISC_ACPI) { + size_t bytes; + + if (check_mul_overflow(intel_vsec_dev->num_resources, + sizeof(*info->acpi_disc), &bytes)) + return -EOVERFLOW; + + intel_vsec_dev->acpi_disc = kmemdup(info->acpi_disc, bytes, GFP_KERNEL); + if (!intel_vsec_dev->acpi_disc) + return -ENOMEM; + } if (header->id == VSEC_ID_SDSI) intel_vsec_dev->ida = &intel_vsec_sdsi_ida; diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h index 4eecb2a6bac4..1fe5665a9d02 100644 --- a/include/linux/intel_vsec.h +++ b/include/linux/intel_vsec.h @@ -33,6 +33,11 @@ struct device; struct pci_dev; struct resource; +enum intel_vsec_disc_source { + INTEL_VSEC_DISC_PCI, /* PCI, default */ + INTEL_VSEC_DISC_ACPI, /* ACPI */ +}; + enum intel_vsec_id { VSEC_ID_TELEMETRY = 2, VSEC_ID_WATCHER = 3, @@ -103,6 +108,10 @@ struct vsec_feature_dependency { * @parent: parent device in the auxbus chain * @headers: list of headers to define the PMT client devices to create * @deps: array of feature dependencies + * @acpi_disc: ACPI discovery tables, each entry is two QWORDs + * in little-endian format as defined by the PMT ACPI spec. + * Valid only when @provider == INTEL_VSEC_DISC_ACPI. + * @src: source of discovery table data * @priv_data: private data, usable by parent devices, currently a callback * @caps: bitmask of PMT capabilities for the given headers * @quirks: bitmask of VSEC device quirks @@ -113,6 +122,8 @@ struct intel_vsec_platform_info { struct device *parent; struct intel_vsec_header **headers; const struct vsec_feature_dependency *deps; + u32 (*acpi_disc)[4]; + enum intel_vsec_disc_source src; void *priv_data; unsigned long caps; unsigned long quirks; @@ -124,7 +135,12 @@ struct intel_vsec_platform_info { * struct intel_vsec_device - Auxbus specific device information * @auxdev: auxbus device struct for auxbus access * @dev: struct device associated with the device - * @resource: any resources shared by the parent + * @resource: PCI discovery resources (BAR windows), one per discovery + * instance. Valid only when @src == INTEL_VSEC_DISC_PCI + * @acpi_disc: ACPI discovery tables, each entry is two QWORDs + * in little-endian format as defined by the PMT ACPI spec. + * Valid only when @src == INTEL_VSEC_DISC_ACPI. + * @src: source of discovery table data * @ida: id reference * @num_resources: number of resources * @id: xarray id @@ -138,6 +154,8 @@ struct intel_vsec_device { struct auxiliary_device auxdev; struct device *dev; struct resource *resource; + u32 (*acpi_disc)[4]; + enum intel_vsec_disc_source src; struct ida *ida; int num_resources; int id; /* xa */ -- cgit v1.2.3 From bb8539e0e60916ef3ed4a92eb2f3cfd8e34061ef Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Mon, 16 Mar 2026 17:28:23 +0800 Subject: ppp: require callers of ppp_dev_name() to hold RCU ppp_dev_name() holds the RCU read lock internally to protect pch->ppp. However, as it returns netdev->name to the caller, the caller should also hold either RCU or RTNL lock to prevent the netdev from being freed. The only two references of the function is in the L2TP driver, both of which already hold RCU. So remove the internal RCU lock and document that callers must hold RCU. Signed-off-by: Qingfang Deng Reviewed-by: Breno Leitao Link: https://patch.msgid.link/20260316092824.479149-1-dqfext@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ppp/ppp_generic.c | 3 +-- include/linux/ppp_channel.h | 4 +++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index a036ddfe327b..cb29a6968c63 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -2969,6 +2969,7 @@ int ppp_unit_number(struct ppp_channel *chan) /* * Return the PPP device interface name of a channel. + * Caller must hold RCU read lock. */ char *ppp_dev_name(struct ppp_channel *chan) { @@ -2977,11 +2978,9 @@ char *ppp_dev_name(struct ppp_channel *chan) struct ppp *ppp; if (pch) { - rcu_read_lock(); ppp = rcu_dereference(pch->ppp); if (ppp && ppp->dev) name = ppp->dev->name; - rcu_read_unlock(); } return name; } diff --git a/include/linux/ppp_channel.h b/include/linux/ppp_channel.h index ca8ad03eeef0..2f63e9a6cc88 100644 --- a/include/linux/ppp_channel.h +++ b/include/linux/ppp_channel.h @@ -72,7 +72,9 @@ extern int ppp_channel_index(struct ppp_channel *); /* Get the unit number associated with a channel, or -1 if none */ extern int ppp_unit_number(struct ppp_channel *); -/* Get the device name associated with a channel, or NULL if none */ +/* Get the device name associated with a channel, or NULL if none. + * Caller must hold RCU read lock. + */ extern char *ppp_dev_name(struct ppp_channel *); /* -- cgit v1.2.3 From b520c4eef83dd406591431f936de0908c3ed7fb9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Mar 2026 17:11:30 +0100 Subject: block: split bio_alloc_bioset more clearly into a fast and slowpath bio_alloc_bioset tries non-waiting slab allocations first for the bio and bvec array, but does so in a somewhat convoluted way. Restructure the function so that it first open codes these slab allocations, and then falls back to the mempools with the original gfp mask. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni -ck Reviewed-by: Martin K. Petersen Link: https://patch.msgid.link/20260316161144.1607877-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 180 +++++++++++++++++++++------------------------------- include/linux/bio.h | 3 +- 2 files changed, 74 insertions(+), 109 deletions(-) (limited to 'include/linux') diff --git a/block/bio.c b/block/bio.c index 6131ccb7284a..5982bf069cef 100644 --- a/block/bio.c +++ b/block/bio.c @@ -176,43 +176,12 @@ static void bvec_free(struct mempool *pool, struct bio_vec *bv, * Make the first allocation restricted and don't dump info on allocation * failures, since we'll fall back to the mempool in case of failure. */ -static inline gfp_t bvec_alloc_gfp(gfp_t gfp) +static inline gfp_t try_alloc_gfp(gfp_t gfp) { return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; } -static struct bio_vec *bvec_alloc(struct mempool *pool, unsigned short *nr_vecs, - gfp_t gfp_mask) -{ - struct biovec_slab *bvs = biovec_slab(*nr_vecs); - - if (WARN_ON_ONCE(!bvs)) - return NULL; - - /* - * Upgrade the nr_vecs request to take full advantage of the allocation. - * We also rely on this in the bvec_free path. - */ - *nr_vecs = bvs->nr_vecs; - - /* - * Try a slab allocation first for all smaller allocations. If that - * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. - * The mempool is sized to handle up to BIO_MAX_VECS entries. - */ - if (*nr_vecs < BIO_MAX_VECS) { - struct bio_vec *bvl; - - bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); - if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) - return bvl; - *nr_vecs = BIO_MAX_VECS; - } - - return mempool_alloc(pool, gfp_mask); -} - void bio_uninit(struct bio *bio) { #ifdef CONFIG_BLK_CGROUP @@ -433,13 +402,31 @@ static void bio_alloc_rescue(struct work_struct *work) } } +/* + * submit_bio_noacct() converts recursion to iteration; this means if we're + * running beneath it, any bios we allocate and submit will not be submitted + * (and thus freed) until after we return. + * + * This exposes us to a potential deadlock if we allocate multiple bios from the + * same bio_set while running underneath submit_bio_noacct(). If we were to + * allocate multiple bios (say a stacking block driver that was splitting bios), + * we would deadlock if we exhausted the mempool's reserve. + * + * We solve this, and guarantee forward progress by punting the bios on + * current->bio_list to a per bio_set rescuer workqueue before blocking to wait + * for elements being returned to the mempool. + */ static void punt_bios_to_rescuer(struct bio_set *bs) { struct bio_list punt, nopunt; struct bio *bio; - if (WARN_ON_ONCE(!bs->rescue_workqueue)) + if (!current->bio_list || !bs->rescue_workqueue) return; + if (bio_list_empty(¤t->bio_list[0]) && + bio_list_empty(¤t->bio_list[1])) + return; + /* * In order to guarantee forward progress we must punt only bios that * were allocated from this bio_set; otherwise, if there was a bio on @@ -486,9 +473,7 @@ static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache) local_irq_restore(flags); } -static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, - unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp, - struct bio_set *bs) +static struct bio *bio_alloc_percpu_cache(struct bio_set *bs) { struct bio_alloc_cache *cache; struct bio *bio; @@ -506,11 +491,6 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, cache->free_list = bio->bi_next; cache->nr--; put_cpu(); - - if (nr_vecs) - bio_init_inline(bio, bdev, nr_vecs, opf); - else - bio_init(bio, bdev, NULL, nr_vecs, opf); bio->bi_pool = bs; return bio; } @@ -520,7 +500,7 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, * @bdev: block device to allocate the bio for (can be %NULL) * @nr_vecs: number of bvecs to pre-allocate * @opf: operation and flags for bio - * @gfp_mask: the GFP_* mask given to the slab allocator + * @gfp: the GFP_* mask given to the slab allocator * @bs: the bio_set to allocate from. * * Allocate a bio from the mempools in @bs. @@ -550,91 +530,77 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, * Returns: Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, - blk_opf_t opf, gfp_t gfp_mask, - struct bio_set *bs) + blk_opf_t opf, gfp_t gfp, struct bio_set *bs) { - gfp_t saved_gfp = gfp_mask; - struct bio *bio; + struct bio_vec *bvecs = NULL; + struct bio *bio = NULL; + gfp_t saved_gfp = gfp; void *p; /* should not use nobvec bioset for nr_vecs > 0 */ if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0)) return NULL; + gfp = try_alloc_gfp(gfp); if (bs->cache && nr_vecs <= BIO_INLINE_VECS) { - opf |= REQ_ALLOC_CACHE; - bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf, - gfp_mask, bs); - if (bio) - return bio; /* - * No cached bio available, bio returned below marked with - * REQ_ALLOC_CACHE to participate in per-cpu alloc cache. + * Set REQ_ALLOC_CACHE even if no cached bio is available to + * return the allocated bio to the percpu cache when done. */ - } else + opf |= REQ_ALLOC_CACHE; + bio = bio_alloc_percpu_cache(bs); + } else { opf &= ~REQ_ALLOC_CACHE; - - /* - * submit_bio_noacct() converts recursion to iteration; this means if - * we're running beneath it, any bios we allocate and submit will not be - * submitted (and thus freed) until after we return. - * - * This exposes us to a potential deadlock if we allocate multiple bios - * from the same bio_set() while running underneath submit_bio_noacct(). - * If we were to allocate multiple bios (say a stacking block driver - * that was splitting bios), we would deadlock if we exhausted the - * mempool's reserve. - * - * We solve this, and guarantee forward progress, with a rescuer - * workqueue per bio_set. If we go to allocate and there are bios on - * current->bio_list, we first try the allocation without - * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be - * blocking to the rescuer workqueue before we retry with the original - * gfp_flags. - */ - if (current->bio_list && - (!bio_list_empty(¤t->bio_list[0]) || - !bio_list_empty(¤t->bio_list[1])) && - bs->rescue_workqueue) - gfp_mask &= ~__GFP_DIRECT_RECLAIM; - - p = mempool_alloc(&bs->bio_pool, gfp_mask); - if (!p && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - p = mempool_alloc(&bs->bio_pool, gfp_mask); + p = kmem_cache_alloc(bs->bio_slab, gfp); + if (p) + bio = p + bs->front_pad; } - if (unlikely(!p)) - return NULL; - if (!mempool_is_saturated(&bs->bio_pool)) - opf &= ~REQ_ALLOC_CACHE; - bio = p + bs->front_pad; - if (nr_vecs > BIO_INLINE_VECS) { - struct bio_vec *bvl = NULL; + if (bio && nr_vecs > BIO_INLINE_VECS) { + struct biovec_slab *bvs = biovec_slab(nr_vecs); - bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); - if (!bvl && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); + /* + * Upgrade nr_vecs to take full advantage of the allocation. + * We also rely on this in bvec_free(). + */ + nr_vecs = bvs->nr_vecs; + bvecs = kmem_cache_alloc(bvs->slab, gfp); + if (unlikely(!bvecs)) { + kmem_cache_free(bs->bio_slab, p); + bio = NULL; } - if (unlikely(!bvl)) - goto err_free; + } - bio_init(bio, bdev, bvl, nr_vecs, opf); - } else if (nr_vecs) { - bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf); - } else { - bio_init(bio, bdev, NULL, 0, opf); + if (unlikely(!bio)) { + /* + * Give up if we are not allow to sleep as non-blocking mempool + * allocations just go back to the slab allocation. + */ + if (!(saved_gfp & __GFP_DIRECT_RECLAIM)) + return NULL; + + punt_bios_to_rescuer(bs); + + /* + * Don't rob the mempools by returning to the per-CPU cache if + * we're tight on memory. + */ + opf &= ~REQ_ALLOC_CACHE; + + p = mempool_alloc(&bs->bio_pool, gfp); + bio = p + bs->front_pad; + if (nr_vecs > BIO_INLINE_VECS) { + nr_vecs = BIO_MAX_VECS; + bvecs = mempool_alloc(&bs->bvec_pool, gfp); + } } + if (nr_vecs && nr_vecs <= BIO_INLINE_VECS) + bio_init_inline(bio, bdev, nr_vecs, opf); + else + bio_init(bio, bdev, bvecs, nr_vecs, opf); bio->bi_pool = bs; return bio; - -err_free: - mempool_free(p, &bs->bio_pool); - return NULL; } EXPORT_SYMBOL(bio_alloc_bioset); diff --git a/include/linux/bio.h b/include/linux/bio.h index 9693a0d6fefe..984844d2870b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -350,8 +350,7 @@ extern void bioset_exit(struct bio_set *); extern int biovec_init_pool(mempool_t *pool, int pool_entries); struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, - blk_opf_t opf, gfp_t gfp_mask, - struct bio_set *bs); + blk_opf_t opf, gfp_t gfp, struct bio_set *bs); struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask); extern void bio_put(struct bio *); -- cgit v1.2.3 From 10febd397591d93f42adb743c2c664041e7f1bcb Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Thu, 12 Mar 2026 04:44:30 +0000 Subject: sched/topology: Remove sched_domain_shared allocation with sd_data Now that "sd->shared" assignments are using the sched_domain_shared objects allocated with s_data, remove the sd_data based allocations. Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://patch.msgid.link/20260312044434.1974-6-kprateek.nayak@amd.com --- include/linux/sched/topology.h | 1 - kernel/sched/topology.c | 19 ------------------- 2 files changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index a1e1032426dc..51c29581f15e 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -172,7 +172,6 @@ typedef int (*sched_domain_flags_f)(void); struct sd_data { struct sched_domain *__percpu *sd; - struct sched_domain_shared *__percpu *sds; struct sched_group *__percpu *sg; struct sched_group_capacity *__percpu *sgc; }; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b19d84f44669..43150591914b 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1609,9 +1609,6 @@ static void claim_allocations(int cpu, struct s_data *d) WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); *per_cpu_ptr(sdd->sd, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) - *per_cpu_ptr(sdd->sds, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; @@ -2390,10 +2387,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sd) return -ENOMEM; - sdd->sds = alloc_percpu(struct sched_domain_shared *); - if (!sdd->sds) - return -ENOMEM; - sdd->sg = alloc_percpu(struct sched_group *); if (!sdd->sg) return -ENOMEM; @@ -2404,7 +2397,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map) for_each_cpu(j, cpu_map) { struct sched_domain *sd; - struct sched_domain_shared *sds; struct sched_group *sg; struct sched_group_capacity *sgc; @@ -2415,13 +2407,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sd, j) = sd; - sds = kzalloc_node(sizeof(struct sched_domain_shared), - GFP_KERNEL, cpu_to_node(j)); - if (!sds) - return -ENOMEM; - - *per_cpu_ptr(sdd->sds, j) = sds; - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sg) @@ -2463,8 +2448,6 @@ static void __sdt_free(const struct cpumask *cpu_map) kfree(*per_cpu_ptr(sdd->sd, j)); } - if (sdd->sds) - kfree(*per_cpu_ptr(sdd->sds, j)); if (sdd->sg) kfree(*per_cpu_ptr(sdd->sg, j)); if (sdd->sgc) @@ -2472,8 +2455,6 @@ static void __sdt_free(const struct cpumask *cpu_map) } free_percpu(sdd->sd); sdd->sd = NULL; - free_percpu(sdd->sds); - sdd->sds = NULL; free_percpu(sdd->sg); sdd->sg = NULL; free_percpu(sdd->sgc); -- cgit v1.2.3 From 8ca12326f592f7554acf2788ecb1c5c954dcf31c Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 16 Mar 2026 00:36:22 +0100 Subject: PM: EM: Switch to rcu_dereference_all() in wakeup path em_cpu_energy() is part of the EAS (Fair) task wakeup path. Now that rcu_read_{,un}lock() have been removed from find_energy_efficient_cpu() switch to rcu_dereference_all() and check for rcu_read_lock_any_held() in em_cpu_energy() as well. In EAS (Fair) task wakeup path is a preempt/IRQ disabled region, so rcu_read_{,un}lock() can be removed. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Link: https://patch.msgid.link/5b1228b7-5949-4a45-9f62-e8ce936de694@arm.com --- include/linux/energy_model.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index e7497f804644..c909a8ba22e8 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -248,7 +248,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, struct em_perf_state *ps; int i; - WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n"); + lockdep_assert(rcu_read_lock_any_held()); if (!sum_util) return 0; @@ -267,7 +267,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * Find the lowest performance state of the Energy Model above the * requested performance. */ - em_table = rcu_dereference(pd->em_table); + em_table = rcu_dereference_all(pd->em_table); i = em_pd_get_efficient_state(em_table->state, pd, max_util); ps = &em_table->state[i]; -- cgit v1.2.3 From b7560798466a07d9c3fb011698e92c335ab28baf Mon Sep 17 00:00:00 2001 From: Devendra K Verma Date: Wed, 18 Mar 2026 12:34:03 +0530 Subject: dmaengine: dw-edma: Add non-LL mode AMD MDB IP supports Linked List (LL) mode as well as non-LL mode. The current code does not have the mechanisms to enable the DMA transactions using the non-LL mode. The following two cases are added with this patch: - For the AMD (Xilinx) only, when a valid physical base address of the device side DDR is not configured, then the IP can still be used in non-LL mode. For all the channels DMA transactions will be using the non-LL mode only. This, the default non-LL mode, is not applicable for Synopsys IP with the current code addition. - If the default mode is LL-mode, for both AMD (Xilinx) and Synosys, and if user wants to use non-LL mode then user can do so via configuring the peripheral_config param of dma_slave_config. Signed-off-by: Devendra K Verma Reviewed-by: Frank Li Link: https://patch.msgid.link/20260318070403.1634706-3-devendra.verma@amd.com Signed-off-by: Vinod Koul --- drivers/dma/dw-edma/dw-edma-core.c | 47 ++++++++++++++++++++++++- drivers/dma/dw-edma/dw-edma-core.h | 1 + drivers/dma/dw-edma/dw-edma-pcie.c | 44 +++++++++++++++++------- drivers/dma/dw-edma/dw-hdma-v0-core.c | 64 ++++++++++++++++++++++++++++++++++- drivers/dma/dw-edma/dw-hdma-v0-regs.h | 1 + include/linux/dma/edma.h | 1 + 6 files changed, 143 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c index 94bcbd560143..c2feb3adc79f 100644 --- a/drivers/dma/dw-edma/dw-edma-core.c +++ b/drivers/dma/dw-edma/dw-edma-core.c @@ -223,6 +223,43 @@ static int dw_edma_device_config(struct dma_chan *dchan, struct dma_slave_config *config) { struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan); + bool cfg_non_ll; + int non_ll = 0; + + chan->non_ll = false; + if (chan->dw->chip->mf == EDMA_MF_HDMA_NATIVE) { + if (config->peripheral_config && + config->peripheral_size != sizeof(int)) { + dev_err(dchan->device->dev, + "config param peripheral size mismatch\n"); + return -EINVAL; + } + + /* + * When there is no valid LLP base address available then the + * default DMA ops will use the non-LL mode. + * + * Cases where LL mode is enabled and client wants to use the + * non-LL mode then also client can do so via providing the + * peripheral_config param. + */ + cfg_non_ll = chan->dw->chip->cfg_non_ll; + if (config->peripheral_config) { + non_ll = *(int *)config->peripheral_config; + + if (cfg_non_ll && !non_ll) { + dev_err(dchan->device->dev, "invalid configuration\n"); + return -EINVAL; + } + } + + if (cfg_non_ll || non_ll) + chan->non_ll = true; + } else if (config->peripheral_config) { + dev_err(dchan->device->dev, + "peripheral config param applicable only for HDMA\n"); + return -EINVAL; + } memcpy(&chan->config, config, sizeof(*config)); chan->configured = true; @@ -358,6 +395,7 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer) struct dw_edma_desc *desc; u64 src_addr, dst_addr; size_t fsz = 0; + u32 bursts_max; u32 cnt = 0; int i; @@ -415,6 +453,13 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer) return NULL; } + /* + * For non-LL mode, only a single burst can be handled + * in a single chunk unlike LL mode where multiple bursts + * can be configured in a single chunk. + */ + bursts_max = chan->non_ll ? 1 : chan->ll_max; + desc = dw_edma_alloc_desc(chan); if (unlikely(!desc)) goto err_alloc; @@ -450,7 +495,7 @@ dw_edma_device_transfer(struct dw_edma_transfer *xfer) if (xfer->type == EDMA_XFER_SCATTER_GATHER && !sg) break; - if (chunk->bursts_alloc == chan->ll_max) { + if (chunk->bursts_alloc == bursts_max) { chunk = dw_edma_alloc_chunk(desc); if (unlikely(!chunk)) goto err_alloc; diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h index 59b24973fa7d..902574b1ba86 100644 --- a/drivers/dma/dw-edma/dw-edma-core.h +++ b/drivers/dma/dw-edma/dw-edma-core.h @@ -86,6 +86,7 @@ struct dw_edma_chan { u8 configured; struct dma_slave_config config; + bool non_ll; }; struct dw_edma_irq { diff --git a/drivers/dma/dw-edma/dw-edma-pcie.c b/drivers/dma/dw-edma/dw-edma-pcie.c index 0cb5850ca411..0b30ce138503 100644 --- a/drivers/dma/dw-edma/dw-edma-pcie.c +++ b/drivers/dma/dw-edma/dw-edma-pcie.c @@ -295,6 +295,15 @@ static void dw_edma_pcie_get_xilinx_dma_data(struct pci_dev *pdev, pdata->devmem_phys_off = off; } +static u64 dw_edma_get_phys_addr(struct pci_dev *pdev, + struct dw_edma_pcie_data *pdata, + enum pci_barno bar) +{ + if (pdev->vendor == PCI_VENDOR_ID_XILINX) + return pdata->devmem_phys_off; + return pci_bus_address(pdev, bar); +} + static int dw_edma_pcie_probe(struct pci_dev *pdev, const struct pci_device_id *pid) { @@ -303,6 +312,7 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, struct dw_edma_chip *chip; int err, nr_irqs; int i, mask; + bool non_ll = false; struct dw_edma_pcie_data *vsec_data __free(kfree) = kmalloc_obj(*vsec_data); @@ -329,21 +339,24 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, /* * There is no valid address found for the LL memory - * space on the device side. + * space on the device side. In the absence of LL base + * address use the non-LL mode or simple mode supported by + * the HDMA IP. */ if (vsec_data->devmem_phys_off == DW_PCIE_XILINX_MDB_INVALID_ADDR) - return -ENOMEM; + non_ll = true; /* * Configure the channel LL and data blocks if number of * channels enabled in VSEC capability are more than the * channels configured in xilinx_mdb_data. */ - dw_edma_set_chan_region_offset(vsec_data, BAR_2, 0, - DW_PCIE_XILINX_MDB_LL_OFF_GAP, - DW_PCIE_XILINX_MDB_LL_SIZE, - DW_PCIE_XILINX_MDB_DT_OFF_GAP, - DW_PCIE_XILINX_MDB_DT_SIZE); + if (!non_ll) + dw_edma_set_chan_region_offset(vsec_data, BAR_2, 0, + DW_PCIE_XILINX_MDB_LL_OFF_GAP, + DW_PCIE_XILINX_MDB_LL_SIZE, + DW_PCIE_XILINX_MDB_DT_OFF_GAP, + DW_PCIE_XILINX_MDB_DT_SIZE); } /* Mapping PCI BAR regions */ @@ -391,6 +404,7 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, chip->mf = vsec_data->mf; chip->nr_irqs = nr_irqs; chip->ops = &dw_edma_pcie_plat_ops; + chip->cfg_non_ll = non_ll; chip->ll_wr_cnt = vsec_data->wr_ch_cnt; chip->ll_rd_cnt = vsec_data->rd_ch_cnt; @@ -399,7 +413,7 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, if (!chip->reg_base) return -ENOMEM; - for (i = 0; i < chip->ll_wr_cnt; i++) { + for (i = 0; i < chip->ll_wr_cnt && !non_ll; i++) { struct dw_edma_region *ll_region = &chip->ll_region_wr[i]; struct dw_edma_region *dt_region = &chip->dt_region_wr[i]; struct dw_edma_block *ll_block = &vsec_data->ll_wr[i]; @@ -410,7 +424,8 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, return -ENOMEM; ll_region->vaddr.io += ll_block->off; - ll_region->paddr = pci_bus_address(pdev, ll_block->bar); + ll_region->paddr = dw_edma_get_phys_addr(pdev, vsec_data, + ll_block->bar); ll_region->paddr += ll_block->off; ll_region->sz = ll_block->sz; @@ -419,12 +434,13 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, return -ENOMEM; dt_region->vaddr.io += dt_block->off; - dt_region->paddr = pci_bus_address(pdev, dt_block->bar); + dt_region->paddr = dw_edma_get_phys_addr(pdev, vsec_data, + dt_block->bar); dt_region->paddr += dt_block->off; dt_region->sz = dt_block->sz; } - for (i = 0; i < chip->ll_rd_cnt; i++) { + for (i = 0; i < chip->ll_rd_cnt && !non_ll; i++) { struct dw_edma_region *ll_region = &chip->ll_region_rd[i]; struct dw_edma_region *dt_region = &chip->dt_region_rd[i]; struct dw_edma_block *ll_block = &vsec_data->ll_rd[i]; @@ -435,7 +451,8 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, return -ENOMEM; ll_region->vaddr.io += ll_block->off; - ll_region->paddr = pci_bus_address(pdev, ll_block->bar); + ll_region->paddr = dw_edma_get_phys_addr(pdev, vsec_data, + ll_block->bar); ll_region->paddr += ll_block->off; ll_region->sz = ll_block->sz; @@ -444,7 +461,8 @@ static int dw_edma_pcie_probe(struct pci_dev *pdev, return -ENOMEM; dt_region->vaddr.io += dt_block->off; - dt_region->paddr = pci_bus_address(pdev, dt_block->bar); + dt_region->paddr = dw_edma_get_phys_addr(pdev, vsec_data, + dt_block->bar); dt_region->paddr += dt_block->off; dt_region->sz = dt_block->sz; } diff --git a/drivers/dma/dw-edma/dw-hdma-v0-core.c b/drivers/dma/dw-edma/dw-hdma-v0-core.c index 8f75934e09d0..632abb8b481c 100644 --- a/drivers/dma/dw-edma/dw-hdma-v0-core.c +++ b/drivers/dma/dw-edma/dw-hdma-v0-core.c @@ -225,7 +225,7 @@ static void dw_hdma_v0_sync_ll_data(struct dw_edma_chunk *chunk) readl(chunk->ll_region.vaddr.io); } -static void dw_hdma_v0_core_start(struct dw_edma_chunk *chunk, bool first) +static void dw_hdma_v0_core_ll_start(struct dw_edma_chunk *chunk, bool first) { struct dw_edma_chan *chan = chunk->chan; struct dw_edma *dw = chan->dw; @@ -263,6 +263,68 @@ static void dw_hdma_v0_core_start(struct dw_edma_chunk *chunk, bool first) SET_CH_32(dw, chan->dir, chan->id, doorbell, HDMA_V0_DOORBELL_START); } +static void dw_hdma_v0_core_non_ll_start(struct dw_edma_chunk *chunk) +{ + struct dw_edma_chan *chan = chunk->chan; + struct dw_edma *dw = chan->dw; + struct dw_edma_burst *child; + u32 val; + + child = list_first_entry_or_null(&chunk->burst->list, + struct dw_edma_burst, list); + if (!child) + return; + + SET_CH_32(dw, chan->dir, chan->id, ch_en, HDMA_V0_CH_EN); + + /* Source address */ + SET_CH_32(dw, chan->dir, chan->id, sar.lsb, + lower_32_bits(child->sar)); + SET_CH_32(dw, chan->dir, chan->id, sar.msb, + upper_32_bits(child->sar)); + + /* Destination address */ + SET_CH_32(dw, chan->dir, chan->id, dar.lsb, + lower_32_bits(child->dar)); + SET_CH_32(dw, chan->dir, chan->id, dar.msb, + upper_32_bits(child->dar)); + + /* Transfer size */ + SET_CH_32(dw, chan->dir, chan->id, transfer_size, child->sz); + + /* Interrupt setup */ + val = GET_CH_32(dw, chan->dir, chan->id, int_setup) | + HDMA_V0_STOP_INT_MASK | + HDMA_V0_ABORT_INT_MASK | + HDMA_V0_LOCAL_STOP_INT_EN | + HDMA_V0_LOCAL_ABORT_INT_EN; + + if (!(dw->chip->flags & DW_EDMA_CHIP_LOCAL)) { + val |= HDMA_V0_REMOTE_STOP_INT_EN | + HDMA_V0_REMOTE_ABORT_INT_EN; + } + + SET_CH_32(dw, chan->dir, chan->id, int_setup, val); + + /* Channel control setup */ + val = GET_CH_32(dw, chan->dir, chan->id, control1); + val &= ~HDMA_V0_LINKLIST_EN; + SET_CH_32(dw, chan->dir, chan->id, control1, val); + + SET_CH_32(dw, chan->dir, chan->id, doorbell, + HDMA_V0_DOORBELL_START); +} + +static void dw_hdma_v0_core_start(struct dw_edma_chunk *chunk, bool first) +{ + struct dw_edma_chan *chan = chunk->chan; + + if (chan->non_ll) + dw_hdma_v0_core_non_ll_start(chunk); + else + dw_hdma_v0_core_ll_start(chunk, first); +} + static void dw_hdma_v0_core_ch_config(struct dw_edma_chan *chan) { struct dw_edma *dw = chan->dw; diff --git a/drivers/dma/dw-edma/dw-hdma-v0-regs.h b/drivers/dma/dw-edma/dw-hdma-v0-regs.h index eab5fd7177e5..7759ba9b4850 100644 --- a/drivers/dma/dw-edma/dw-hdma-v0-regs.h +++ b/drivers/dma/dw-edma/dw-hdma-v0-regs.h @@ -12,6 +12,7 @@ #include #define HDMA_V0_MAX_NR_CH 8 +#define HDMA_V0_CH_EN BIT(0) #define HDMA_V0_LOCAL_ABORT_INT_EN BIT(6) #define HDMA_V0_REMOTE_ABORT_INT_EN BIT(5) #define HDMA_V0_LOCAL_STOP_INT_EN BIT(4) diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h index 9da53c75e49b..1fafd5b0e315 100644 --- a/include/linux/dma/edma.h +++ b/include/linux/dma/edma.h @@ -103,6 +103,7 @@ struct dw_edma_chip { enum dw_edma_map_format mf; struct dw_edma *dw; + bool cfg_non_ll; }; /* Export to the platform drivers */ -- cgit v1.2.3 From de9e2b3d88af36411301c049a1b049f3e4fe0757 Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Tue, 3 Mar 2026 21:24:17 +0200 Subject: uapi: Provide DIV_ROUND_CLOSEST() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently DIV_ROUND_CLOSEST() is only available for the kernel via include/linux/math.h. Expose it to userland as well by adding __KERNEL_DIV_ROUND_CLOSEST() as a common definition in uapi. Additionally, ensure it allows building ISO C applications by switching from the 'typeof' GNU extension to the ISO-friendly __typeof__. Reviewed-by: Nícolas F. R. A. Prado Tested-by: Diederik de Haas Acked-by: Andy Shevchenko Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Cristian Ciocaltea Link: https://patch.msgid.link/20260303-rk3588-bgcolor-v8-1-fee377037ad1@collabora.com Signed-off-by: Daniel Stone --- include/linux/math.h | 18 +----------------- include/uapi/linux/const.h | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/math.h b/include/linux/math.h index 6dc1d1d32fbc..1e8fb3efbc8c 100644 --- a/include/linux/math.h +++ b/include/linux/math.h @@ -89,23 +89,7 @@ } \ ) -/* - * Divide positive or negative dividend by positive or negative divisor - * and round to closest integer. Result is undefined for negative - * divisors if the dividend variable type is unsigned and for negative - * dividends if the divisor variable type is unsigned. - */ -#define DIV_ROUND_CLOSEST(x, divisor)( \ -{ \ - typeof(x) __x = x; \ - typeof(divisor) __d = divisor; \ - (((typeof(x))-1) > 0 || \ - ((typeof(divisor))-1) > 0 || \ - (((__x) > 0) == ((__d) > 0))) ? \ - (((__x) + ((__d) / 2)) / (__d)) : \ - (((__x) - ((__d) / 2)) / (__d)); \ -} \ -) +#define DIV_ROUND_CLOSEST __KERNEL_DIV_ROUND_CLOSEST /* * Same as above but for u64 dividends. divisor must be a 32-bit * number. diff --git a/include/uapi/linux/const.h b/include/uapi/linux/const.h index b8f629ef135f..565f309b9df8 100644 --- a/include/uapi/linux/const.h +++ b/include/uapi/linux/const.h @@ -50,4 +50,22 @@ #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +/* + * Divide positive or negative dividend by positive or negative divisor + * and round to closest integer. Result is undefined for negative + * divisors if the dividend variable type is unsigned and for negative + * dividends if the divisor variable type is unsigned. + */ +#define __KERNEL_DIV_ROUND_CLOSEST(x, divisor) \ +({ \ + __typeof__(x) __x = x; \ + __typeof__(divisor) __d = divisor; \ + \ + (((__typeof__(x))-1) > 0 || \ + ((__typeof__(divisor))-1) > 0 || \ + (((__x) > 0) == ((__d) > 0))) ? \ + (((__x) + ((__d) / 2)) / (__d)) : \ + (((__x) - ((__d) / 2)) / (__d)); \ +}) + #endif /* _UAPI_LINUX_CONST_H */ -- cgit v1.2.3 From f8a5f6934f30b9ee334256347dd70a7ba0f8be7f Mon Sep 17 00:00:00 2001 From: Aldo Conte Date: Wed, 11 Mar 2026 17:33:20 +0100 Subject: usb: typec: Document priority and mode_selection fields in struct typec_altmode The fields 'priority' and 'mode_selection' in struct typec_altmode are missing from the kernel-doc comment, which results in warnings when building the documentation with 'make htmldocs'. WARNING: ./include/linux/usb/typec_altmode.h:44 struct member 'priority' not described in 'typec_altmode' WARNING: ./include/linux/usb/typec_altmode.h:44 struct member 'mode_selection' not described in 'typec_altmode' Document both fields to keep the kernel-doc comment aligned with the structure definition. Signed-off-by: Aldo Conte Reviewed-by: Heikki Krogerus Link: https://patch.msgid.link/20260311163320.61534-1-aldocontelk@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec_altmode.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/typec_altmode.h b/include/linux/usb/typec_altmode.h index 0513d333b797..b90cc5cfff8d 100644 --- a/include/linux/usb/typec_altmode.h +++ b/include/linux/usb/typec_altmode.h @@ -26,6 +26,9 @@ struct typec_altmode_ops; * @mode: Index of the Mode * @vdo: VDO returned by Discover Modes USB PD command * @active: Tells has the mode been entered or not + * @priority: Priority used by the automatic alternate mode selection process + * @mode_selection: Whether entry to this alternate mode is managed by the + * automatic alternate mode selection process or by the specific driver * @desc: Optional human readable description of the mode * @ops: Operations vector from the driver * @cable_ops: Cable operations vector from the driver. -- cgit v1.2.3 From a43dd4f6f91ed1a1d16595cb0c550b283e9b2298 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 16 Mar 2026 15:03:00 +0000 Subject: power: supply: Add PD SPR AVS support to USB type enum Add two new members to the power_supply_usb_type to represent the USB Power Delivery (PD) Standard Power Range (SPR) Adjustable Voltage Supply (AVS) charging types: POWER_SUPPLY_USB_TYPE_PD_SPR_AVS: For devices supporting only the PD SPR AVS type. POWER_SUPPLY_USB_TYPE_PD_PPS_SPR_AVS: For devices that support both PD Programmable Power Supply (PPS) and PD SPR AVS. Signed-off-by: Badhri Jagan Sridharan Link: https://patch.msgid.link/20260316150301.3892223-3-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-class-power | 3 ++- drivers/power/supply/power_supply_sysfs.c | 2 ++ include/linux/power_supply.h | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power index 4b21d5d23251..32697b926cc8 100644 --- a/Documentation/ABI/testing/sysfs-class-power +++ b/Documentation/ABI/testing/sysfs-class-power @@ -675,7 +675,8 @@ Description: Valid values: "Unknown", "SDP", "DCP", "CDP", "ACA", "C", "PD", - "PD_DRP", "PD_PPS", "BrickID" + "PD_DRP", "PD_PPS", "BrickID", "PD_SPR_AVS", + "PD_PPS_SPR_AVS" **Device Specific Properties** diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c index dd3a48d72d2b..f30a7b9ccd5e 100644 --- a/drivers/power/supply/power_supply_sysfs.c +++ b/drivers/power/supply/power_supply_sysfs.c @@ -70,6 +70,8 @@ static const char * const POWER_SUPPLY_USB_TYPE_TEXT[] = { [POWER_SUPPLY_USB_TYPE_PD] = "PD", [POWER_SUPPLY_USB_TYPE_PD_DRP] = "PD_DRP", [POWER_SUPPLY_USB_TYPE_PD_PPS] = "PD_PPS", + [POWER_SUPPLY_USB_TYPE_PD_SPR_AVS] = "PD_SPR_AVS", + [POWER_SUPPLY_USB_TYPE_PD_PPS_SPR_AVS] = "PD_PPS_SPR_AVS", [POWER_SUPPLY_USB_TYPE_APPLE_BRICK_ID] = "BrickID", }; diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index 360ffdf272da..7a5e4c3242a0 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -210,6 +210,9 @@ enum power_supply_usb_type { POWER_SUPPLY_USB_TYPE_PD, /* Power Delivery Port */ POWER_SUPPLY_USB_TYPE_PD_DRP, /* PD Dual Role Port */ POWER_SUPPLY_USB_TYPE_PD_PPS, /* PD Programmable Power Supply */ + /* PD Standard Power Range Adjustable Voltage Supply */ + POWER_SUPPLY_USB_TYPE_PD_SPR_AVS, + POWER_SUPPLY_USB_TYPE_PD_PPS_SPR_AVS, /* Supports both PD PPS + SPR AVS */ POWER_SUPPLY_USB_TYPE_APPLE_BRICK_ID, /* Apple Charging Method */ }; -- cgit v1.2.3 From d3d959404e6c72e09db8de8893a970edf0ac565d Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Mon, 16 Mar 2026 15:03:01 +0000 Subject: tcpm: Implement sink support for PD SPR AVS negotiation Add support to enable TCPM to negotiate with USB PD Standard Power Range Adjustable Voltage Supply (SPR AVS) when acting as a power sink. * Added support to the tcpm power supply properties, allowing userspace to enable and control the dynamic limits (voltage and current) specific to the SPR AVS contract. * Implemented tcpm_pd_select_spr_avs_apdo() to select the appropriate APDO and validate the requested voltage/current against both the Source and Sink capabilities. * Implemented tcpm_pd_build_spr_avs_request() to construct the Request Data Object (RDO) for SPR AVS. * Added SNK_NEGOTIATE_SPR_AVS_CAPABILITIES state to the state machine to handle negotiation for SPR AVS. * Updated the SNK_TRANSITION_SINK state to implement the SPR AVS-specific VBUS transition rules, including reducing current draw to PD_I_SNK_STBY_MA for large voltage changes, as required by USB PD spec. Log stub captured when enabling AVS: $ echo 3 > /sys/class/power_supply/tcpm-source-psy-1-0025/online $ cat /d/usb/tcpm-1-0025/log [ 358.895775] request to set AVS online [ 358.895792] AMS POWER_NEGOTIATION start [ 358.895806] state change SNK_READY -> AMS_START [rev3 POWER_NEGOTIATION] [ 358.895850] state change AMS_START -> SNK_NEGOTIATE_SPR_AVS_CAPABILITIES [rev3 POWER_NEGOTIATION] [ 358.895866] SPR AVS src_pdo_index:4 snk_pdo_index:2 req_op_curr_ma roundup:2200 req_out_volt_mv roundup:9000 [ 358.895880] Requesting APDO SPR AVS 4: 9000 mV, 2200 mA [ 358.896405] set_auto_vbus_discharge_threshold mode:0 pps_active:n vbus:0 pps_apdo_min_volt:0 ret:0 [ 358.896422] PD TX, header: 0x1a82 [ 358.900158] PD TX complete, status: 0 [ 358.900205] pending state change SNK_NEGOTIATE_SPR_AVS_CAPABILITIES -> HARD_RESET_SEND @ 60 ms [rev3 POWER_NEGOTIATION] [ 358.904832] PD RX, header: 0x1a3 [1] [ 358.904854] state change SNK_NEGOTIATE_SPR_AVS_CAPABILITIES -> SNK_TRANSITION_SINK [rev3 POWER_NEGOTIATION] [ 358.904888] pending state change SNK_TRANSITION_SINK -> HARD_RESET_SEND @ 700 ms [rev3 POWER_NEGOTIATION] [ 359.021530] PD RX, header: 0x3a6 [1] [ 359.021546] Setting voltage/current limit 9000 mV 2200 mA [ 359.023035] set_auto_vbus_discharge_threshold mode:3 pps_active:n vbus:9000 pps_apdo_min_volt:0 ret:0 [ 359.023053] state change SNK_TRANSITION_SINK -> SNK_READY [rev3 POWER_NEGOTIATION] [ 359.023090] AMS POWER_NEGOTIATION finished $ cat /sys/class/power_supply/tcpm-source-psy-1-0025/online 3 Log stub captured when increasing voltage: $ echo 9100000 > /sys/class/power_supply/tcpm-source-psy-1-0025/voltage_now $ cat /d/usb/tcpm-1-0025/log [ 632.116714] AMS POWER_NEGOTIATION start [ 632.116728] state change SNK_READY -> AMS_START [rev3 POWER_NEGOTIATION] [ 632.116779] state change AMS_START -> SNK_NEGOTIATE_SPR_AVS_CAPABILITIES [rev3 POWER_NEGOTIATION] [ 632.116798] SPR AVS src_pdo_index:4 snk_pdo_index:2 req_op_curr_ma roundup:2200 req_out_volt_mv roundup:9100 [ 632.116811] Requesting APDO SPR AVS 4: 9100 mV, 2200 mA [ 632.117315] set_auto_vbus_discharge_threshold mode:0 pps_active:n vbus:0 pps_apdo_min_volt:0 ret:0 [ 632.117328] PD TX, header: 0x1c82 [ 632.121007] PD TX complete, status: 0 [ 632.121052] pending state change SNK_NEGOTIATE_SPR_AVS_CAPABILITIES -> HARD_RESET_SEND @ 60 ms [rev3 POWER_NEGOTIATION] [ 632.124572] PD RX, header: 0x5a3 [1] [ 632.124594] state change SNK_NEGOTIATE_SPR_AVS_CAPABILITIES -> SNK_TRANSITION_SINK [rev3 POWER_NEGOTIATION] [ 632.124623] pending state change SNK_TRANSITION_SINK -> HARD_RESET_SEND @ 700 ms [rev3 POWER_NEGOTIATION] [ 632.149256] PD RX, header: 0x7a6 [1] [ 632.149271] Setting voltage/current limit 9100 mV 2200 mA [ 632.150770] set_auto_vbus_discharge_threshold mode:3 pps_active:n vbus:9100 pps_apdo_min_volt:0 ret:0 [ 632.150787] state change SNK_TRANSITION_SINK -> SNK_READY [rev3 POWER_NEGOTIATION] [ 632.150823] AMS POWER_NEGOTIATION finished $ cat /sys/class/power_supply/tcpm-source-psy-1-0025/voltage_now 9100000 Signed-off-by: Badhri Jagan Sridharan Reviewed-by: Amit Sunil Dhamne Acked-by: Heikki Krogerus Link: https://patch.msgid.link/20260316150301.3892223-4-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 611 +++++++++++++++++++++++++++++++++++------- include/linux/usb/pd.h | 32 ++- include/linux/usb/tcpm.h | 2 +- 3 files changed, 537 insertions(+), 108 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 63a75b94743d..dfbb94ddc98a 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -62,6 +62,7 @@ S(SNK_WAIT_CAPABILITIES_TIMEOUT), \ S(SNK_NEGOTIATE_CAPABILITIES), \ S(SNK_NEGOTIATE_PPS_CAPABILITIES), \ + S(SNK_NEGOTIATE_SPR_AVS_CAPABILITIES), \ S(SNK_TRANSITION_SINK), \ S(SNK_TRANSITION_SINK_VBUS), \ S(SNK_READY), \ @@ -308,6 +309,51 @@ struct pd_pps_data { bool active; }; +enum spr_avs_status { + SPR_AVS_UNKNOWN, + SPR_AVS_NOT_SUPPORTED, + SPR_AVS_SUPPORTED +}; + +static const char * const spr_avs_status_strings[] = { + [SPR_AVS_UNKNOWN] = "Unknown", + [SPR_AVS_SUPPORTED] = "Supported", + [SPR_AVS_NOT_SUPPORTED] = "Not Supported", +}; + +/* + * Standard Power Range Adjustable Voltage Supply (SPR - AVS) data + * @max_current_ma_9v_to_15v: Max current for 9V to 15V range derived from + * source cap & sink cap + * @max_current_ma_15v_to_20v: Max current for 15V to 20V range derived from + * source cap & sink cap + * @req_op_curr_ma: Requested operating current to the port partner acting as source + * @req_out_volt_mv: Requested output voltage to the port partner acting as source + * @max_out_volt_mv: Max SPR voltage supported by the port and the port partner + * @max_current_ma; MAX SPR current supported by the port and the port partner + * @port_partner_src_status: SPR AVS status of port partner acting as source + * @port_partner_src_pdo_index: PDO index of SPR AVS cap of the port partner + * acting as source. Valid only when + * port_partner_src_status is SPR_AVS_SUPPORTED. + * @port_snk_status: SPR AVS status of the local port acting as sink. + * @port_snk_pdo_index: PDO index of SPR AVS cap of local port acting as sink + * @active: True when the local port acting as the sink has negotiated SPR AVS + * with the partner acting as source. + */ +struct pd_spr_avs_data { + u32 max_current_ma_9v_to_15v; + u32 max_current_ma_15v_to_20v; + u32 req_op_curr_ma; + u32 req_out_volt_mv; + u32 max_out_volt_mv; + u32 max_current_ma; + enum spr_avs_status port_partner_src_status; + unsigned int port_partner_src_pdo_index; + enum spr_avs_status port_snk_status; + unsigned int port_snk_pdo_index; + bool active; +}; + struct pd_data { struct usb_power_delivery *pd; struct usb_power_delivery_capabilities *source_cap; @@ -376,6 +422,11 @@ struct sink_caps_ext_data { u8 spr_max_pdp; }; +enum aug_req_type { + PD_PPS, + PD_SPR_AVS, +}; + struct tcpm_port { struct device *dev; @@ -538,9 +589,14 @@ struct tcpm_port { /* PPS */ struct pd_pps_data pps_data; - struct completion pps_complete; - bool pps_pending; - int pps_status; + + /* SPR AVS */ + struct pd_spr_avs_data spr_avs_data; + + /* Augmented supply request - PPS; SPR_AVS */ + struct completion aug_supply_req_complete; + bool aug_supply_req_pending; + int aug_supply_req_status; /* Alternate mode data */ struct pd_mode_data mode_data; @@ -3285,6 +3341,7 @@ static void tcpm_pd_data_request(struct tcpm_port *port, switch (type) { case PD_DATA_SOURCE_CAP: + port->spr_avs_data.port_partner_src_status = SPR_AVS_UNKNOWN; for (i = 0; i < cnt; i++) port->source_caps[i] = le32_to_cpu(msg->payload[i]); @@ -3456,12 +3513,12 @@ static void tcpm_pd_data_request(struct tcpm_port *port, } } -static void tcpm_pps_complete(struct tcpm_port *port, int result) +static void tcpm_aug_supply_req_complete(struct tcpm_port *port, int result) { - if (port->pps_pending) { - port->pps_status = result; - port->pps_pending = false; - complete(&port->pps_complete); + if (port->aug_supply_req_pending) { + port->aug_supply_req_status = result; + port->aug_supply_req_pending = false; + complete(&port->aug_supply_req_complete); } } @@ -3559,7 +3616,7 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port, /* Revert data back from any requested PPS updates */ port->pps_data.req_out_volt = port->supply_voltage; port->pps_data.req_op_curr = port->current_limit; - port->pps_status = (type == PD_CTRL_WAIT ? + port->aug_supply_req_status = (type == PD_CTRL_WAIT ? -EAGAIN : -EOPNOTSUPP); /* Threshold was relaxed before sending Request. Restore it back. */ @@ -3567,6 +3624,20 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port, port->pps_data.active, port->supply_voltage); + tcpm_set_state(port, SNK_READY, 0); + break; + case SNK_NEGOTIATE_SPR_AVS_CAPABILITIES: + /* Revert data back from any requested SPR AVS updates */ + port->spr_avs_data.req_out_volt_mv = port->supply_voltage; + port->spr_avs_data.req_op_curr_ma = port->current_limit; + port->aug_supply_req_status = (type == PD_CTRL_WAIT ? + -EAGAIN : -EOPNOTSUPP); + + /* Threshold was relaxed before sending Request. Restore it back. */ + tcpm_set_auto_vbus_discharge_threshold(port, TYPEC_PWR_MODE_PD, + port->spr_avs_data.active, + port->supply_voltage); + tcpm_set_state(port, SNK_READY, 0); break; case DR_SWAP_SEND: @@ -3621,6 +3692,7 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port, switch (port->state) { case SNK_NEGOTIATE_CAPABILITIES: port->pps_data.active = false; + port->spr_avs_data.active = false; tcpm_set_state(port, SNK_TRANSITION_SINK, 0); break; case SNK_NEGOTIATE_PPS_CAPABILITIES: @@ -3633,6 +3705,13 @@ static void tcpm_pd_ctrl_request(struct tcpm_port *port, power_supply_changed(port->psy); tcpm_set_state(port, SNK_TRANSITION_SINK, 0); break; + case SNK_NEGOTIATE_SPR_AVS_CAPABILITIES: + port->spr_avs_data.active = true; + port->req_supply_voltage = port->spr_avs_data.req_out_volt_mv; + port->req_current_limit = port->spr_avs_data.req_op_curr_ma; + power_supply_changed(port->psy); + tcpm_set_state(port, SNK_TRANSITION_SINK, 0); + break; case SOFT_RESET_SEND: if (port->ams == SOFT_RESET_AMS) tcpm_ams_finish(port); @@ -4130,9 +4209,9 @@ static int tcpm_pd_select_pdo(struct tcpm_port *port, int *sink_pdo, case PDO_TYPE_APDO: if (pdo_apdo_type(pdo) == APDO_TYPE_PPS) { port->pps_data.supported = true; - port->usb_type = - POWER_SUPPLY_USB_TYPE_PD_PPS; - power_supply_changed(port->psy); + } else if (pdo_apdo_type(pdo) == APDO_TYPE_SPR_AVS) { + port->spr_avs_data.port_partner_src_status = SPR_AVS_SUPPORTED; + port->spr_avs_data.port_partner_src_pdo_index = i; } continue; default: @@ -4170,6 +4249,10 @@ static int tcpm_pd_select_pdo(struct tcpm_port *port, int *sink_pdo, min_snk_mv = pdo_min_voltage(pdo); break; case PDO_TYPE_APDO: + if (pdo_apdo_type(pdo) == APDO_TYPE_SPR_AVS) { + port->spr_avs_data.port_snk_status = SPR_AVS_SUPPORTED; + port->spr_avs_data.port_snk_pdo_index = j; + } continue; default: tcpm_log(port, "Invalid sink PDO type, ignoring"); @@ -4191,6 +4274,23 @@ static int tcpm_pd_select_pdo(struct tcpm_port *port, int *sink_pdo, } } + if (port->spr_avs_data.port_snk_status == SPR_AVS_UNKNOWN) + port->spr_avs_data.port_snk_status = SPR_AVS_NOT_SUPPORTED; + + if (port->spr_avs_data.port_partner_src_status == SPR_AVS_UNKNOWN) + port->spr_avs_data.port_partner_src_status = SPR_AVS_NOT_SUPPORTED; + + if (port->pps_data.supported && + port->spr_avs_data.port_partner_src_status == SPR_AVS_SUPPORTED) + port->usb_type = POWER_SUPPLY_USB_TYPE_PD_PPS_SPR_AVS; + else if (port->pps_data.supported) + port->usb_type = POWER_SUPPLY_USB_TYPE_PD_PPS; + else if (port->spr_avs_data.port_partner_src_status == SPR_AVS_SUPPORTED) + port->usb_type = POWER_SUPPLY_USB_TYPE_PD_SPR_AVS; + + if (port->usb_type != POWER_SUPPLY_USB_TYPE_PD) + power_supply_changed(port->psy); + return ret; } @@ -4241,6 +4341,88 @@ static unsigned int tcpm_pd_select_pps_apdo(struct tcpm_port *port) return src_pdo; } +static int tcpm_pd_select_spr_avs_apdo(struct tcpm_port *port) +{ + u32 req_out_volt_mv, req_op_curr_ma, src_max_curr_ma = 0, source_cap; + u32 snk_max_curr_ma = 0, src_pdo_index, snk_pdo_index, snk_pdo; + + if (port->spr_avs_data.port_snk_status != SPR_AVS_SUPPORTED || + port->spr_avs_data.port_partner_src_status != + SPR_AVS_SUPPORTED) { + tcpm_log(port, "SPR AVS not supported. port:%s partner:%s", + spr_avs_status_strings[port->spr_avs_data.port_snk_status], + spr_avs_status_strings[port->spr_avs_data.port_partner_src_status]); + return -EOPNOTSUPP; + } + + /* Round up to SPR_AVS_VOLT_MV_STEP */ + req_out_volt_mv = port->spr_avs_data.req_out_volt_mv; + if (req_out_volt_mv % SPR_AVS_VOLT_MV_STEP) { + req_out_volt_mv += SPR_AVS_VOLT_MV_STEP - + (req_out_volt_mv % SPR_AVS_VOLT_MV_STEP); + port->spr_avs_data.req_out_volt_mv = req_out_volt_mv; + } + + /* Round up to RDO_SPR_AVS_CURR_MA_STEP */ + req_op_curr_ma = port->spr_avs_data.req_op_curr_ma; + if (req_op_curr_ma % RDO_SPR_AVS_CURR_MA_STEP) { + req_op_curr_ma += RDO_SPR_AVS_CURR_MA_STEP - + (req_op_curr_ma % RDO_SPR_AVS_CURR_MA_STEP); + port->spr_avs_data.req_op_curr_ma = req_op_curr_ma; + } + + src_pdo_index = port->spr_avs_data.port_partner_src_pdo_index; + snk_pdo_index = port->spr_avs_data.port_snk_pdo_index; + source_cap = port->source_caps[src_pdo_index]; + snk_pdo = port->snk_pdo[snk_pdo_index]; + tcpm_log(port, + "SPR AVS src_pdo_index:%d snk_pdo_index:%d req_op_curr_ma roundup:%u req_out_volt_mv roundup:%u", + src_pdo_index, snk_pdo_index, req_op_curr_ma, req_out_volt_mv); + + if (req_out_volt_mv >= SPR_AVS_TIER1_MIN_VOLT_MV && + req_out_volt_mv <= SPR_AVS_TIER1_MAX_VOLT_MV) { + src_max_curr_ma = + pdo_spr_avs_apdo_9v_to_15v_max_current_ma(source_cap); + snk_max_curr_ma = + pdo_spr_avs_apdo_9v_to_15v_max_current_ma(snk_pdo); + } else if (req_out_volt_mv > SPR_AVS_TIER1_MAX_VOLT_MV && + req_out_volt_mv <= SPR_AVS_TIER2_MAX_VOLT_MV) { + src_max_curr_ma = + pdo_spr_avs_apdo_15v_to_20v_max_current_ma(source_cap); + snk_max_curr_ma = + pdo_spr_avs_apdo_15v_to_20v_max_current_ma(snk_pdo); + } else { + tcpm_log(port, "Invalid SPR AVS req_volt:%umV", req_out_volt_mv); + return -EINVAL; + } + + if (req_op_curr_ma > src_max_curr_ma || + req_op_curr_ma > snk_max_curr_ma) { + tcpm_log(port, + "Invalid SPR AVS request. req_volt:%umV req_curr:%umA src_max_cur:%umA snk_max_cur:%umA", + req_out_volt_mv, req_op_curr_ma, src_max_curr_ma, + snk_max_curr_ma); + return -EINVAL; + } + + /* Max SPR voltage based on both the port and the partner caps */ + if (pdo_spr_avs_apdo_15v_to_20v_max_current_ma(snk_pdo) && + pdo_spr_avs_apdo_15v_to_20v_max_current_ma(source_cap)) + port->spr_avs_data.max_out_volt_mv = SPR_AVS_TIER2_MAX_VOLT_MV; + else + port->spr_avs_data.max_out_volt_mv = SPR_AVS_TIER1_MAX_VOLT_MV; + + /* + * Max SPR AVS curr based on 9V to 15V. This should be higher than or + * equal to 15V to 20V range. + */ + port->spr_avs_data.max_current_ma = + min(pdo_spr_avs_apdo_9v_to_15v_max_current_ma(source_cap), + pdo_spr_avs_apdo_9v_to_15v_max_current_ma(snk_pdo)); + + return src_pdo_index; +} + static int tcpm_pd_build_request(struct tcpm_port *port, u32 *rdo) { unsigned int mv, ma, mw, flags; @@ -4408,13 +4590,74 @@ static int tcpm_pd_build_pps_request(struct tcpm_port *port, u32 *rdo) return 0; } -static int tcpm_pd_send_pps_request(struct tcpm_port *port) +static int tcpm_pd_build_spr_avs_request(struct tcpm_port *port, u32 *rdo) +{ + u32 out_mv, op_ma, flags, snk_pdo_index, source_cap; + unsigned int src_power_mw, snk_power_mw; + int src_pdo_index; + u32 snk_pdo; + + src_pdo_index = tcpm_pd_select_spr_avs_apdo(port); + if (src_pdo_index < 0) + return src_pdo_index; + snk_pdo_index = port->spr_avs_data.port_snk_pdo_index; + source_cap = port->source_caps[src_pdo_index]; + snk_pdo = port->snk_pdo[snk_pdo_index]; + out_mv = port->spr_avs_data.req_out_volt_mv; + op_ma = port->spr_avs_data.req_op_curr_ma; + + flags = RDO_USB_COMM | RDO_NO_SUSPEND; + + /* + * Set capability mismatch when the maximum power needs in the current + * requested AVS voltage tier range is greater than + * port->operating_snk_mw, however, the maximum power offered by the + * source at the current requested AVS voltage tier is less than + * port->operating_sink_mw. + */ + if (out_mv > SPR_AVS_TIER1_MAX_VOLT_MV) { + src_power_mw = + pdo_spr_avs_apdo_15v_to_20v_max_current_ma(source_cap) * + SPR_AVS_TIER2_MAX_VOLT_MV / 1000; + snk_power_mw = + pdo_spr_avs_apdo_15v_to_20v_max_current_ma(snk_pdo) * + SPR_AVS_TIER2_MAX_VOLT_MV / 1000; + } else { + src_power_mw = + pdo_spr_avs_apdo_9v_to_15v_max_current_ma(source_cap) * + SPR_AVS_TIER1_MAX_VOLT_MV / 1000; + snk_power_mw = + pdo_spr_avs_apdo_9v_to_15v_max_current_ma(snk_pdo) * + SPR_AVS_TIER1_MAX_VOLT_MV / 1000; + } + + if (snk_power_mw >= port->operating_snk_mw && + src_power_mw < port->operating_snk_mw) + flags |= RDO_CAP_MISMATCH; + + *rdo = RDO_AVS(src_pdo_index + 1, out_mv, op_ma, flags); + + tcpm_log(port, "Requesting APDO SPR AVS %d: %u mV, %u mA", + src_pdo_index, out_mv, op_ma); + + return 0; +} + +static int tcpm_pd_send_aug_supply_request(struct tcpm_port *port, + enum aug_req_type type) { struct pd_message msg; int ret; u32 rdo; - ret = tcpm_pd_build_pps_request(port, &rdo); + if (type == PD_PPS) { + ret = tcpm_pd_build_pps_request(port, &rdo); + } else if (type == PD_SPR_AVS) { + ret = tcpm_pd_build_spr_avs_request(port, &rdo); + } else { + tcpm_log(port, "Invalid aug_req_type %d", type); + ret = -EOPNOTSUPP; + } if (ret < 0) return ret; @@ -4637,6 +4880,14 @@ static void tcpm_set_partner_usb_comm_capable(struct tcpm_port *port, bool capab port->tcpc->set_partner_usb_comm_capable(port->tcpc, capable); } +static void tcpm_partner_source_caps_reset(struct tcpm_port *port) +{ + usb_power_delivery_unregister_capabilities(port->partner_source_caps); + port->partner_source_caps = NULL; + port->spr_avs_data.port_partner_src_status = SPR_AVS_UNKNOWN; + port->spr_avs_data.active = false; +} + static void tcpm_reset_port(struct tcpm_port *port) { tcpm_enable_auto_vbus_discharge(port, false); @@ -4676,8 +4927,7 @@ static void tcpm_reset_port(struct tcpm_port *port) usb_power_delivery_unregister_capabilities(port->partner_sink_caps); port->partner_sink_caps = NULL; - usb_power_delivery_unregister_capabilities(port->partner_source_caps); - port->partner_source_caps = NULL; + tcpm_partner_source_caps_reset(port); usb_power_delivery_unregister(port->partner_pd); port->partner_pd = NULL; } @@ -5169,7 +5419,7 @@ static void run_state_machine(struct tcpm_port *port) case SNK_UNATTACHED: if (!port->non_pd_role_swap) tcpm_swap_complete(port, -ENOTCONN); - tcpm_pps_complete(port, -ENOTCONN); + tcpm_aug_supply_req_complete(port, -ENOTCONN); tcpm_snk_detach(port); if (port->potential_contaminant) { tcpm_set_state(port, CHECK_CONTAMINANT, 0); @@ -5400,13 +5650,16 @@ static void run_state_machine(struct tcpm_port *port) } break; case SNK_NEGOTIATE_PPS_CAPABILITIES: - ret = tcpm_pd_send_pps_request(port); + case SNK_NEGOTIATE_SPR_AVS_CAPABILITIES: + ret = tcpm_pd_send_aug_supply_request(port, port->state == + SNK_NEGOTIATE_PPS_CAPABILITIES ? + PD_PPS : PD_SPR_AVS); if (ret < 0) { /* Restore back to the original state */ tcpm_set_auto_vbus_discharge_threshold(port, TYPEC_PWR_MODE_PD, port->pps_data.active, port->supply_voltage); - port->pps_status = ret; + port->aug_supply_req_status = ret; /* * If this was called due to updates to sink * capabilities, and pps is no longer valid, we should @@ -5422,23 +5675,58 @@ static void run_state_machine(struct tcpm_port *port) } break; case SNK_TRANSITION_SINK: - /* From the USB PD spec: - * "The Sink Shall transition to Sink Standby before a positive or - * negative voltage transition of VBUS. During Sink Standby - * the Sink Shall reduce its power draw to pSnkStdby." - * - * This is not applicable to PPS though as the port can continue - * to draw negotiated power without switching to standby. - */ - if (port->supply_voltage != port->req_supply_voltage && !port->pps_data.active && - port->current_limit * port->supply_voltage / 1000 > PD_P_SNK_STDBY_MW) { - u32 stdby_ma = PD_P_SNK_STDBY_MW * 1000 / port->supply_voltage; + if (port->spr_avs_data.active) { + if (abs(port->req_supply_voltage - port->supply_voltage) > + SPR_AVS_AVS_SMALL_STEP_V * 1000) { + /* + * The Sink Shall reduce its current draw to + * iSnkStdby within tSnkStdby. The reduction to + * iSnkStdby is not required if the voltage + * increase is less than or equal to + * vAvsSmallStep. + */ + tcpm_log(port, + "SPR AVS Setting iSnkstandby. Req vol: %u mV Curr vol: %u mV", + port->req_supply_voltage, + port->supply_voltage); + tcpm_set_current_limit(port, PD_I_SNK_STBY_MA, + port->supply_voltage); + } + /* + * Although tAvsSrcTransSmall is expected to be used + * for voltage transistions smaller than 1V, using + * tAvsSrcTransLarge to be resilient against chargers + * which strictly cannot honor tAvsSrcTransSmall to + * improve interoperability. + */ + tcpm_set_state(port, hard_reset_state(port), + PD_T_AVS_SRC_TRANS_LARGE); + /* + * From the USB PD spec: + * "The Sink Shall transition to Sink Standby before a + * positive ornegative voltage transition of VBUS. + * During Sink Standby the Sink Shall reduce its power + * draw to pSnkStdby." + * + * This is not applicable to PPS though as the port can + * continue to draw negotiated power without switching + * to standby. + */ + } else if (port->supply_voltage != port->req_supply_voltage && + !port->pps_data.active && + (port->current_limit * port->supply_voltage / 1000 > + PD_P_SNK_STDBY_MW)) { + u32 stdby_ma = PD_P_SNK_STDBY_MW * 1000 / + port->supply_voltage; tcpm_log(port, "Setting standby current %u mV @ %u mA", port->supply_voltage, stdby_ma); - tcpm_set_current_limit(port, stdby_ma, port->supply_voltage); + tcpm_set_current_limit(port, stdby_ma, + port->supply_voltage); + tcpm_set_state(port, hard_reset_state(port), + PD_T_PS_TRANSITION); } - fallthrough; + break; case SNK_TRANSITION_SINK_VBUS: tcpm_set_state(port, hard_reset_state(port), PD_T_PS_TRANSITION); @@ -5458,7 +5746,7 @@ static void run_state_machine(struct tcpm_port *port) tcpm_typec_connect(port); if (port->pd_capable && port->source_caps[0] & PDO_FIXED_DUAL_ROLE) mod_enable_frs_delayed_work(port, 0); - tcpm_pps_complete(port, port->pps_status); + tcpm_aug_supply_req_complete(port, port->aug_supply_req_status); if (port->ams != NONE_AMS) tcpm_ams_finish(port); @@ -5645,8 +5933,7 @@ static void run_state_machine(struct tcpm_port *port) port->message_id = 0; port->rx_msgid = -1; /* remove existing capabilities */ - usb_power_delivery_unregister_capabilities(port->partner_source_caps); - port->partner_source_caps = NULL; + tcpm_partner_source_caps_reset(port); tcpm_pd_send_control(port, PD_CTRL_ACCEPT, TCPC_TX_SOP); tcpm_ams_finish(port); if (port->pwr_role == TYPEC_SOURCE) { @@ -5679,8 +5966,7 @@ static void run_state_machine(struct tcpm_port *port) port->message_id = 0; port->rx_msgid = -1; /* remove existing capabilities */ - usb_power_delivery_unregister_capabilities(port->partner_source_caps); - port->partner_source_caps = NULL; + tcpm_partner_source_caps_reset(port); if (tcpm_pd_send_control(port, PD_CTRL_SOFT_RESET, TCPC_TX_SOP)) tcpm_set_state_cond(port, hard_reset_state(port), 0); else @@ -5817,8 +6103,7 @@ static void run_state_machine(struct tcpm_port *port) break; case PR_SWAP_SNK_SRC_SINK_OFF: /* will be source, remove existing capabilities */ - usb_power_delivery_unregister_capabilities(port->partner_source_caps); - port->partner_source_caps = NULL; + tcpm_partner_source_caps_reset(port); /* * Prevent vbus discharge circuit from turning on during PR_SWAP * as this is not a disconnect. @@ -5966,7 +6251,7 @@ static void run_state_machine(struct tcpm_port *port) break; case ERROR_RECOVERY: tcpm_swap_complete(port, -EPROTO); - tcpm_pps_complete(port, -EPROTO); + tcpm_aug_supply_req_complete(port, -EPROTO); tcpm_set_state(port, PORT_RESET, 0); break; case PORT_RESET: @@ -6940,7 +7225,7 @@ static int tcpm_try_role(struct typec_port *p, int role) return ret; } -static int tcpm_pps_set_op_curr(struct tcpm_port *port, u16 req_op_curr) +static int tcpm_aug_set_op_curr(struct tcpm_port *port, u16 req_op_curr_ma) { unsigned int target_mw; int ret; @@ -6948,7 +7233,19 @@ static int tcpm_pps_set_op_curr(struct tcpm_port *port, u16 req_op_curr) mutex_lock(&port->swap_lock); mutex_lock(&port->lock); - if (!port->pps_data.active) { + if (port->pps_data.active) { + req_op_curr_ma = req_op_curr_ma - + (req_op_curr_ma % RDO_PROG_CURR_MA_STEP); + if (req_op_curr_ma > port->pps_data.max_curr) { + ret = -EINVAL; + goto port_unlock; + } + target_mw = (req_op_curr_ma * port->supply_voltage) / 1000; + if (target_mw < port->operating_snk_mw) { + ret = -EINVAL; + goto port_unlock; + } + } else if (!port->spr_avs_data.active) { ret = -EOPNOTSUPP; goto port_unlock; } @@ -6958,38 +7255,31 @@ static int tcpm_pps_set_op_curr(struct tcpm_port *port, u16 req_op_curr) goto port_unlock; } - if (req_op_curr > port->pps_data.max_curr) { - ret = -EINVAL; - goto port_unlock; - } - - target_mw = (req_op_curr * port->supply_voltage) / 1000; - if (target_mw < port->operating_snk_mw) { - ret = -EINVAL; - goto port_unlock; - } + if (port->pps_data.active) + port->upcoming_state = SNK_NEGOTIATE_PPS_CAPABILITIES; + else + port->upcoming_state = SNK_NEGOTIATE_SPR_AVS_CAPABILITIES; - port->upcoming_state = SNK_NEGOTIATE_PPS_CAPABILITIES; ret = tcpm_ams_start(port, POWER_NEGOTIATION); if (ret == -EAGAIN) { port->upcoming_state = INVALID_STATE; goto port_unlock; } - /* Round down operating current to align with PPS valid steps */ - req_op_curr = req_op_curr - (req_op_curr % RDO_PROG_CURR_MA_STEP); - - reinit_completion(&port->pps_complete); - port->pps_data.req_op_curr = req_op_curr; - port->pps_status = 0; - port->pps_pending = true; + reinit_completion(&port->aug_supply_req_complete); + if (port->pps_data.active) + port->pps_data.req_op_curr = req_op_curr_ma; + else + port->spr_avs_data.req_op_curr_ma = req_op_curr_ma; + port->aug_supply_req_status = 0; + port->aug_supply_req_pending = true; mutex_unlock(&port->lock); - if (!wait_for_completion_timeout(&port->pps_complete, - msecs_to_jiffies(PD_PPS_CTRL_TIMEOUT))) + if (!wait_for_completion_timeout(&port->aug_supply_req_complete, + msecs_to_jiffies(PD_AUG_PSY_CTRL_TIMEOUT))) ret = -ETIMEDOUT; else - ret = port->pps_status; + ret = port->aug_supply_req_status; goto swap_unlock; @@ -7001,7 +7291,7 @@ swap_unlock: return ret; } -static int tcpm_pps_set_out_volt(struct tcpm_port *port, u16 req_out_volt) +static int tcpm_aug_set_out_volt(struct tcpm_port *port, u16 req_out_volt_mv) { unsigned int target_mw; int ret; @@ -7009,7 +7299,16 @@ static int tcpm_pps_set_out_volt(struct tcpm_port *port, u16 req_out_volt) mutex_lock(&port->swap_lock); mutex_lock(&port->lock); - if (!port->pps_data.active) { + if (port->pps_data.active) { + req_out_volt_mv = req_out_volt_mv - (req_out_volt_mv % + RDO_PROG_VOLT_MV_STEP); + /* Round down output voltage to align with PPS valid steps */ + target_mw = (port->current_limit * req_out_volt_mv) / 1000; + if (target_mw < port->operating_snk_mw) { + ret = -EINVAL; + goto port_unlock; + } + } else if (!port->spr_avs_data.active) { ret = -EOPNOTSUPP; goto port_unlock; } @@ -7019,33 +7318,31 @@ static int tcpm_pps_set_out_volt(struct tcpm_port *port, u16 req_out_volt) goto port_unlock; } - target_mw = (port->current_limit * req_out_volt) / 1000; - if (target_mw < port->operating_snk_mw) { - ret = -EINVAL; - goto port_unlock; - } + if (port->pps_data.active) + port->upcoming_state = SNK_NEGOTIATE_PPS_CAPABILITIES; + else + port->upcoming_state = SNK_NEGOTIATE_SPR_AVS_CAPABILITIES; - port->upcoming_state = SNK_NEGOTIATE_PPS_CAPABILITIES; ret = tcpm_ams_start(port, POWER_NEGOTIATION); if (ret == -EAGAIN) { port->upcoming_state = INVALID_STATE; goto port_unlock; } - /* Round down output voltage to align with PPS valid steps */ - req_out_volt = req_out_volt - (req_out_volt % RDO_PROG_VOLT_MV_STEP); - - reinit_completion(&port->pps_complete); - port->pps_data.req_out_volt = req_out_volt; - port->pps_status = 0; - port->pps_pending = true; + reinit_completion(&port->aug_supply_req_complete); + if (port->pps_data.active) + port->pps_data.req_out_volt = req_out_volt_mv; + else + port->spr_avs_data.req_out_volt_mv = req_out_volt_mv; + port->aug_supply_req_status = 0; + port->aug_supply_req_pending = true; mutex_unlock(&port->lock); - if (!wait_for_completion_timeout(&port->pps_complete, - msecs_to_jiffies(PD_PPS_CTRL_TIMEOUT))) + if (!wait_for_completion_timeout(&port->aug_supply_req_complete, + msecs_to_jiffies(PD_AUG_PSY_CTRL_TIMEOUT))) ret = -ETIMEDOUT; else - ret = port->pps_status; + ret = port->aug_supply_req_status; goto swap_unlock; @@ -7088,9 +7385,9 @@ static int tcpm_pps_activate(struct tcpm_port *port, bool activate) goto port_unlock; } - reinit_completion(&port->pps_complete); - port->pps_status = 0; - port->pps_pending = true; + reinit_completion(&port->aug_supply_req_complete); + port->aug_supply_req_status = 0; + port->aug_supply_req_pending = true; /* Trigger PPS request or move back to standard PDO contract */ if (activate) { @@ -7099,11 +7396,75 @@ static int tcpm_pps_activate(struct tcpm_port *port, bool activate) } mutex_unlock(&port->lock); - if (!wait_for_completion_timeout(&port->pps_complete, - msecs_to_jiffies(PD_PPS_CTRL_TIMEOUT))) + if (!wait_for_completion_timeout(&port->aug_supply_req_complete, + msecs_to_jiffies(PD_AUG_PSY_CTRL_TIMEOUT))) + ret = -ETIMEDOUT; + else + ret = port->aug_supply_req_status; + + goto swap_unlock; + +port_unlock: + mutex_unlock(&port->lock); +swap_unlock: + mutex_unlock(&port->swap_lock); + + return ret; +} + +static int tcpm_spr_avs_activate(struct tcpm_port *port, bool activate) +{ + int ret = 0; + + mutex_lock(&port->swap_lock); + mutex_lock(&port->lock); + + if (port->spr_avs_data.port_snk_status == SPR_AVS_NOT_SUPPORTED || + port->spr_avs_data.port_partner_src_status == SPR_AVS_NOT_SUPPORTED) { + tcpm_log(port, "SPR_AVS not supported"); + ret = -EOPNOTSUPP; + goto port_unlock; + } + + /* Trying to deactivate SPR AVS when already deactivated so just bail */ + if (!port->spr_avs_data.active && !activate) + goto port_unlock; + + if (port->state != SNK_READY) { + tcpm_log(port, + "SPR_AVS cannot be activated. Port not in SNK_READY"); + ret = -EAGAIN; + goto port_unlock; + } + + if (activate) + port->upcoming_state = SNK_NEGOTIATE_SPR_AVS_CAPABILITIES; + else + port->upcoming_state = SNK_NEGOTIATE_CAPABILITIES; + ret = tcpm_ams_start(port, POWER_NEGOTIATION); + if (ret == -EAGAIN) { + tcpm_log(port, "SPR_AVS cannot be %s. AMS start failed", + activate ? "activated" : "deactivated"); + port->upcoming_state = INVALID_STATE; + goto port_unlock; + } + + reinit_completion(&port->aug_supply_req_complete); + port->aug_supply_req_status = 0; + port->aug_supply_req_pending = true; + + /* Trigger AVS request or move back to standard PDO contract */ + if (activate) { + port->spr_avs_data.req_out_volt_mv = port->supply_voltage; + port->spr_avs_data.req_op_curr_ma = port->current_limit; + } + mutex_unlock(&port->lock); + + if (!wait_for_completion_timeout(&port->aug_supply_req_complete, + msecs_to_jiffies(PD_AUG_PSY_CTRL_TIMEOUT))) ret = -ETIMEDOUT; else - ret = port->pps_status; + ret = port->aug_supply_req_status; goto swap_unlock; @@ -7259,16 +7620,26 @@ static int tcpm_pd_set(struct typec_port *p, struct usb_power_delivery *pd) break; case SNK_NEGOTIATE_CAPABILITIES: case SNK_NEGOTIATE_PPS_CAPABILITIES: + case SNK_NEGOTIATE_SPR_AVS_CAPABILITIES: case SNK_READY: case SNK_TRANSITION_SINK: case SNK_TRANSITION_SINK_VBUS: - if (port->pps_data.active) + if (port->pps_data.active) { port->upcoming_state = SNK_NEGOTIATE_PPS_CAPABILITIES; - else if (port->pd_capable) + } else if (port->pd_capable) { port->upcoming_state = SNK_NEGOTIATE_CAPABILITIES; - else + if (port->spr_avs_data.active) { + /* + * De-activate AVS and fallback to PD to + * re-evaluate whether AVS is supported in the + * current sink cap set. + */ + port->spr_avs_data.active = false; + port->spr_avs_data.port_snk_status = SPR_AVS_UNKNOWN; + } + } else { break; - + } port->update_sink_caps = true; ret = tcpm_ams_start(port, POWER_NEGOTIATION); @@ -7778,7 +8149,8 @@ static void tcpm_fw_get_pd_revision(struct tcpm_port *port, struct fwnode_handle enum tcpm_psy_online_states { TCPM_PSY_OFFLINE = 0, TCPM_PSY_FIXED_ONLINE, - TCPM_PSY_PROG_ONLINE, + TCPM_PSY_PPS_ONLINE, + TCPM_PSY_SPR_AVS_ONLINE, }; static enum power_supply_property tcpm_psy_props[] = { @@ -7796,7 +8168,9 @@ static int tcpm_psy_get_online(struct tcpm_port *port, { if (port->vbus_charge) { if (port->pps_data.active) - val->intval = TCPM_PSY_PROG_ONLINE; + val->intval = TCPM_PSY_PPS_ONLINE; + else if (port->spr_avs_data.active) + val->intval = TCPM_PSY_SPR_AVS_ONLINE; else val->intval = TCPM_PSY_FIXED_ONLINE; } else { @@ -7811,6 +8185,8 @@ static int tcpm_psy_get_voltage_min(struct tcpm_port *port, { if (port->pps_data.active) val->intval = port->pps_data.min_volt * 1000; + else if (port->spr_avs_data.active) + val->intval = SPR_AVS_TIER1_MIN_VOLT_MV * 1000; else val->intval = port->supply_voltage * 1000; @@ -7822,6 +8198,8 @@ static int tcpm_psy_get_voltage_max(struct tcpm_port *port, { if (port->pps_data.active) val->intval = port->pps_data.max_volt * 1000; + else if (port->spr_avs_data.active) + val->intval = port->spr_avs_data.max_out_volt_mv * 1000; else val->intval = port->supply_voltage * 1000; @@ -7841,6 +8219,8 @@ static int tcpm_psy_get_current_max(struct tcpm_port *port, { if (port->pps_data.active) val->intval = port->pps_data.max_curr * 1000; + else if (port->spr_avs_data.active) + val->intval = port->spr_avs_data.max_current_ma * 1000; else val->intval = port->current_limit * 1000; @@ -7916,17 +8296,41 @@ static int tcpm_psy_get_prop(struct power_supply *psy, return ret; } +static int tcpm_disable_pps_avs(struct tcpm_port *port) +{ + int ret = 0; + + if (port->pps_data.active) + ret = tcpm_pps_activate(port, false); + else if (port->spr_avs_data.active) + ret = tcpm_spr_avs_activate(port, false); + + return ret; +} + static int tcpm_psy_set_online(struct tcpm_port *port, const union power_supply_propval *val) { - int ret; + int ret = 0; switch (val->intval) { case TCPM_PSY_FIXED_ONLINE: - ret = tcpm_pps_activate(port, false); + ret = tcpm_disable_pps_avs(port); + break; + case TCPM_PSY_PPS_ONLINE: + if (port->spr_avs_data.active) + ret = tcpm_spr_avs_activate(port, false); + if (!ret) + ret = tcpm_pps_activate(port, true); break; - case TCPM_PSY_PROG_ONLINE: - ret = tcpm_pps_activate(port, true); + case TCPM_PSY_SPR_AVS_ONLINE: + tcpm_log(port, "request to set AVS online"); + if (port->spr_avs_data.active) + return 0; + ret = tcpm_disable_pps_avs(port); + if (ret) + break; + ret = tcpm_spr_avs_activate(port, true); break; default: ret = -EINVAL; @@ -7955,13 +8359,10 @@ static int tcpm_psy_set_prop(struct power_supply *psy, ret = tcpm_psy_set_online(port, val); break; case POWER_SUPPLY_PROP_VOLTAGE_NOW: - ret = tcpm_pps_set_out_volt(port, val->intval / 1000); + ret = tcpm_aug_set_out_volt(port, val->intval / 1000); break; case POWER_SUPPLY_PROP_CURRENT_NOW: - if (val->intval > port->pps_data.max_curr * 1000) - ret = -EINVAL; - else - ret = tcpm_pps_set_op_curr(port, val->intval / 1000); + ret = tcpm_aug_set_op_curr(port, val->intval / 1000); break; default: ret = -EINVAL; @@ -8006,7 +8407,9 @@ static int devm_tcpm_psy_register(struct tcpm_port *port) port->psy_desc.type = POWER_SUPPLY_TYPE_USB; port->psy_desc.usb_types = BIT(POWER_SUPPLY_USB_TYPE_C) | BIT(POWER_SUPPLY_USB_TYPE_PD) | - BIT(POWER_SUPPLY_USB_TYPE_PD_PPS); + BIT(POWER_SUPPLY_USB_TYPE_PD_PPS) | + BIT(POWER_SUPPLY_USB_TYPE_PD_PPS_SPR_AVS) | + BIT(POWER_SUPPLY_USB_TYPE_PD_SPR_AVS); port->psy_desc.properties = tcpm_psy_props; port->psy_desc.num_properties = ARRAY_SIZE(tcpm_psy_props); port->psy_desc.get_property = tcpm_psy_get_prop; @@ -8101,7 +8504,7 @@ struct tcpm_port *tcpm_register_port(struct device *dev, struct tcpc_dev *tcpc) init_completion(&port->tx_complete); init_completion(&port->swap_complete); - init_completion(&port->pps_complete); + init_completion(&port->aug_supply_req_complete); tcpm_debugfs_init(port); err = tcpm_fw_get_caps(port, tcpc->fwnode); diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h index 5a98983195cb..337a5485af7c 100644 --- a/include/linux/usb/pd.h +++ b/include/linux/usb/pd.h @@ -398,9 +398,30 @@ enum pd_apdo_type { #define PDO_SPR_AVS_APDO_15V_TO_20V_MAX_CURR GENMASK(9, 0) /* 10mA unit */ /* SPR AVS has two different current ranges 9V - 15V, 15V - 20V */ -#define SPR_AVS_TIER1_MIN_VOLT_MV 9000 -#define SPR_AVS_TIER1_MAX_VOLT_MV 15000 -#define SPR_AVS_TIER2_MAX_VOLT_MV 20000 +#define SPR_AVS_TIER1_MIN_VOLT_MV 9000 +#define SPR_AVS_TIER1_MAX_VOLT_MV 15000 +#define SPR_AVS_TIER2_MAX_VOLT_MV 20000 + +#define SPR_AVS_AVS_SMALL_STEP_V 1 +/* vAvsStep - 100mv */ +#define SPR_AVS_VOLT_MV_STEP 100 +/* SPR AVS RDO Operating Current is in 50mA step */ +#define RDO_SPR_AVS_CURR_MA_STEP 50 +/* SPR AVS RDO Output voltage is in 25mV step */ +#define RDO_SPR_AVS_OUT_VOLT_MV_STEP 25 + +#define RDO_SPR_AVS_VOLT GENMASK(20, 9) +#define RDO_SPR_AVS_CURR GENMASK(6, 0) + +#define RDO_SPR_AVS_OUT_VOLT(mv) \ + FIELD_PREP(RDO_SPR_AVS_VOLT, ((mv) / RDO_SPR_AVS_OUT_VOLT_MV_STEP)) + +#define RDO_SPR_AVS_OP_CURR(ma) \ + FIELD_PREP(RDO_SPR_AVS_CURR, ((ma) / RDO_SPR_AVS_CURR_MA_STEP)) + +#define RDO_AVS(idx, out_mv, op_ma, flags) \ + (RDO_OBJ(idx) | (flags) | \ + RDO_SPR_AVS_OUT_VOLT(out_mv) | RDO_SPR_AVS_OP_CURR(op_ma)) static inline enum pd_pdo_type pdo_type(u32 pdo) { @@ -660,6 +681,11 @@ static inline unsigned int rdo_max_power(u32 rdo) #define PD_P_SNK_STDBY_MW 2500 /* 2500 mW */ +#define PD_I_SNK_STBY_MA 500 /* 500 mA */ + +#define PD_T_AVS_SRC_TRANS_SMALL 50 /* 50 ms */ +#define PD_T_AVS_SRC_TRANS_LARGE 700 /* 700 ms */ + #if IS_ENABLED(CONFIG_TYPEC) struct usb_power_delivery; diff --git a/include/linux/usb/tcpm.h b/include/linux/usb/tcpm.h index b22e659f81ba..93079450bba0 100644 --- a/include/linux/usb/tcpm.h +++ b/include/linux/usb/tcpm.h @@ -31,7 +31,7 @@ enum typec_cc_polarity { /* Time to wait for TCPC to complete transmit */ #define PD_T_TCPC_TX_TIMEOUT 100 /* in ms */ #define PD_ROLE_SWAP_TIMEOUT (MSEC_PER_SEC * 10) -#define PD_PPS_CTRL_TIMEOUT (MSEC_PER_SEC * 10) +#define PD_AUG_PSY_CTRL_TIMEOUT (MSEC_PER_SEC * 10) enum tcpm_transmit_status { TCPC_TX_SUCCESS = 0, -- cgit v1.2.3 From 16c1e8385b3bb65d412d7a60107f8894587c63fa Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Mar 2026 15:25:44 -0400 Subject: cpufreq: optimize policy_is_shared() The switch to cpumask_nth() over cpumask_weight(), as it may return earlier - as soon as the function counts the required number of CPUs. Signed-off-by: Yury Norov Acked-by: Viresh Kumar Reviewed-by: Zhongqiu Han Link: https://patch.msgid.link/20260314192544.605914-1-ynorov@nvidia.com Signed-off-by: Rafael J. Wysocki --- include/linux/cpufreq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index cc894fc38971..8ca2bcb3d7ae 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -232,7 +232,7 @@ static inline bool policy_is_inactive(struct cpufreq_policy *policy) static inline bool policy_is_shared(struct cpufreq_policy *policy) { - return cpumask_weight(policy->cpus) > 1; + return cpumask_nth(1, policy->cpus) < nr_cpumask_bits; } #ifdef CONFIG_CPU_FREQ -- cgit v1.2.3 From deffe1edba626d474fef38007c03646ca5876a0e Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Fri, 13 Mar 2026 14:48:02 +0100 Subject: module: Fix freeing of charp module parameters when CONFIG_SYSFS=n When setting a charp module parameter, the param_set_charp() function allocates memory to store a copy of the input value. Later, when the module is potentially unloaded, the destroy_params() function is called to free this allocated memory. However, destroy_params() is available only when CONFIG_SYSFS=y, otherwise only a dummy variant is present. In the unlikely case that the kernel is configured with CONFIG_MODULES=y and CONFIG_SYSFS=n, this results in a memory leak of charp values when a module is unloaded. Fix this issue by making destroy_params() always available when CONFIG_MODULES=y. Rename the function to module_destroy_params() to clarify that it is intended for use by the module loader. Fixes: e180a6b7759a ("param: fix charp parameters set via sysfs") Signed-off-by: Petr Pavlu Signed-off-by: Sami Tolvanen --- include/linux/moduleparam.h | 11 +++-------- kernel/module/main.c | 4 ++-- kernel/params.c | 27 ++++++++++++++++++--------- 3 files changed, 23 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 7d22d4c4ea2e..8667f72503d9 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -426,14 +426,9 @@ extern char *parse_args(const char *name, void *arg, parse_unknown_fn unknown); /* Called by module remove. */ -#ifdef CONFIG_SYSFS -extern void destroy_params(const struct kernel_param *params, unsigned num); -#else -static inline void destroy_params(const struct kernel_param *params, - unsigned num) -{ -} -#endif /* !CONFIG_SYSFS */ +#ifdef CONFIG_MODULES +void module_destroy_params(const struct kernel_param *params, unsigned int num); +#endif /* All the helper functions */ /* The macros to do compile-time type checking stolen from Jakub diff --git a/kernel/module/main.c b/kernel/module/main.c index c3ce106c70af..ef2e2130972f 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1408,7 +1408,7 @@ static void free_module(struct module *mod) module_unload_free(mod); /* Free any allocated parameters. */ - destroy_params(mod->kp, mod->num_kp); + module_destroy_params(mod->kp, mod->num_kp); if (is_livepatch_module(mod)) free_module_elf(mod); @@ -3519,7 +3519,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mod_sysfs_teardown(mod); coming_cleanup: mod->state = MODULE_STATE_GOING; - destroy_params(mod->kp, mod->num_kp); + module_destroy_params(mod->kp, mod->num_kp); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); klp_module_going(mod); diff --git a/kernel/params.c b/kernel/params.c index 7188a12dbe86..c6a354d54213 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -745,15 +745,6 @@ void module_param_sysfs_remove(struct module *mod) } #endif -void destroy_params(const struct kernel_param *params, unsigned num) -{ - unsigned int i; - - for (i = 0; i < num; i++) - if (params[i].ops->free) - params[i].ops->free(params[i].arg); -} - struct module_kobject * __init_or_module lookup_or_create_module_kobject(const char *name) { @@ -985,3 +976,21 @@ static int __init param_sysfs_builtin_init(void) late_initcall(param_sysfs_builtin_init); #endif /* CONFIG_SYSFS */ + +#ifdef CONFIG_MODULES + +/* + * module_destroy_params - free all parameters for one module + * @params: module parameters (array) + * @num: number of module parameters + */ +void module_destroy_params(const struct kernel_param *params, unsigned int num) +{ + unsigned int i; + + for (i = 0; i < num; i++) + if (params[i].ops->free) + params[i].ops->free(params[i].arg); +} + +#endif /* CONFIG_MODULES */ -- cgit v1.2.3 From 65f535501e2a3378629b8650eca553920de5e5a2 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Fri, 13 Mar 2026 14:48:03 +0100 Subject: module: Clean up parse_args() arguments * Use the preferred `unsigned int` over plain `unsigned` for the `num` parameter. * Synchronize the parameter names in moduleparam.h with the ones used by the implementation in params.c. Signed-off-by: Petr Pavlu Signed-off-by: Sami Tolvanen --- include/linux/moduleparam.h | 8 ++++---- kernel/params.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 8667f72503d9..604bc6e9f3a1 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -417,12 +417,12 @@ extern bool parameqn(const char *name1, const char *name2, size_t n); typedef int (*parse_unknown_fn)(char *param, char *val, const char *doing, void *arg); /* Called on module insert or kernel boot */ -extern char *parse_args(const char *name, +extern char *parse_args(const char *doing, char *args, const struct kernel_param *params, - unsigned num, - s16 level_min, - s16 level_max, + unsigned int num, + s16 min_level, + s16 max_level, void *arg, parse_unknown_fn unknown); /* Called by module remove. */ diff --git a/kernel/params.c b/kernel/params.c index c6a354d54213..74d620bc2521 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -161,7 +161,7 @@ static int parse_one(char *param, char *parse_args(const char *doing, char *args, const struct kernel_param *params, - unsigned num, + unsigned int num, s16 min_level, s16 max_level, void *arg, parse_unknown_fn unknown) -- cgit v1.2.3 From 44a063c00fb13cf1f2e8a53a2ab10b232a44954b Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Fri, 13 Mar 2026 14:48:04 +0100 Subject: module: Remove extern keyword from param prototypes The external function declarations do not need the "extern" keyword. Remove it to align with the Linux kernel coding style and to silence the associated checkpatch warnings. Signed-off-by: Petr Pavlu Signed-off-by: Sami Tolvanen --- include/linux/moduleparam.h | 89 ++++++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 604bc6e9f3a1..075f28585074 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -317,8 +317,8 @@ struct kparam_array name, &__param_ops_##name, arg, perm, -1, 0) #ifdef CONFIG_SYSFS -extern void kernel_param_lock(struct module *mod); -extern void kernel_param_unlock(struct module *mod); +void kernel_param_lock(struct module *mod); +void kernel_param_unlock(struct module *mod); #else static inline void kernel_param_lock(struct module *mod) { @@ -398,7 +398,7 @@ static inline void kernel_param_unlock(struct module *mod) * Returns: true if the two parameter names are equal. * Dashes (-) are considered equal to underscores (_). */ -extern bool parameq(const char *name1, const char *name2); +bool parameq(const char *name1, const char *name2); /** * parameqn - checks if two parameter names match @@ -412,18 +412,18 @@ extern bool parameq(const char *name1, const char *name2); * are equal. * Dashes (-) are considered equal to underscores (_). */ -extern bool parameqn(const char *name1, const char *name2, size_t n); +bool parameqn(const char *name1, const char *name2, size_t n); typedef int (*parse_unknown_fn)(char *param, char *val, const char *doing, void *arg); /* Called on module insert or kernel boot */ -extern char *parse_args(const char *doing, - char *args, - const struct kernel_param *params, - unsigned int num, - s16 min_level, - s16 max_level, - void *arg, parse_unknown_fn unknown); +char *parse_args(const char *doing, + char *args, + const struct kernel_param *params, + unsigned int num, + s16 min_level, + s16 max_level, + void *arg, parse_unknown_fn unknown); /* Called by module remove. */ #ifdef CONFIG_MODULES @@ -437,78 +437,77 @@ void module_destroy_params(const struct kernel_param *params, unsigned int num); static inline type __always_unused *__check_##name(void) { return(p); } extern const struct kernel_param_ops param_ops_byte; -extern int param_set_byte(const char *val, const struct kernel_param *kp); -extern int param_get_byte(char *buffer, const struct kernel_param *kp); +int param_set_byte(const char *val, const struct kernel_param *kp); +int param_get_byte(char *buffer, const struct kernel_param *kp); #define param_check_byte(name, p) __param_check(name, p, unsigned char) extern const struct kernel_param_ops param_ops_short; -extern int param_set_short(const char *val, const struct kernel_param *kp); -extern int param_get_short(char *buffer, const struct kernel_param *kp); +int param_set_short(const char *val, const struct kernel_param *kp); +int param_get_short(char *buffer, const struct kernel_param *kp); #define param_check_short(name, p) __param_check(name, p, short) extern const struct kernel_param_ops param_ops_ushort; -extern int param_set_ushort(const char *val, const struct kernel_param *kp); -extern int param_get_ushort(char *buffer, const struct kernel_param *kp); +int param_set_ushort(const char *val, const struct kernel_param *kp); +int param_get_ushort(char *buffer, const struct kernel_param *kp); #define param_check_ushort(name, p) __param_check(name, p, unsigned short) extern const struct kernel_param_ops param_ops_int; -extern int param_set_int(const char *val, const struct kernel_param *kp); -extern int param_get_int(char *buffer, const struct kernel_param *kp); +int param_set_int(const char *val, const struct kernel_param *kp); +int param_get_int(char *buffer, const struct kernel_param *kp); #define param_check_int(name, p) __param_check(name, p, int) extern const struct kernel_param_ops param_ops_uint; -extern int param_set_uint(const char *val, const struct kernel_param *kp); -extern int param_get_uint(char *buffer, const struct kernel_param *kp); +int param_set_uint(const char *val, const struct kernel_param *kp); +int param_get_uint(char *buffer, const struct kernel_param *kp); int param_set_uint_minmax(const char *val, const struct kernel_param *kp, unsigned int min, unsigned int max); #define param_check_uint(name, p) __param_check(name, p, unsigned int) extern const struct kernel_param_ops param_ops_long; -extern int param_set_long(const char *val, const struct kernel_param *kp); -extern int param_get_long(char *buffer, const struct kernel_param *kp); +int param_set_long(const char *val, const struct kernel_param *kp); +int param_get_long(char *buffer, const struct kernel_param *kp); #define param_check_long(name, p) __param_check(name, p, long) extern const struct kernel_param_ops param_ops_ulong; -extern int param_set_ulong(const char *val, const struct kernel_param *kp); -extern int param_get_ulong(char *buffer, const struct kernel_param *kp); +int param_set_ulong(const char *val, const struct kernel_param *kp); +int param_get_ulong(char *buffer, const struct kernel_param *kp); #define param_check_ulong(name, p) __param_check(name, p, unsigned long) extern const struct kernel_param_ops param_ops_ullong; -extern int param_set_ullong(const char *val, const struct kernel_param *kp); -extern int param_get_ullong(char *buffer, const struct kernel_param *kp); +int param_set_ullong(const char *val, const struct kernel_param *kp); +int param_get_ullong(char *buffer, const struct kernel_param *kp); #define param_check_ullong(name, p) __param_check(name, p, unsigned long long) extern const struct kernel_param_ops param_ops_hexint; -extern int param_set_hexint(const char *val, const struct kernel_param *kp); -extern int param_get_hexint(char *buffer, const struct kernel_param *kp); +int param_set_hexint(const char *val, const struct kernel_param *kp); +int param_get_hexint(char *buffer, const struct kernel_param *kp); #define param_check_hexint(name, p) param_check_uint(name, p) extern const struct kernel_param_ops param_ops_charp; -extern int param_set_charp(const char *val, const struct kernel_param *kp); -extern int param_get_charp(char *buffer, const struct kernel_param *kp); -extern void param_free_charp(void *arg); +int param_set_charp(const char *val, const struct kernel_param *kp); +int param_get_charp(char *buffer, const struct kernel_param *kp); +void param_free_charp(void *arg); #define param_check_charp(name, p) __param_check(name, p, char *) /* We used to allow int as well as bool. We're taking that away! */ extern const struct kernel_param_ops param_ops_bool; -extern int param_set_bool(const char *val, const struct kernel_param *kp); -extern int param_get_bool(char *buffer, const struct kernel_param *kp); +int param_set_bool(const char *val, const struct kernel_param *kp); +int param_get_bool(char *buffer, const struct kernel_param *kp); #define param_check_bool(name, p) __param_check(name, p, bool) extern const struct kernel_param_ops param_ops_bool_enable_only; -extern int param_set_bool_enable_only(const char *val, - const struct kernel_param *kp); +int param_set_bool_enable_only(const char *val, const struct kernel_param *kp); /* getter is the same as for the regular bool */ #define param_check_bool_enable_only param_check_bool extern const struct kernel_param_ops param_ops_invbool; -extern int param_set_invbool(const char *val, const struct kernel_param *kp); -extern int param_get_invbool(char *buffer, const struct kernel_param *kp); +int param_set_invbool(const char *val, const struct kernel_param *kp); +int param_get_invbool(char *buffer, const struct kernel_param *kp); #define param_check_invbool(name, p) __param_check(name, p, bool) /* An int, which can only be set like a bool (though it shows as an int). */ extern const struct kernel_param_ops param_ops_bint; -extern int param_set_bint(const char *val, const struct kernel_param *kp); +int param_set_bint(const char *val, const struct kernel_param *kp); #define param_get_bint param_get_int #define param_check_bint param_check_int @@ -615,19 +614,19 @@ enum hwparam_type { extern const struct kernel_param_ops param_array_ops; extern const struct kernel_param_ops param_ops_string; -extern int param_set_copystring(const char *val, const struct kernel_param *); -extern int param_get_string(char *buffer, const struct kernel_param *kp); +int param_set_copystring(const char *val, const struct kernel_param *kp); +int param_get_string(char *buffer, const struct kernel_param *kp); /* for exporting parameters in /sys/module/.../parameters */ struct module; #if defined(CONFIG_SYSFS) && defined(CONFIG_MODULES) -extern int module_param_sysfs_setup(struct module *mod, - const struct kernel_param *kparam, - unsigned int num_params); +int module_param_sysfs_setup(struct module *mod, + const struct kernel_param *kparam, + unsigned int num_params); -extern void module_param_sysfs_remove(struct module *mod); +void module_param_sysfs_remove(struct module *mod); #else static inline int module_param_sysfs_setup(struct module *mod, const struct kernel_param *kparam, -- cgit v1.2.3 From 306c36a76da2d6d2b5e91db925d41a9a8d77dbfd Mon Sep 17 00:00:00 2001 From: Josh Law Date: Wed, 18 Mar 2026 15:59:12 +0000 Subject: bootconfig: constify xbc_calc_checksum() data parameter xbc_calc_checksum() only reads the data buffer, so mark the parameter as const void * and the internal pointer as const unsigned char *. Link: https://lore.kernel.org/all/20260318155919.78168-7-objecting@objecting.org/ Signed-off-by: Josh Law Signed-off-by: Masami Hiramatsu (Google) --- include/linux/bootconfig.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 25df9260d206..23a96c5edcf3 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -36,9 +36,9 @@ bool __init cmdline_has_extra_options(void); * The checksum will be used with the BOOTCONFIG_MAGIC and the size for * embedding the bootconfig in the initrd image. */ -static inline __init uint32_t xbc_calc_checksum(void *data, uint32_t size) +static inline __init uint32_t xbc_calc_checksum(const void *data, uint32_t size) { - unsigned char *p = data; + const unsigned char *p = data; uint32_t ret = 0; while (size--) -- cgit v1.2.3 From 6eb255d019b810614c5cbd99b9ef281b7b9361e3 Mon Sep 17 00:00:00 2001 From: Josh Law Date: Wed, 18 Mar 2026 15:59:19 +0000 Subject: lib/bootconfig: change xbc_node_index() return type to uint16_t lib/bootconfig.c:136:21: warning: conversion from 'long int' to 'int' may change value [-Wconversion] lib/bootconfig.c:308:33: warning: conversion from 'int' to 'uint16_t' may change value [-Wconversion] lib/bootconfig.c:467:37: warning: conversion from 'int' to 'uint16_t' may change value [-Wconversion] lib/bootconfig.c:469:40: warning: conversion from 'int' to 'uint16_t' may change value [-Wconversion] lib/bootconfig.c:472:54: warning: conversion from 'int' to 'uint16_t' may change value [-Wconversion] lib/bootconfig.c:476:45: warning: conversion from 'int' to 'uint16_t' may change value [-Wconversion] xbc_node_index() returns the position of a node in the xbc_nodes array, which has at most XBC_NODE_MAX (8192) entries, well within uint16_t range. Every caller stores the result in a uint16_t field (node->parent, node->child, node->next, or the keys[] array in compose_key_after), so the int return type causes narrowing warnings at all six call sites. Change the return type to uint16_t and add an explicit cast on the pointer subtraction to match the storage width and eliminate the warnings. Link: https://lore.kernel.org/all/20260318155919.78168-14-objecting@objecting.org/ Signed-off-by: Josh Law Signed-off-by: Masami Hiramatsu (Google) --- include/linux/bootconfig.h | 2 +- lib/bootconfig.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 23a96c5edcf3..692a5acc2ffc 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -66,7 +66,7 @@ struct xbc_node { /* Node tree access raw APIs */ struct xbc_node * __init xbc_root_node(void); -int __init xbc_node_index(struct xbc_node *node); +uint16_t __init xbc_node_index(struct xbc_node *node); struct xbc_node * __init xbc_node_get_parent(struct xbc_node *node); struct xbc_node * __init xbc_node_get_child(struct xbc_node *node); struct xbc_node * __init xbc_node_get_next(struct xbc_node *node); diff --git a/lib/bootconfig.c b/lib/bootconfig.c index 343aa9629464..01966ab9a8b5 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -131,9 +131,9 @@ struct xbc_node * __init xbc_root_node(void) * * Return the index number of @node in XBC node list. */ -int __init xbc_node_index(struct xbc_node *node) +uint16_t __init xbc_node_index(struct xbc_node *node) { - return node - &xbc_nodes[0]; + return (uint16_t)(node - &xbc_nodes[0]); } /** -- cgit v1.2.3 From 9c6b4009da5991db6bf02a47a578885a04a5e3f6 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 16 Mar 2026 11:04:03 +0100 Subject: net: mdio-gpio: remove linux/mdio-gpio.h The three defines from the linux/mdio-gpio.h header are only used in the mdio-gpio module. There's no reason to have them in a public header. Move them into the driver and remove mdio-gpio.h. Signed-off-by: Bartosz Golaszewski Link: https://patch.msgid.link/20260316-gpio-mdio-hdr-cleanup-v1-1-2df696f74728@oss.qualcomm.com Signed-off-by: Jakub Kicinski --- drivers/net/mdio/mdio-gpio.c | 5 ++++- include/linux/mdio-gpio.h | 9 --------- 2 files changed, 4 insertions(+), 10 deletions(-) delete mode 100644 include/linux/mdio-gpio.h (limited to 'include/linux') diff --git a/drivers/net/mdio/mdio-gpio.c b/drivers/net/mdio/mdio-gpio.c index 1cfd538b5105..c99310889896 100644 --- a/drivers/net/mdio/mdio-gpio.c +++ b/drivers/net/mdio/mdio-gpio.c @@ -20,13 +20,16 @@ #include #include #include -#include #include #include #include #include #include +#define MDIO_GPIO_MDC 0 +#define MDIO_GPIO_MDIO 1 +#define MDIO_GPIO_MDO 2 + struct mdio_gpio_info { struct mdiobb_ctrl ctrl; struct gpio_desc *mdc, *mdio, *mdo; diff --git a/include/linux/mdio-gpio.h b/include/linux/mdio-gpio.h deleted file mode 100644 index cea443a672cb..000000000000 --- a/include/linux/mdio-gpio.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_MDIO_GPIO_H -#define __LINUX_MDIO_GPIO_H - -#define MDIO_GPIO_MDC 0 -#define MDIO_GPIO_MDIO 1 -#define MDIO_GPIO_MDO 2 - -#endif -- cgit v1.2.3 From 356d4fbcf3defaff0f98d2b6b54f3b26f0ff189d Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 16 Mar 2026 11:04:04 +0100 Subject: net: mdio-gpio: remove linux/platform_data/mdio-gpio.h Nobody defines struct mdio_gpio_platform_data. Remove platform data support from mdio-gpio and drop the header. Signed-off-by: Bartosz Golaszewski Link: https://patch.msgid.link/20260316-gpio-mdio-hdr-cleanup-v1-2-2df696f74728@oss.qualcomm.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 - drivers/net/mdio/mdio-gpio.c | 7 ------- include/linux/platform_data/mdio-gpio.h | 14 -------------- 3 files changed, 22 deletions(-) delete mode 100644 include/linux/platform_data/mdio-gpio.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 5d477fd592db..7d65f9435950 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9547,7 +9547,6 @@ F: include/linux/phy_fixed.h F: include/linux/phy_link_topology.h F: include/linux/phylib_stubs.h F: include/linux/platform_data/mdio-bcm-unimac.h -F: include/linux/platform_data/mdio-gpio.h F: include/net/phy/ F: include/trace/events/mdio.h F: include/uapi/linux/mdio.h diff --git a/drivers/net/mdio/mdio-gpio.c b/drivers/net/mdio/mdio-gpio.c index c99310889896..958d1c6608ab 100644 --- a/drivers/net/mdio/mdio-gpio.c +++ b/drivers/net/mdio/mdio-gpio.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -113,7 +112,6 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, struct mdio_gpio_info *bitbang, int bus_id) { - struct mdio_gpio_platform_data *pdata = dev_get_platdata(dev); struct mii_bus *new_bus; bitbang->ctrl.ops = &mdio_gpio_ops; @@ -130,11 +128,6 @@ static struct mii_bus *mdio_gpio_bus_init(struct device *dev, else strscpy(new_bus->id, "gpio", sizeof(new_bus->id)); - if (pdata) { - new_bus->phy_mask = pdata->phy_mask; - new_bus->phy_ignore_ta_mask = pdata->phy_ignore_ta_mask; - } - if (device_is_compatible(dev, "microchip,mdio-smi0")) { bitbang->ctrl.op_c22_read = 0; bitbang->ctrl.op_c22_write = 0; diff --git a/include/linux/platform_data/mdio-gpio.h b/include/linux/platform_data/mdio-gpio.h deleted file mode 100644 index 13874fa6e767..000000000000 --- a/include/linux/platform_data/mdio-gpio.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * MDIO-GPIO bus platform data structure - */ - -#ifndef __LINUX_MDIO_GPIO_PDATA_H -#define __LINUX_MDIO_GPIO_PDATA_H - -struct mdio_gpio_platform_data { - u32 phy_mask; - u32 phy_ignore_ta_mask; -}; - -#endif /* __LINUX_MDIO_GPIO_PDATA_H */ -- cgit v1.2.3 From dc3d720e12f602059490c1ab2bfee84a7465998f Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Tue, 17 Mar 2026 12:18:05 -0700 Subject: net: ethtool: add ethtool COALESCE_RX_CQE_FRAMES/NSECS Add two parameters for drivers supporting Rx CQE coalescing / descriptor writeback. ETHTOOL_A_COALESCE_RX_CQE_FRAMES: Maximum number of frames that can be coalesced into a CQE or writeback. ETHTOOL_A_COALESCE_RX_CQE_NSECS: Max time in nanoseconds after the first packet arrival in a coalesced CQE or writeback to be sent. Signed-off-by: Haiyang Zhang Link: https://patch.msgid.link/20260317191826.1346111-2-haiyangz@linux.microsoft.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 8 ++++++++ Documentation/networking/ethtool-netlink.rst | 11 +++++++++++ include/linux/ethtool.h | 6 +++++- include/uapi/linux/ethtool_netlink_generated.h | 2 ++ net/ethtool/coalesce.c | 14 +++++++++++++- 5 files changed, 39 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index a05b5425b76a..5dd4d1b5d94b 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -857,6 +857,12 @@ attribute-sets: name: tx-profile type: nest nested-attributes: profile + - + name: rx-cqe-frames + type: u32 + - + name: rx-cqe-nsecs + type: u32 - name: pause-stat @@ -2253,6 +2259,8 @@ operations: - tx-aggr-time-usecs - rx-profile - tx-profile + - rx-cqe-frames + - rx-cqe-nsecs dump: *coalesce-get-op - name: coalesce-set diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 32179168eb73..e92abf45faf5 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1076,6 +1076,8 @@ Kernel response contents: ``ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS`` u32 time (us), aggr, Tx ``ETHTOOL_A_COALESCE_RX_PROFILE`` nested profile of DIM, Rx ``ETHTOOL_A_COALESCE_TX_PROFILE`` nested profile of DIM, Tx + ``ETHTOOL_A_COALESCE_RX_CQE_FRAMES`` u32 max packets, Rx CQE + ``ETHTOOL_A_COALESCE_RX_CQE_NSECS`` u32 delay (ns), Rx CQE =========================================== ====== ======================= Attributes are only included in reply if their value is not zero or the @@ -1109,6 +1111,13 @@ well with frequent small-sized URBs transmissions. to DIM parameters, see `Generic Network Dynamic Interrupt Moderation (Net DIM) `_. +Rx CQE coalescing allows multiple received packets to be coalesced into a +single Completion Queue Entry (CQE) or descriptor writeback. +``ETHTOOL_A_COALESCE_RX_CQE_FRAMES`` describes the maximum number of +frames that can be coalesced into a CQE or writeback. +``ETHTOOL_A_COALESCE_RX_CQE_NSECS`` describes max time in nanoseconds after +the first packet arrival in a coalesced CQE or writeback to be sent. + COALESCE_SET ============ @@ -1147,6 +1156,8 @@ Request contents: ``ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS`` u32 time (us), aggr, Tx ``ETHTOOL_A_COALESCE_RX_PROFILE`` nested profile of DIM, Rx ``ETHTOOL_A_COALESCE_TX_PROFILE`` nested profile of DIM, Tx + ``ETHTOOL_A_COALESCE_RX_CQE_FRAMES`` u32 max packets, Rx CQE + ``ETHTOOL_A_COALESCE_RX_CQE_NSECS`` u32 delay (ns), Rx CQE =========================================== ====== ======================= Request is rejected if it attributes declared as unsupported by driver (i.e. diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 83c375840835..656d465bcd06 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -332,6 +332,8 @@ struct kernel_ethtool_coalesce { u32 tx_aggr_max_bytes; u32 tx_aggr_max_frames; u32 tx_aggr_time_usecs; + u32 rx_cqe_frames; + u32 rx_cqe_nsecs; }; /** @@ -380,7 +382,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, #define ETHTOOL_COALESCE_TX_AGGR_TIME_USECS BIT(26) #define ETHTOOL_COALESCE_RX_PROFILE BIT(27) #define ETHTOOL_COALESCE_TX_PROFILE BIT(28) -#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(28, 0) +#define ETHTOOL_COALESCE_RX_CQE_FRAMES BIT(29) +#define ETHTOOL_COALESCE_RX_CQE_NSECS BIT(30) +#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(30, 0) #define ETHTOOL_COALESCE_USECS \ (ETHTOOL_COALESCE_RX_USECS | ETHTOOL_COALESCE_TX_USECS) diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 114b83017297..8134baf7860f 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -371,6 +371,8 @@ enum { ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, ETHTOOL_A_COALESCE_RX_PROFILE, ETHTOOL_A_COALESCE_TX_PROFILE, + ETHTOOL_A_COALESCE_RX_CQE_FRAMES, + ETHTOOL_A_COALESCE_RX_CQE_NSECS, __ETHTOOL_A_COALESCE_CNT, ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c index 3e18ca1ccc5e..349bb02c517a 100644 --- a/net/ethtool/coalesce.c +++ b/net/ethtool/coalesce.c @@ -118,6 +118,8 @@ static int coalesce_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_BYTES */ nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_FRAMES */ nla_total_size(sizeof(u32)) + /* _TX_AGGR_TIME_USECS */ + nla_total_size(sizeof(u32)) + /* _RX_CQE_FRAMES */ + nla_total_size(sizeof(u32)) + /* _RX_CQE_NSECS */ total_modersz * 2; /* _{R,T}X_PROFILE */ } @@ -269,7 +271,11 @@ static int coalesce_fill_reply(struct sk_buff *skb, coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, kcoal->tx_aggr_max_frames, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, - kcoal->tx_aggr_time_usecs, supported)) + kcoal->tx_aggr_time_usecs, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_CQE_FRAMES, + kcoal->rx_cqe_frames, supported) || + coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_CQE_NSECS, + kcoal->rx_cqe_nsecs, supported)) return -EMSGSIZE; if (!req_base->dev || !req_base->dev->irq_moder) @@ -338,6 +344,8 @@ const struct nla_policy ethnl_coalesce_set_policy[] = { [ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_RX_CQE_FRAMES] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_RX_CQE_NSECS] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RX_PROFILE] = NLA_POLICY_NESTED(coalesce_profile_policy), [ETHTOOL_A_COALESCE_TX_PROFILE] = @@ -570,6 +578,10 @@ __ethnl_set_coalesce(struct ethnl_req_info *req_info, struct genl_info *info, tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES], &mod); ethnl_update_u32(&kernel_coalesce.tx_aggr_time_usecs, tb[ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS], &mod); + ethnl_update_u32(&kernel_coalesce.rx_cqe_frames, + tb[ETHTOOL_A_COALESCE_RX_CQE_FRAMES], &mod); + ethnl_update_u32(&kernel_coalesce.rx_cqe_nsecs, + tb[ETHTOOL_A_COALESCE_RX_CQE_NSECS], &mod); if (dev->irq_moder && dev->irq_moder->profile_flags & DIM_PROFILE_RX) { ret = ethnl_update_profile(dev, &dev->irq_moder->rx_profile, -- cgit v1.2.3 From a8eed0ba6a4b2f1803ecdfa9f11a4818cf87c474 Mon Sep 17 00:00:00 2001 From: Shardul Bankar Date: Wed, 18 Mar 2026 13:08:22 +0530 Subject: hfsplus: refactor b-tree map page access and add node-type validation In HFS+ b-trees, the node allocation bitmap is stored across multiple records. The first chunk resides in the b-tree Header Node at record index 2, while all subsequent chunks are stored in dedicated Map Nodes at record index 0. This structural quirk forces callers like hfs_bmap_alloc() and hfs_bmap_free() to duplicate boilerplate code to validate offsets, correct lengths, and map the underlying pages via kmap_local_page(). There is also currently no strict node-type validation before reading these records, leaving the allocator vulnerable if a corrupted image points a map linkage to an Index or Leaf node. Introduce a unified bit-level API to encapsulate the map record access: 1. A new `struct hfs_bmap_ctx` to cleanly pass state and safely handle page math across all architectures. 2. `hfs_bmap_get_map_page()`: Automatically validates node types (HFS_NODE_HEADER vs HFS_NODE_MAP), infers the correct record index, handles page-boundary math, and returns the unmapped `struct page *` directly to the caller to avoid asymmetric mappings. 3. `hfs_bmap_clear_bit()`: A clean wrapper that internally handles page mapping/unmapping for single-bit operations. Refactor hfs_bmap_alloc() and hfs_bmap_free() to utilize this new API. This deduplicates the allocator logic, hardens the map traversal against fuzzed images, and provides the exact abstractions needed for upcoming mount-time validation checks. Signed-off-by: Shardul Bankar Reviewed-by: Viacheslav Dubeyko Tested-by: Viacheslav Dubeyko Signed-off-by: Viacheslav Dubeyko Link: https://lore.kernel.org/r/20260318073823.3933718-2-shardul.b@mpiricsoftware.com Signed-off-by: Viacheslav Dubeyko --- fs/hfsplus/btree.c | 169 ++++++++++++++++++++++++++++++++------------- include/linux/hfs_common.h | 2 + 2 files changed, 124 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index 1220a2f22737..64d168347b4b 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -129,6 +129,95 @@ u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, return clump_size; } +/* Context for iterating b-tree map pages + * @page_idx: The index of the page within the b-node's page array + * @off: The byte offset within the mapped page + * @len: The remaining length of the map record + */ +struct hfs_bmap_ctx { + unsigned int page_idx; + unsigned int off; + u16 len; +}; + +/* + * Finds the specific page containing the requested byte offset within the map + * record. Automatically handles the difference between header and map nodes. + * Returns the struct page pointer, or an ERR_PTR on failure. + * Note: The caller is responsible for mapping/unmapping the returned page. + */ +static struct page *hfs_bmap_get_map_page(struct hfs_bnode *node, struct hfs_bmap_ctx *ctx, + u32 byte_offset) +{ + u16 rec_idx, off16; + unsigned int page_off; + + if (node->this == HFSPLUS_TREE_HEAD) { + if (node->type != HFS_NODE_HEADER) { + pr_err("hfsplus: invalid btree header node\n"); + return ERR_PTR(-EIO); + } + rec_idx = HFSPLUS_BTREE_HDR_MAP_REC_INDEX; + } else { + if (node->type != HFS_NODE_MAP) { + pr_err("hfsplus: invalid btree map node\n"); + return ERR_PTR(-EIO); + } + rec_idx = HFSPLUS_BTREE_MAP_NODE_REC_INDEX; + } + + ctx->len = hfs_brec_lenoff(node, rec_idx, &off16); + if (!ctx->len) + return ERR_PTR(-ENOENT); + + if (!is_bnode_offset_valid(node, off16)) + return ERR_PTR(-EIO); + + ctx->len = check_and_correct_requested_length(node, off16, ctx->len); + + if (byte_offset >= ctx->len) + return ERR_PTR(-EINVAL); + + page_off = (u32)off16 + node->page_offset + byte_offset; + ctx->page_idx = page_off >> PAGE_SHIFT; + ctx->off = page_off & ~PAGE_MASK; + + return node->page[ctx->page_idx]; +} + +/** + * hfs_bmap_clear_bit - clear a bit in the b-tree map + * @node: the b-tree node containing the map record + * @node_bit_idx: the relative bit index within the node's map record + * + * Returns 0 on success, -EINVAL if already clear, or negative error code. + */ +static int hfs_bmap_clear_bit(struct hfs_bnode *node, u32 node_bit_idx) +{ + struct hfs_bmap_ctx ctx; + struct page *page; + u8 *bmap, mask; + + page = hfs_bmap_get_map_page(node, &ctx, node_bit_idx / BITS_PER_BYTE); + if (IS_ERR(page)) + return PTR_ERR(page); + + bmap = kmap_local_page(page); + + mask = 1 << (7 - (node_bit_idx % BITS_PER_BYTE)); + + if (!(bmap[ctx.off] & mask)) { + kunmap_local(bmap); + return -EINVAL; + } + + bmap[ctx.off] &= ~mask; + set_page_dirty(page); + kunmap_local(bmap); + + return 0; +} + /* Get a reference to a B*Tree and do some initial checks */ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id) { @@ -374,11 +463,9 @@ int hfs_bmap_reserve(struct hfs_btree *tree, u32 rsvd_nodes) struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) { struct hfs_bnode *node, *next_node; - struct page **pagep; + struct hfs_bmap_ctx ctx; + struct page *page; u32 nidx, idx; - unsigned off; - u16 off16; - u16 len; u8 *data, byte, m; int i, res; @@ -390,30 +477,26 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) node = hfs_bnode_find(tree, nidx); if (IS_ERR(node)) return node; - len = hfs_brec_lenoff(node, 2, &off16); - off = off16; - if (!is_bnode_offset_valid(node, off)) { + page = hfs_bmap_get_map_page(node, &ctx, 0); + if (IS_ERR(page)) { + res = PTR_ERR(page); hfs_bnode_put(node); - return ERR_PTR(-EIO); + return ERR_PTR(res); } - len = check_and_correct_requested_length(node, off, len); - off += node->page_offset; - pagep = node->page + (off >> PAGE_SHIFT); - data = kmap_local_page(*pagep); - off &= ~PAGE_MASK; + data = kmap_local_page(page); idx = 0; for (;;) { - while (len) { - byte = data[off]; + while (ctx.len) { + byte = data[ctx.off]; if (byte != 0xff) { for (m = 0x80, i = 0; i < 8; m >>= 1, i++) { if (!(byte & m)) { idx += i; - data[off] |= m; - set_page_dirty(*pagep); + data[ctx.off] |= m; + set_page_dirty(page); kunmap_local(data); tree->free_nodes--; mark_inode_dirty(tree->inode); @@ -423,13 +506,14 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) } } } - if (++off >= PAGE_SIZE) { + if (++ctx.off >= PAGE_SIZE) { kunmap_local(data); - data = kmap_local_page(*++pagep); - off = 0; + page = node->page[++ctx.page_idx]; + data = kmap_local_page(page); + ctx.off = 0; } idx += 8; - len--; + ctx.len--; } kunmap_local(data); nidx = node->next; @@ -443,22 +527,22 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) return next_node; node = next_node; - len = hfs_brec_lenoff(node, 0, &off16); - off = off16; - off += node->page_offset; - pagep = node->page + (off >> PAGE_SHIFT); - data = kmap_local_page(*pagep); - off &= ~PAGE_MASK; + page = hfs_bmap_get_map_page(node, &ctx, 0); + if (IS_ERR(page)) { + res = PTR_ERR(page); + hfs_bnode_put(node); + return ERR_PTR(res); + } + data = kmap_local_page(page); } } void hfs_bmap_free(struct hfs_bnode *node) { struct hfs_btree *tree; - struct page *page; u16 off, len; u32 nidx; - u8 *data, byte, m; + int res; hfs_dbg("node %u\n", node->this); BUG_ON(!node->this); @@ -495,24 +579,15 @@ void hfs_bmap_free(struct hfs_bnode *node) } len = hfs_brec_lenoff(node, 0, &off); } - off += node->page_offset + nidx / 8; - page = node->page[off >> PAGE_SHIFT]; - data = kmap_local_page(page); - off &= ~PAGE_MASK; - m = 1 << (~nidx & 7); - byte = data[off]; - if (!(byte & m)) { - pr_crit("trying to free free bnode " - "%u(%d)\n", - node->this, node->type); - kunmap_local(data); - hfs_bnode_put(node); - return; + + res = hfs_bmap_clear_bit(node, nidx); + if (res == -EINVAL) { + pr_crit("trying to free free bnode %u(%d)\n", + node->this, node->type); + } else if (!res) { + tree->free_nodes++; + mark_inode_dirty(tree->inode); } - data[off] = byte & ~m; - set_page_dirty(page); - kunmap_local(data); + hfs_bnode_put(node); - tree->free_nodes++; - mark_inode_dirty(tree->inode); } diff --git a/include/linux/hfs_common.h b/include/linux/hfs_common.h index dadb5e0aa8a3..be24c687858e 100644 --- a/include/linux/hfs_common.h +++ b/include/linux/hfs_common.h @@ -510,6 +510,8 @@ struct hfs_btree_header_rec { #define HFSPLUS_NODE_MXSZ 32768 #define HFSPLUS_ATTR_TREE_NODE_SIZE 8192 #define HFSPLUS_BTREE_HDR_NODE_RECS_COUNT 3 +#define HFSPLUS_BTREE_HDR_MAP_REC_INDEX 2 /* Map (bitmap) record in Header node */ +#define HFSPLUS_BTREE_MAP_NODE_REC_INDEX 0 /* Map record in Map Node */ #define HFSPLUS_BTREE_HDR_USER_BYTES 128 /* btree key type */ -- cgit v1.2.3 From c888c4c834c9afc18879fde91ad405be21b7425d Mon Sep 17 00:00:00 2001 From: Svyatoslav Ryhel Date: Tue, 3 Mar 2026 10:42:28 +0200 Subject: gpu: host1x: convert MIPI to use operation function pointers Convert existing MIPI code to use operation function pointers, a necessary step for supporting Tegra20/Tegra30 SoCs. All common MIPI configuration that is SoC-independent remains in mipi.c, while all SoC-specific code is moved to tegra114-mipi.c (The naming matches the first SoC generation with a dedicated calibration block). Shared structures and function calls are placed into tegra-mipi-cal.h. Tested-by: Luca Ceresoli # tegra20, parallel camera Signed-off-by: Svyatoslav Ryhel Acked-by: Mikko Perttunen Signed-off-by: Hans Verkuil --- drivers/gpu/drm/tegra/dsi.c | 1 + drivers/gpu/host1x/Makefile | 1 + drivers/gpu/host1x/mipi.c | 592 +++++++------------------------- drivers/gpu/host1x/tegra114-mipi.c | 483 ++++++++++++++++++++++++++ drivers/staging/media/tegra-video/csi.c | 1 + include/linux/host1x.h | 10 - include/linux/tegra-mipi-cal.h | 57 +++ 7 files changed, 666 insertions(+), 479 deletions(-) create mode 100644 drivers/gpu/host1x/tegra114-mipi.c create mode 100644 include/linux/tegra-mipi-cal.h (limited to 'include/linux') diff --git a/drivers/gpu/drm/tegra/dsi.c b/drivers/gpu/drm/tegra/dsi.c index 2c5aefe9621a..7f25c50621c9 100644 --- a/drivers/gpu/drm/tegra/dsi.c +++ b/drivers/gpu/drm/tegra/dsi.c @@ -14,6 +14,7 @@ #include #include #include +#include #include