From 45e1be5ddec98db71e7481fa7a3005673200d85c Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Tue, 2 Dec 2025 18:36:20 +0100 Subject: dt-bindings: power: qcom,rpmpd: Add SC8280XP_MXC_AO Not sure how useful it's gonna be in practice, but the definition is missing (unlike the previously-unused SC8280XP_MXC-non-_AO), so add it to allow the driver to create the corresponding pmdomain. Fixes: dbfb5f94e084 ("dt-bindings: power: rpmpd: Add sc8280xp RPMh power-domains") Acked-by: Rob Herring (Arm) Signed-off-by: Konrad Dybcio Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20251202-topic-8280_mxc-v2-1-46cdf47a829e@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/power/qcom,rpmhpd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/power/qcom,rpmhpd.h b/include/dt-bindings/power/qcom,rpmhpd.h index 50e7c886709d..06851363ae0e 100644 --- a/include/dt-bindings/power/qcom,rpmhpd.h +++ b/include/dt-bindings/power/qcom,rpmhpd.h @@ -264,5 +264,6 @@ #define SC8280XP_NSP 13 #define SC8280XP_QPHY 14 #define SC8280XP_XO 15 +#define SC8280XP_MXC_AO 16 #endif -- cgit v1.2.3 From 52f758edc9f9f0b1f4f83eb668a5f76482ce00ca Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 12 Dec 2025 15:44:50 +0900 Subject: hyperv: Avoid -Wflex-array-member-not-at-end warning -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. Use the new __TRAILING_OVERLAP() helper to fix the following warning: include/hyperv/hvgdk_mini.h:581:25: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] This helper creates a union between a flexible-array member (FAM) and a set of MEMBERS that would otherwise follow it. This overlays the trailing MEMBER u64 gva_list[]; onto the FAM struct hv_tlb_flush_ex::hv_vp_set.bank_contents[], while keeping the FAM and the start of MEMBER aligned. The static_assert() ensures this alignment remains, and it's intentionally placed inmediately after the related structure --no blank line in between. Signed-off-by: Gustavo A. R. Silva Signed-off-by: Wei Liu --- include/hyperv/hvgdk_mini.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 04b18d0e37af..30fbbde81c5c 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -578,9 +578,12 @@ struct hv_tlb_flush { /* HV_INPUT_FLUSH_VIRTUAL_ADDRESS_LIST */ struct hv_tlb_flush_ex { u64 address_space; u64 flags; - struct hv_vpset hv_vp_set; - u64 gva_list[]; + __TRAILING_OVERLAP(struct hv_vpset, hv_vp_set, bank_contents, __packed, + u64 gva_list[]; + ); } __packed; +static_assert(offsetof(struct hv_tlb_flush_ex, hv_vp_set.bank_contents) == + offsetof(struct hv_tlb_flush_ex, gva_list)); struct ms_hyperv_tsc_page { /* HV_REFERENCE_TSC_PAGE */ volatile u32 tsc_sequence; -- cgit v1.2.3 From 9910159f06590c17df4fbddedaabb4c0201cc4cb Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Mon, 15 Dec 2025 14:17:23 +0100 Subject: iio: core: add separate lockdep class for info_exist_lock When one iio device is a consumer of another, it is possible that the ->info_exist_lock of both ends up being taken when reading the value of the consumer device. Since they currently belong to the same lockdep class (being initialized in a single location with mutex_init()), that results in a lockdep warning CPU0 ---- lock(&iio_dev_opaque->info_exist_lock); lock(&iio_dev_opaque->info_exist_lock); *** DEADLOCK *** May be due to missing lock nesting notation 4 locks held by sensors/414: #0: c31fd6dc (&p->lock){+.+.}-{3:3}, at: seq_read_iter+0x44/0x4e4 #1: c4f5a1c4 (&of->mutex){+.+.}-{3:3}, at: kernfs_seq_start+0x1c/0xac #2: c2827548 (kn->active#34){.+.+}-{0:0}, at: kernfs_seq_start+0x30/0xac #3: c1dd2b68 (&iio_dev_opaque->info_exist_lock){+.+.}-{3:3}, at: iio_read_channel_processed_scale+0x24/0xd8 stack backtrace: CPU: 0 UID: 0 PID: 414 Comm: sensors Not tainted 6.17.11 #5 NONE Hardware name: Generic AM33XX (Flattened Device Tree) Call trace: unwind_backtrace from show_stack+0x10/0x14 show_stack from dump_stack_lvl+0x44/0x60 dump_stack_lvl from print_deadlock_bug+0x2b8/0x334 print_deadlock_bug from __lock_acquire+0x13a4/0x2ab0 __lock_acquire from lock_acquire+0xd0/0x2c0 lock_acquire from __mutex_lock+0xa0/0xe8c __mutex_lock from mutex_lock_nested+0x1c/0x24 mutex_lock_nested from iio_read_channel_raw+0x20/0x6c iio_read_channel_raw from rescale_read_raw+0x128/0x1c4 rescale_read_raw from iio_channel_read+0xe4/0xf4 iio_channel_read from iio_read_channel_processed_scale+0x6c/0xd8 iio_read_channel_processed_scale from iio_hwmon_read_val+0x68/0xbc iio_hwmon_read_val from dev_attr_show+0x18/0x48 dev_attr_show from sysfs_kf_seq_show+0x80/0x110 sysfs_kf_seq_show from seq_read_iter+0xdc/0x4e4 seq_read_iter from vfs_read+0x238/0x2e4 vfs_read from ksys_read+0x6c/0xec ksys_read from ret_fast_syscall+0x0/0x1c Just as the mlock_key already has its own lockdep class, add a lock_class_key for the info_exist mutex. Note that this has in theory been a problem since before IIO first left staging, but it only occurs when a chain of consumers is in use and that is not often done. Fixes: ac917a81117c ("staging:iio:core set the iio_dev.info pointer to null on unregister under lock.") Signed-off-by: Rasmus Villemoes Reviewed-by: Peter Rosin Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-core.c | 4 +++- include/linux/iio/iio-opaque.h | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index f69deefcfb6f..117ffad4f376 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -1657,6 +1657,7 @@ static void iio_dev_release(struct device *device) mutex_destroy(&iio_dev_opaque->info_exist_lock); mutex_destroy(&iio_dev_opaque->mlock); + lockdep_unregister_key(&iio_dev_opaque->info_exist_key); lockdep_unregister_key(&iio_dev_opaque->mlock_key); ida_free(&iio_ida, iio_dev_opaque->id); @@ -1717,9 +1718,10 @@ struct iio_dev *iio_device_alloc(struct device *parent, int sizeof_priv) INIT_LIST_HEAD(&iio_dev_opaque->ioctl_handlers); lockdep_register_key(&iio_dev_opaque->mlock_key); + lockdep_register_key(&iio_dev_opaque->info_exist_key); mutex_init_with_key(&iio_dev_opaque->mlock, &iio_dev_opaque->mlock_key); - mutex_init(&iio_dev_opaque->info_exist_lock); + mutex_init_with_key(&iio_dev_opaque->info_exist_lock, &iio_dev_opaque->info_exist_key); indio_dev->dev.parent = parent; indio_dev->dev.type = &iio_device_type; diff --git a/include/linux/iio/iio-opaque.h b/include/linux/iio/iio-opaque.h index 4247497f3f8b..b87841a355f8 100644 --- a/include/linux/iio/iio-opaque.h +++ b/include/linux/iio/iio-opaque.h @@ -14,6 +14,7 @@ * @mlock: lock used to prevent simultaneous device state changes * @mlock_key: lockdep class for iio_dev lock * @info_exist_lock: lock to prevent use during removal + * @info_exist_key: lockdep class for info_exist lock * @trig_readonly: mark the current trigger immutable * @event_interface: event chrdevs associated with interrupt lines * @attached_buffers: array of buffers statically attached by the driver @@ -47,6 +48,7 @@ struct iio_dev_opaque { struct mutex mlock; struct lock_class_key mlock_key; struct mutex info_exist_lock; + struct lock_class_key info_exist_key; bool trig_readonly; struct iio_event_interface *event_interface; struct iio_buffer **attached_buffers; -- cgit v1.2.3 From 9a49157deeb23581fc5c8189b486340d7343264a Mon Sep 17 00:00:00 2001 From: Brian Kao Date: Thu, 18 Dec 2025 03:17:23 +0000 Subject: scsi: core: Fix error handler encryption support Some low-level drivers (LLD) access block layer crypto fields, such as rq->crypt_keyslot and rq->crypt_ctx within `struct request`, to configure hardware for inline encryption. However, SCSI Error Handling (EH) commands (e.g., TEST UNIT READY, START STOP UNIT) should not involve any encryption setup. To prevent drivers from erroneously applying crypto settings during EH, this patch saves the original values of rq->crypt_keyslot and rq->crypt_ctx before an EH command is prepared via scsi_eh_prep_cmnd(). These fields in the 'struct request' are then set to NULL. The original values are restored in scsi_eh_restore_cmnd() after the EH command completes. This ensures that the block layer crypto context does not leak into EH command execution. Signed-off-by: Brian Kao Link: https://patch.msgid.link/20251218031726.2642834-1-powenkao@google.com Cc: stable@vger.kernel.org Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_error.c | 24 ++++++++++++++++++++++++ include/scsi/scsi_eh.h | 6 ++++++ 2 files changed, 30 insertions(+) (limited to 'include') diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index f869108fd969..eebca96c1fc1 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -1063,6 +1063,9 @@ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses, unsigned char *cmnd, int cmnd_size, unsigned sense_bytes) { struct scsi_device *sdev = scmd->device; +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + struct request *rq = scsi_cmd_to_rq(scmd); +#endif /* * We need saved copies of a number of fields - this is because @@ -1114,6 +1117,18 @@ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses, scmd->cmnd[1] = (scmd->cmnd[1] & 0x1f) | (sdev->lun << 5 & 0xe0); + /* + * Encryption must be disabled for the commands submitted by the error handler. + * Hence, clear the encryption context information. + */ +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + ses->rq_crypt_keyslot = rq->crypt_keyslot; + ses->rq_crypt_ctx = rq->crypt_ctx; + + rq->crypt_keyslot = NULL; + rq->crypt_ctx = NULL; +#endif + /* * Zero the sense buffer. The scsi spec mandates that any * untransferred sense data should be interpreted as being zero. @@ -1131,6 +1146,10 @@ EXPORT_SYMBOL(scsi_eh_prep_cmnd); */ void scsi_eh_restore_cmnd(struct scsi_cmnd* scmd, struct scsi_eh_save *ses) { +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + struct request *rq = scsi_cmd_to_rq(scmd); +#endif + /* * Restore original data */ @@ -1143,6 +1162,11 @@ void scsi_eh_restore_cmnd(struct scsi_cmnd* scmd, struct scsi_eh_save *ses) scmd->underflow = ses->underflow; scmd->prot_op = ses->prot_op; scmd->eh_eflags = ses->eh_eflags; + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + rq->crypt_keyslot = ses->rq_crypt_keyslot; + rq->crypt_ctx = ses->rq_crypt_ctx; +#endif } EXPORT_SYMBOL(scsi_eh_restore_cmnd); diff --git a/include/scsi/scsi_eh.h b/include/scsi/scsi_eh.h index 1ae08e81339f..15679be90c5c 100644 --- a/include/scsi/scsi_eh.h +++ b/include/scsi/scsi_eh.h @@ -41,6 +41,12 @@ struct scsi_eh_save { unsigned char cmnd[32]; struct scsi_data_buffer sdb; struct scatterlist sense_sgl; + + /* struct request fields */ +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + struct bio_crypt_ctx *rq_crypt_ctx; + struct blk_crypto_keyslot *rq_crypt_keyslot; +#endif }; extern void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, -- cgit v1.2.3 From cce0be6eb4971456b703aaeafd571650d314bcca Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 31 Dec 2025 11:42:31 -0500 Subject: NFS: Fix a deadlock involving nfs_release_folio() Wang Zhaolong reports a deadlock involving NFSv4.1 state recovery waiting on kthreadd, which is attempting to reclaim memory by calling nfs_release_folio(). The latter cannot make progress due to state recovery being needed. It seems that the only safe thing to do here is to kick off a writeback of the folio, without waiting for completion, or else kicking off an asynchronous commit. Reported-by: Wang Zhaolong Fixes: 96780ca55e3c ("NFS: fix up nfs_release_folio() to try to release the page") Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 3 ++- fs/nfs/nfstrace.h | 3 +++ fs/nfs/write.c | 33 +++++++++++++++++++++++++++++++++ include/linux/nfs_fs.h | 1 + 4 files changed, 39 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d020aab40c64..d1c138a416cf 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -511,7 +511,8 @@ static bool nfs_release_folio(struct folio *folio, gfp_t gfp) if ((current_gfp_context(gfp) & GFP_KERNEL) != GFP_KERNEL || current_is_kswapd() || current_is_kcompactd()) return false; - if (nfs_wb_folio(folio->mapping->host, folio) < 0) + if (nfs_wb_folio_reclaim(folio->mapping->host, folio) < 0 || + folio_test_private(folio)) return false; } return nfs_fscache_release_folio(folio, gfp); diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 6ce55e8e6b67..9f9ce4a565ea 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1062,6 +1062,9 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done, DEFINE_NFS_FOLIO_EVENT(nfs_aop_readpage); DEFINE_NFS_FOLIO_EVENT_DONE(nfs_aop_readpage_done); +DEFINE_NFS_FOLIO_EVENT(nfs_writeback_folio_reclaim); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_reclaim_done); + DEFINE_NFS_FOLIO_EVENT(nfs_writeback_folio); DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_done); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 336c510f3750..bf412455e8ed 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -2024,6 +2024,39 @@ int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio) return ret; } +/** + * nfs_wb_folio_reclaim - Write back all requests on one page + * @inode: pointer to page + * @folio: pointer to folio + * + * Assumes that the folio has been locked by the caller + */ +int nfs_wb_folio_reclaim(struct inode *inode, struct folio *folio) +{ + loff_t range_start = folio_pos(folio); + size_t len = folio_size(folio); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, + .range_start = range_start, + .range_end = range_start + len - 1, + .for_sync = 1, + }; + int ret; + + if (folio_test_writeback(folio)) + return -EBUSY; + if (folio_clear_dirty_for_io(folio)) { + trace_nfs_writeback_folio_reclaim(inode, range_start, len); + ret = nfs_writepage_locked(folio, &wbc); + trace_nfs_writeback_folio_reclaim_done(inode, range_start, len, + ret); + return ret; + } + nfs_commit_inode(inode, 0); + return 0; +} + /** * nfs_wb_folio - Write back all requests on one page * @inode: pointer to page diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index a6624edb7226..8dd79a3f3d66 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -637,6 +637,7 @@ extern int nfs_update_folio(struct file *file, struct folio *folio, extern int nfs_sync_inode(struct inode *inode); extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_folio(struct inode *inode, struct folio *folio); +extern int nfs_wb_folio_reclaim(struct inode *inode, struct folio *folio); int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio); extern int nfs_commit_inode(struct inode *, int); extern struct nfs_commit_data *nfs_commitdata_alloc(void); -- cgit v1.2.3 From 22cd0db47f4f65ebe8afc8c34ab120c47c73da2a Mon Sep 17 00:00:00 2001 From: Jacopo Mondi Date: Mon, 15 Dec 2025 13:08:12 +0100 Subject: media: uapi: mali-c55-config: Remove version identifier The Mali C55 driver uses the v4l2-isp framework, which defines its own versioning number which does not need to be defined again in each platform-specific header. Remove the definition of mali_c55_param_buffer_version enumeration from the Mali C55 uAPI header. Signed-off-by: Jacopo Mondi Reviewed-by: Daniel Scally Signed-off-by: Hans Verkuil --- include/uapi/linux/media/arm/mali-c55-config.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/media/arm/mali-c55-config.h b/include/uapi/linux/media/arm/mali-c55-config.h index 109082c5694f..3d335f950eeb 100644 --- a/include/uapi/linux/media/arm/mali-c55-config.h +++ b/include/uapi/linux/media/arm/mali-c55-config.h @@ -194,15 +194,6 @@ struct mali_c55_stats_buffer { __u32 reserved3[15]; } __attribute__((packed)); -/** - * enum mali_c55_param_buffer_version - Mali-C55 parameters block versioning - * - * @MALI_C55_PARAM_BUFFER_V1: First version of Mali-C55 parameters block - */ -enum mali_c55_param_buffer_version { - MALI_C55_PARAM_BUFFER_V1, -}; - /** * enum mali_c55_param_block_type - Enumeration of Mali-C55 parameter blocks * -- cgit v1.2.3 From 14adddc65340f2034751c95616861c0e888e2bb1 Mon Sep 17 00:00:00 2001 From: Sebastian Reichel Date: Tue, 14 Oct 2025 18:00:57 +0200 Subject: drm/bridge: dw-hdmi-qp: Fix spurious IRQ on resume After resume from suspend to RAM, the following splash is generated if the HDMI driver is probed (independent of a connected cable): [ 1194.484052] irq 80: nobody cared (try booting with the "irqpoll" option) [ 1194.484074] CPU: 0 UID: 0 PID: 627 Comm: rtcwake Not tainted 6.17.0-rc7-g96f1a11414b3 #1 PREEMPT [ 1194.484082] Hardware name: Rockchip RK3576 EVB V10 Board (DT) [ 1194.484085] Call trace: [ 1194.484087] ... (stripped) [ 1194.484283] handlers: [ 1194.484285] [<00000000bc363dcb>] dw_hdmi_qp_main_hardirq [dw_hdmi_qp] [ 1194.484302] Disabling IRQ #80 Apparently the HDMI IP is losing part of its state while the system is suspended and generates spurious interrupts during resume. The bug has not yet been noticed, as system suspend does not yet work properly on upstream kernel with either the Rockchip RK3588 or RK3576 platform. Fixes: 128a9bf8ace2 ("drm/rockchip: Add basic RK3588 HDMI output support") Signed-off-by: Sebastian Reichel Reviewed-by: Cristian Ciocaltea Signed-off-by: Heiko Stuebner Link: https://patch.msgid.link/20251014-rockchip-hdmi-suspend-fix-v1-1-983fcbf44839@collabora.com --- drivers/gpu/drm/bridge/synopsys/dw-hdmi-qp.c | 9 +++++++++ drivers/gpu/drm/rockchip/dw_hdmi_qp-rockchip.c | 12 +++++++++++- include/drm/bridge/dw_hdmi_qp.h | 1 + 3 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/gpu/drm/bridge/synopsys/dw-hdmi-qp.c b/drivers/gpu/drm/bridge/synopsys/dw-hdmi-qp.c index fe4c026280f0..60166919c5b5 100644 --- a/drivers/gpu/drm/bridge/synopsys/dw-hdmi-qp.c +++ b/drivers/gpu/drm/bridge/synopsys/dw-hdmi-qp.c @@ -163,6 +163,7 @@ struct dw_hdmi_qp { unsigned long ref_clk_rate; struct regmap *regm; + int main_irq; unsigned long tmds_char_rate; }; @@ -1271,6 +1272,7 @@ struct dw_hdmi_qp *dw_hdmi_qp_bind(struct platform_device *pdev, dw_hdmi_qp_init_hw(hdmi); + hdmi->main_irq = plat_data->main_irq; ret = devm_request_threaded_irq(dev, plat_data->main_irq, dw_hdmi_qp_main_hardirq, NULL, IRQF_SHARED, dev_name(dev), hdmi); @@ -1331,9 +1333,16 @@ struct dw_hdmi_qp *dw_hdmi_qp_bind(struct platform_device *pdev, } EXPORT_SYMBOL_GPL(dw_hdmi_qp_bind); +void dw_hdmi_qp_suspend(struct device *dev, struct dw_hdmi_qp *hdmi) +{ + disable_irq(hdmi->main_irq); +} +EXPORT_SYMBOL_GPL(dw_hdmi_qp_suspend); + void dw_hdmi_qp_resume(struct device *dev, struct dw_hdmi_qp *hdmi) { dw_hdmi_qp_init_hw(hdmi); + enable_irq(hdmi->main_irq); } EXPORT_SYMBOL_GPL(dw_hdmi_qp_resume); diff --git a/drivers/gpu/drm/rockchip/dw_hdmi_qp-rockchip.c b/drivers/gpu/drm/rockchip/dw_hdmi_qp-rockchip.c index c9fe6aa3e3e3..6e39e8a00774 100644 --- a/drivers/gpu/drm/rockchip/dw_hdmi_qp-rockchip.c +++ b/drivers/gpu/drm/rockchip/dw_hdmi_qp-rockchip.c @@ -640,6 +640,15 @@ static void dw_hdmi_qp_rockchip_remove(struct platform_device *pdev) component_del(&pdev->dev, &dw_hdmi_qp_rockchip_ops); } +static int __maybe_unused dw_hdmi_qp_rockchip_suspend(struct device *dev) +{ + struct rockchip_hdmi_qp *hdmi = dev_get_drvdata(dev); + + dw_hdmi_qp_suspend(dev, hdmi->hdmi); + + return 0; +} + static int __maybe_unused dw_hdmi_qp_rockchip_resume(struct device *dev) { struct rockchip_hdmi_qp *hdmi = dev_get_drvdata(dev); @@ -655,7 +664,8 @@ static int __maybe_unused dw_hdmi_qp_rockchip_resume(struct device *dev) } static const struct dev_pm_ops dw_hdmi_qp_rockchip_pm = { - SET_SYSTEM_SLEEP_PM_OPS(NULL, dw_hdmi_qp_rockchip_resume) + SET_SYSTEM_SLEEP_PM_OPS(dw_hdmi_qp_rockchip_suspend, + dw_hdmi_qp_rockchip_resume) }; struct platform_driver dw_hdmi_qp_rockchip_pltfm_driver = { diff --git a/include/drm/bridge/dw_hdmi_qp.h b/include/drm/bridge/dw_hdmi_qp.h index 3f461f6b9bbf..3af12f82da2c 100644 --- a/include/drm/bridge/dw_hdmi_qp.h +++ b/include/drm/bridge/dw_hdmi_qp.h @@ -34,5 +34,6 @@ struct dw_hdmi_qp_plat_data { struct dw_hdmi_qp *dw_hdmi_qp_bind(struct platform_device *pdev, struct drm_encoder *encoder, const struct dw_hdmi_qp_plat_data *plat_data); +void dw_hdmi_qp_suspend(struct device *dev, struct dw_hdmi_qp *hdmi); void dw_hdmi_qp_resume(struct device *dev, struct dw_hdmi_qp *hdmi); #endif /* __DW_HDMI_QP__ */ -- cgit v1.2.3 From 2740ac33c87b3d0dfa022efd6ba04c6261b1abbd Mon Sep 17 00:00:00 2001 From: Johannes Brüderl Date: Sun, 7 Dec 2025 10:02:20 +0100 Subject: usb: core: add USB_QUIRK_NO_BOS for devices that hang on BOS descriptor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add USB_QUIRK_NO_BOS quirk flag to skip requesting the BOS descriptor for devices that cannot handle it. Add Elgato 4K X (0fd9:009b) to the quirk table. This device hangs when the BOS descriptor is requested at SuperSpeed Plus (10Gbps). Link: https://bugzilla.kernel.org/show_bug.cgi?id=220027 Cc: stable Signed-off-by: Johannes Brüderl Link: https://patch.msgid.link/20251207090220.14807-1-johannes.bruederl@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 5 +++++ drivers/usb/core/quirks.c | 3 +++ include/linux/usb/quirks.h | 3 +++ 3 files changed, 11 insertions(+) (limited to 'include') diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index baf5bc844b6f..2bb1ceb9d621 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -1040,6 +1040,11 @@ int usb_get_bos_descriptor(struct usb_device *dev) __u8 cap_type; int ret; + if (dev->quirks & USB_QUIRK_NO_BOS) { + dev_dbg(ddev, "skipping BOS descriptor\n"); + return -ENOMSG; + } + bos = kzalloc(sizeof(*bos), GFP_KERNEL); if (!bos) return -ENOMEM; diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 47f589c4104a..c4d85089d19b 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -450,6 +450,9 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x0c45, 0x7056), .driver_info = USB_QUIRK_IGNORE_REMOTE_WAKEUP }, + /* Elgato 4K X - BOS descriptor fetch hangs at SuperSpeed Plus */ + { USB_DEVICE(0x0fd9, 0x009b), .driver_info = USB_QUIRK_NO_BOS }, + /* Sony Xperia XZ1 Compact (lilac) smartphone in fastboot mode */ { USB_DEVICE(0x0fce, 0x0dde), .driver_info = USB_QUIRK_NO_LPM }, diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h index 59409c1fc3de..2f7bd2fdc616 100644 --- a/include/linux/usb/quirks.h +++ b/include/linux/usb/quirks.h @@ -75,4 +75,7 @@ /* short SET_ADDRESS request timeout */ #define USB_QUIRK_SHORT_SET_ADDRESS_REQ_TIMEOUT BIT(16) +/* skip BOS descriptor request */ +#define USB_QUIRK_NO_BOS BIT(17) + #endif /* __LINUX_USB_QUIRKS_H */ -- cgit v1.2.3 From ef56578274d2b98423c8ef82bb450223f5811b59 Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Wed, 7 Jan 2026 17:59:41 +0100 Subject: cgroup: Eliminate cgrp_ancestor_storage in cgroup_root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cgrp_ancestor_storage has two drawbacks: - it's not guaranteed that the member immediately follows struct cgrp in cgroup_root (root cgroup's ancestors[0] might thus point to a padding and not in cgrp_ancestor_storage proper), - this idiom raises warnings with -Wflex-array-member-not-at-end. Instead of relying on the auxiliary member in cgroup_root, define the 0-th level ancestor inside struct cgroup (needed for static allocation of cgrp_dfl_root), deeper cgroups would allocate flexible _low_ancestors[]. Unionized alias through ancestors[] will transparently join the two ranges. The above change would still leave the flexible array at the end of struct cgroup inside cgroup_root, so move cgrp also towards the end of cgroup_root to resolve the -Wflex-array-member-not-at-end. Link: https://lore.kernel.org/r/5fb74444-2fbb-476e-b1bf-3f3e279d0ced@embeddedor.com/ Reported-by: Gustavo A. R. Silva Closes: https://lore.kernel.org/r/b3eb050d-9451-4b60-b06c-ace7dab57497@embeddedor.com/ Cc: David Laight Acked-by: Gustavo A. R. Silva Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 25 ++++++++++++++----------- kernel/cgroup/cgroup.c | 2 +- 2 files changed, 15 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index b760a3c470a5..f7cc60de0058 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -626,7 +626,13 @@ struct cgroup { #endif /* All ancestors including self */ - struct cgroup *ancestors[]; + union { + DECLARE_FLEX_ARRAY(struct cgroup *, ancestors); + struct { + struct cgroup *_root_ancestor; + DECLARE_FLEX_ARRAY(struct cgroup *, _low_ancestors); + }; + }; }; /* @@ -647,16 +653,6 @@ struct cgroup_root { struct list_head root_list; struct rcu_head rcu; /* Must be near the top */ - /* - * The root cgroup. The containing cgroup_root will be destroyed on its - * release. cgrp->ancestors[0] will be used overflowing into the - * following field. cgrp_ancestor_storage must immediately follow. - */ - struct cgroup cgrp; - - /* must follow cgrp for cgrp->ancestors[0], see above */ - struct cgroup *cgrp_ancestor_storage; - /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; @@ -668,6 +664,13 @@ struct cgroup_root { /* The name for this hierarchy - may be empty */ char name[MAX_CGROUP_ROOT_NAMELEN]; + + /* + * The root cgroup. The containing cgroup_root will be destroyed on its + * release. This must be embedded last due to flexible array at the end + * of struct cgroup. + */ + struct cgroup cgrp; }; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e717208cfb18..554a02ee298b 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5847,7 +5847,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL); + cgrp = kzalloc(struct_size(cgrp, _low_ancestors, level), GFP_KERNEL); if (!cgrp) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 47c27c9c9c720bc93fdc69605d0ecd9382e99047 Mon Sep 17 00:00:00 2001 From: Jaroslav Kysela Date: Wed, 7 Jan 2026 22:36:42 +0100 Subject: ALSA: pcm: Improve the fix for race of buffer access at PCM OSS layer Handle the error code from snd_pcm_buffer_access_lock() in snd_pcm_runtime_buffer_set_silence() function. Found by Alexandros Panagiotou Fixes: 93a81ca06577 ("ALSA: pcm: Fix race of buffer access at PCM OSS layer") Cc: stable@vger.kernel.org # 6.15 Signed-off-by: Jaroslav Kysela Link: https://patch.msgid.link/20260107213642.332954-1-perex@perex.cz Signed-off-by: Takashi Iwai --- include/sound/pcm.h | 2 +- sound/core/oss/pcm_oss.c | 4 +++- sound/core/pcm_native.c | 9 +++++++-- 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/sound/pcm.h b/include/sound/pcm.h index 58fd6e84f961..a7860c047503 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -1402,7 +1402,7 @@ int snd_pcm_lib_mmap_iomem(struct snd_pcm_substream *substream, struct vm_area_s #define snd_pcm_lib_mmap_iomem NULL #endif -void snd_pcm_runtime_buffer_set_silence(struct snd_pcm_runtime *runtime); +int snd_pcm_runtime_buffer_set_silence(struct snd_pcm_runtime *runtime); /** * snd_pcm_limit_isa_dma_size - Get the max size fitting with ISA DMA transfer diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index a82dd155e1d3..b12df5b5ddfc 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -1074,7 +1074,9 @@ static int snd_pcm_oss_change_params_locked(struct snd_pcm_substream *substream) runtime->oss.params = 0; runtime->oss.prepare = 1; runtime->oss.buffer_used = 0; - snd_pcm_runtime_buffer_set_silence(runtime); + err = snd_pcm_runtime_buffer_set_silence(runtime); + if (err < 0) + goto failure; runtime->oss.period_frames = snd_pcm_alsa_frames(substream, oss_period_size); diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 68bee40c9ada..932a9bf98cbc 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -730,13 +730,18 @@ static void snd_pcm_buffer_access_unlock(struct snd_pcm_runtime *runtime) } /* fill the PCM buffer with the current silence format; called from pcm_oss.c */ -void snd_pcm_runtime_buffer_set_silence(struct snd_pcm_runtime *runtime) +int snd_pcm_runtime_buffer_set_silence(struct snd_pcm_runtime *runtime) { - snd_pcm_buffer_access_lock(runtime); + int err; + + err = snd_pcm_buffer_access_lock(runtime); + if (err < 0) + return err; if (runtime->dma_area) snd_pcm_format_set_silence(runtime->format, runtime->dma_area, bytes_to_samples(runtime, runtime->dma_bytes)); snd_pcm_buffer_access_unlock(runtime); + return 0; } EXPORT_SYMBOL_GPL(snd_pcm_runtime_buffer_set_silence); -- cgit v1.2.3 From 54b603f2db6b95495bc33a8f2bde80f044baff9a Mon Sep 17 00:00:00 2001 From: Yaxiong Tian Date: Tue, 30 Dec 2025 14:15:34 +0800 Subject: PM: EM: Fix incorrect description of the cost field in struct em_perf_state Due to commit 1b600da51073 ("PM: EM: Optimize em_cpu_energy() and remove division"), the logic for energy consumption calculation has been modified. The actual calculation of cost is 10 * power * max_frequency / frequency instead of power * max_frequency / frequency. Therefore, the comment for cost has been updated to reflect the correct content. Fixes: 1b600da51073 ("PM: EM: Optimize em_cpu_energy() and remove division") Signed-off-by: Yaxiong Tian Reviewed-by: Lukasz Luba [ rjw: Added Fixes: tag ] Link: https://patch.msgid.link/20251230061534.816894-1-tianyaxiong@kylinos.cn Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 43aa6153dc57..e7497f804644 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -18,7 +18,7 @@ * @power: The power consumed at this level (by 1 CPU or by a registered * device). It can be a total power: static and dynamic. * @cost: The cost coefficient associated with this level, used during - * energy calculation. Equal to: power * max_frequency / frequency + * energy calculation. Equal to: 10 * power * max_frequency / frequency * @flags: see "em_perf_state flags" description below. */ struct em_perf_state { -- cgit v1.2.3 From 872ac785e7680dac9ec7f8c5ccd4f667f49d6997 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 6 Jan 2026 17:24:26 +0000 Subject: ipv4: ip_tunnel: spread netdev_lockdep_set_classes() Inspired by yet another syzbot report. IPv6 tunnels call netdev_lockdep_set_classes() for each tunnel type, while IPv4 currently centralizes netdev_lockdep_set_classes() call from ip_tunnel_init(). Make ip_tunnel_init() a macro, so that we have different lockdep classes per tunnel type. Fixes: 0bef512012b1 ("net: add netdev_lockdep_set_classes() to virtual drivers") Reported-by: syzbot+1240b33467289f5ab50b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/695d439f.050a0220.1c677c.0347.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20260106172426.1760721-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip_tunnels.h | 13 ++++++++++++- net/ipv4/ip_tunnel.c | 5 ++--- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index ecae35512b9b..4021e6a73e32 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -19,6 +19,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -372,7 +373,17 @@ static inline void ip_tunnel_init_flow(struct flowi4 *fl4, fl4->flowi4_flags = flow_flags; } -int ip_tunnel_init(struct net_device *dev); +int __ip_tunnel_init(struct net_device *dev); +#define ip_tunnel_init(DEV) \ +({ \ + struct net_device *__dev = (DEV); \ + int __res = __ip_tunnel_init(__dev); \ + \ + if (!__res) \ + netdev_lockdep_set_classes(__dev);\ + __res; \ +}) + void ip_tunnel_uninit(struct net_device *dev); void ip_tunnel_dellink(struct net_device *dev, struct list_head *head); struct net *ip_tunnel_get_link_net(const struct net_device *dev); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 158a30ae7c5f..50d0f5fe4e4c 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1281,7 +1281,7 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], } EXPORT_SYMBOL_GPL(ip_tunnel_changelink); -int ip_tunnel_init(struct net_device *dev) +int __ip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; @@ -1308,10 +1308,9 @@ int ip_tunnel_init(struct net_device *dev) if (tunnel->collect_md) netif_keep_dst(dev); - netdev_lockdep_set_classes(dev); return 0; } -EXPORT_SYMBOL_GPL(ip_tunnel_init); +EXPORT_SYMBOL_GPL(__ip_tunnel_init); void ip_tunnel_uninit(struct net_device *dev) { -- cgit v1.2.3 From a8f49a0043011c3dd12998a6c700bb59d5365c20 Mon Sep 17 00:00:00 2001 From: "Chia-Lin Kao (AceLan)" Date: Thu, 4 Dec 2025 10:46:47 +0800 Subject: drm/dp: Add byte-by-byte fallback for broken USB-C adapters Some USB-C hubs and adapters have buggy firmware where multi-byte AUX reads consistently timeout, while single-byte reads from the same address work correctly. Known affected devices that exhibit this issue: - Lenovo USB-C to VGA adapter (VIA VL817 chipset) idVendor=17ef, idProduct=7217 - Dell DA310 USB-C mobile adapter hub idVendor=413c, idProduct=c010 Analysis of the failure pattern shows: - Single-byte probes to 0xf0000 (LTTPR) succeed - Single-byte probes to 0x00102 (TRAINING_AUX_RD_INTERVAL) succeed - Multi-byte reads from 0x00000 (DPCD capabilities) timeout with -ETIMEDOUT - Retrying does not help - the failure is consistent across all attempts The issue appears to be a firmware bug in the AUX transaction handling that specifically affects multi-byte reads. Add a fallback mechanism in drm_dp_dpcd_read_data() that attempts byte-by-byte reading when the normal multi-byte read fails. This workaround only activates for adapters that fail the standard read path, ensuring no impact on correctly functioning hardware. Tested with: - Lenovo USB-C to VGA adapter (VIA VL817) - now works with fallback - Dell DA310 USB-C hub - now works with fallback - Dell/Analogix Slimport adapter - continues to work with normal path Signed-off-by: Chia-Lin Kao (AceLan) Reviewed-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/20251204024647.1462866-1-acelan.kao@canonical.com Signed-off-by: Mario Limonciello (AMD) --- include/drm/display/drm_dp_helper.h | 57 ++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/drm/display/drm_dp_helper.h b/include/drm/display/drm_dp_helper.h index df2f24b950e4..14d2859f0bda 100644 --- a/include/drm/display/drm_dp_helper.h +++ b/include/drm/display/drm_dp_helper.h @@ -551,6 +551,22 @@ ssize_t drm_dp_dpcd_read(struct drm_dp_aux *aux, unsigned int offset, ssize_t drm_dp_dpcd_write(struct drm_dp_aux *aux, unsigned int offset, void *buffer, size_t size); +/** + * drm_dp_dpcd_readb() - read a single byte from the DPCD + * @aux: DisplayPort AUX channel + * @offset: address of the register to read + * @valuep: location where the value of the register will be stored + * + * Returns the number of bytes transferred (1) on success, or a negative + * error code on failure. In most of the cases you should be using + * drm_dp_dpcd_read_byte() instead. + */ +static inline ssize_t drm_dp_dpcd_readb(struct drm_dp_aux *aux, + unsigned int offset, u8 *valuep) +{ + return drm_dp_dpcd_read(aux, offset, valuep, 1); +} + /** * drm_dp_dpcd_read_data() - read a series of bytes from the DPCD * @aux: DisplayPort AUX channel (SST or MST) @@ -570,12 +586,29 @@ static inline int drm_dp_dpcd_read_data(struct drm_dp_aux *aux, void *buffer, size_t size) { int ret; + size_t i; + u8 *buf = buffer; ret = drm_dp_dpcd_read(aux, offset, buffer, size); - if (ret < 0) - return ret; - if (ret < size) - return -EPROTO; + if (ret >= 0) { + if (ret < size) + return -EPROTO; + return 0; + } + + /* + * Workaround for USB-C hubs/adapters with buggy firmware that fail + * multi-byte AUX reads but work with single-byte reads. + * Known affected devices: + * - Lenovo USB-C to VGA adapter (VIA VL817, idVendor=17ef, idProduct=7217) + * - Dell DA310 USB-C hub (idVendor=413c, idProduct=c010) + * Attempt byte-by-byte reading as a fallback. + */ + for (i = 0; i < size; i++) { + ret = drm_dp_dpcd_readb(aux, offset + i, &buf[i]); + if (ret < 0) + return ret; + } return 0; } @@ -609,22 +642,6 @@ static inline int drm_dp_dpcd_write_data(struct drm_dp_aux *aux, return 0; } -/** - * drm_dp_dpcd_readb() - read a single byte from the DPCD - * @aux: DisplayPort AUX channel - * @offset: address of the register to read - * @valuep: location where the value of the register will be stored - * - * Returns the number of bytes transferred (1) on success, or a negative - * error code on failure. In most of the cases you should be using - * drm_dp_dpcd_read_byte() instead. - */ -static inline ssize_t drm_dp_dpcd_readb(struct drm_dp_aux *aux, - unsigned int offset, u8 *valuep) -{ - return drm_dp_dpcd_read(aux, offset, valuep, 1); -} - /** * drm_dp_dpcd_writeb() - write a single byte to the DPCD * @aux: DisplayPort AUX channel -- cgit v1.2.3 From caa07a815d6ee32586beb66f67e7e3c103a02efd Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Thu, 8 Jan 2026 14:32:10 +0900 Subject: PM: EM: Rename em.yaml to dev-energymodel.yaml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The EM YNL specification used many acronyms, including ‘em’, ‘pd’, ‘ps’, etc. While the acronyms are short and convenient, they could be confusing. So, let’s spell them out to be more specific. The following changes were made in the spec. Note that the protocol name cannot exceed GENL_NAMSIZ (16). em -> dev-energymodel pds -> perf-domains pd -> perf-domain pd-id -> perf-domain-id pd-table -> perf-table ps -> perf-state get-pds -> get-perf-domains get-pd-table -> get-perf-table pd-created -> perf-domain-created pd-updated -> perf-domain-updated pd-deleted -> perf-domain-deleted In addition. doc strings were added to the spec. based on the comments in energy_model.h. Two flag attributes (perf-state-flags and perf-domain-flags) were added for easily interpreting the bit flags. Finally, the autogenerated files and em_netlink.c were updated accordingly to reflect the name changes. Suggested-by: Donald Hunter Reviewed-by: Lukasz Luba Reviewed-by: Donald Hunter Signed-off-by: Changwoo Min Link: https://patch.msgid.link/20260108053212.642478-3-changwoo@igalia.com Signed-off-by: Rafael J. Wysocki --- Documentation/netlink/specs/dev-energymodel.yaml | 175 +++++++++++++++++++++++ Documentation/netlink/specs/em.yaml | 116 --------------- MAINTAINERS | 8 +- include/uapi/linux/dev_energymodel.h | 89 ++++++++++++ include/uapi/linux/energy_model.h | 63 -------- kernel/power/em_netlink.c | 135 ++++++++++------- kernel/power/em_netlink_autogen.c | 44 +++--- kernel/power/em_netlink_autogen.h | 20 +-- 8 files changed, 384 insertions(+), 266 deletions(-) create mode 100644 Documentation/netlink/specs/dev-energymodel.yaml delete mode 100644 Documentation/netlink/specs/em.yaml create mode 100644 include/uapi/linux/dev_energymodel.h delete mode 100644 include/uapi/linux/energy_model.h (limited to 'include') diff --git a/Documentation/netlink/specs/dev-energymodel.yaml b/Documentation/netlink/specs/dev-energymodel.yaml new file mode 100644 index 000000000000..cbc4bc38f23c --- /dev/null +++ b/Documentation/netlink/specs/dev-energymodel.yaml @@ -0,0 +1,175 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +# +# Copyright (c) 2025 Valve Corporation. +# +--- +name: dev-energymodel + +doc: | + Energy model netlink interface to notify its changes. + +protocol: genetlink + +uapi-header: linux/dev_energymodel.h + +definitions: + - + type: flags + name: perf-state-flags + entries: + - + name: perf-state-inefficient + doc: >- + The performance state is inefficient. There is in this perf-domain, + another performance state with a higher frequency but a lower or + equal power cost. + - + type: flags + name: perf-domain-flags + entries: + - + name: perf-domain-microwatts + doc: >- + The power values are in micro-Watts or some other scale. + - + name: perf-domain-skip-inefficiencies + doc: >- + Skip inefficient states when estimating energy consumption. + - + name: perf-domain-artificial + doc: >- + The power values are artificial and might be created by platform + missing real power information. + +attribute-sets: + - + name: perf-domains + doc: >- + Information on all the performance domains. + attributes: + - + name: perf-domain + type: nest + nested-attributes: perf-domain + multi-attr: true + - + name: perf-domain + doc: >- + Information on a single performance domains. + attributes: + - + name: pad + type: pad + - + name: perf-domain-id + type: u32 + doc: >- + A unique ID number for each performance domain. + - + name: flags + type: u64 + doc: >- + Bitmask of performance domain flags. + enum: perf-domain-flags + - + name: cpus + type: string + doc: >- + CPUs that belong to this performance domain. + - + name: perf-table + doc: >- + Performance states table. + attributes: + - + name: perf-domain-id + type: u32 + doc: >- + A unique ID number for each performance domain. + - + name: perf-state + type: nest + nested-attributes: perf-state + multi-attr: true + - + name: perf-state + doc: >- + Performance state of a performance domain. + attributes: + - + name: pad + type: pad + - + name: performance + type: u64 + doc: >- + CPU performance (capacity) at a given frequency. + - + name: frequency + type: u64 + doc: >- + The frequency in KHz, for consistency with CPUFreq. + - + name: power + type: u64 + doc: >- + The power consumed at this level (by 1 CPU or by a registered + device). It can be a total power: static and dynamic. + - + name: cost + type: u64 + doc: >- + The cost coefficient associated with this level, used during energy + calculation. Equal to: power * max_frequency / frequency. + - + name: flags + type: u64 + doc: >- + Bitmask of performance state flags. + enum: perf-state-flags + +operations: + list: + - + name: get-perf-domains + attribute-set: perf-domains + doc: Get the list of information for all performance domains. + do: + reply: + attributes: + - perf-domain + - + name: get-perf-table + attribute-set: perf-table + doc: Get the energy model table of a performance domain. + do: + request: + attributes: + - perf-domain-id + reply: + attributes: + - perf-domain-id + - perf-state + - + name: perf-domain-created + doc: A performance domain is created. + notify: get-perf-table + mcgrp: event + - + name: perf-domain-updated + doc: A performance domain is updated. + notify: get-perf-table + mcgrp: event + - + name: perf-domain-deleted + doc: A performance domain is deleted. + attribute-set: perf-table + event: + attributes: + - perf-domain-id + mcgrp: event + +mcast-groups: + list: + - + name: event diff --git a/Documentation/netlink/specs/em.yaml b/Documentation/netlink/specs/em.yaml deleted file mode 100644 index 0c595a874f08..000000000000 --- a/Documentation/netlink/specs/em.yaml +++ /dev/null @@ -1,116 +0,0 @@ -# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) -# -# Copyright (c) 2025 Valve Corporation. -# ---- -name: em - -doc: | - Energy model netlink interface to notify its changes. - -protocol: genetlink - -uapi-header: linux/energy_model.h - -attribute-sets: - - - name: pds - attributes: - - - name: pd - type: nest - nested-attributes: pd - multi-attr: true - - - name: pd - attributes: - - - name: pad - type: pad - - - name: pd-id - type: u32 - - - name: flags - type: u64 - - - name: cpus - type: string - - - name: pd-table - attributes: - - - name: pd-id - type: u32 - - - name: ps - type: nest - nested-attributes: ps - multi-attr: true - - - name: ps - attributes: - - - name: pad - type: pad - - - name: performance - type: u64 - - - name: frequency - type: u64 - - - name: power - type: u64 - - - name: cost - type: u64 - - - name: flags - type: u64 - -operations: - list: - - - name: get-pds - attribute-set: pds - doc: Get the list of information for all performance domains. - do: - reply: - attributes: - - pd - - - name: get-pd-table - attribute-set: pd-table - doc: Get the energy model table of a performance domain. - do: - request: - attributes: - - pd-id - reply: - attributes: - - pd-id - - ps - - - name: pd-created - doc: A performance domain is created. - notify: get-pd-table - mcgrp: event - - - name: pd-updated - doc: A performance domain is updated. - notify: get-pd-table - mcgrp: event - - - name: pd-deleted - doc: A performance domain is deleted. - attribute-set: pd-table - event: - attributes: - - pd-id - mcgrp: event - -mcast-groups: - list: - - - name: event diff --git a/MAINTAINERS b/MAINTAINERS index 765ad2daa218..1e208243b28e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9298,12 +9298,12 @@ M: Lukasz Luba M: "Rafael J. Wysocki" L: linux-pm@vger.kernel.org S: Maintained -F: kernel/power/energy_model.c -F: include/linux/energy_model.h +F: Documentation/netlink/specs/dev-energymodel.yaml F: Documentation/power/energy-model.rst -F: Documentation/netlink/specs/em.yaml -F: include/uapi/linux/energy_model.h +F: include/linux/energy_model.h +F: include/uapi/linux/dev_energymodel.h F: kernel/power/em_netlink*.* +F: kernel/power/energy_model.c EPAPR HYPERVISOR BYTE CHANNEL DEVICE DRIVER M: Laurentiu Tudor diff --git a/include/uapi/linux/dev_energymodel.h b/include/uapi/linux/dev_energymodel.h new file mode 100644 index 000000000000..3399967e1f93 --- /dev/null +++ b/include/uapi/linux/dev_energymodel.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/dev-energymodel.yaml */ +/* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ + +#ifndef _UAPI_LINUX_DEV_ENERGYMODEL_H +#define _UAPI_LINUX_DEV_ENERGYMODEL_H + +#define DEV_ENERGYMODEL_FAMILY_NAME "dev-energymodel" +#define DEV_ENERGYMODEL_FAMILY_VERSION 1 + +/** + * enum dev_energymodel_perf_state_flags + * @DEV_ENERGYMODEL_PERF_STATE_FLAGS_PERF_STATE_INEFFICIENT: The performance + * state is inefficient. There is in this perf-domain, another performance + * state with a higher frequency but a lower or equal power cost. + */ +enum dev_energymodel_perf_state_flags { + DEV_ENERGYMODEL_PERF_STATE_FLAGS_PERF_STATE_INEFFICIENT = 1, +}; + +/** + * enum dev_energymodel_perf_domain_flags + * @DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_MICROWATTS: The power values + * are in micro-Watts or some other scale. + * @DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip + * inefficient states when estimating energy consumption. + * @DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_ARTIFICIAL: The power values + * are artificial and might be created by platform missing real power + * information. + */ +enum dev_energymodel_perf_domain_flags { + DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_MICROWATTS = 1, + DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_SKIP_INEFFICIENCIES = 2, + DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_ARTIFICIAL = 4, +}; + +enum { + DEV_ENERGYMODEL_A_PERF_DOMAINS_PERF_DOMAIN = 1, + + __DEV_ENERGYMODEL_A_PERF_DOMAINS_MAX, + DEV_ENERGYMODEL_A_PERF_DOMAINS_MAX = (__DEV_ENERGYMODEL_A_PERF_DOMAINS_MAX - 1) +}; + +enum { + DEV_ENERGYMODEL_A_PERF_DOMAIN_PAD = 1, + DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID, + DEV_ENERGYMODEL_A_PERF_DOMAIN_FLAGS, + DEV_ENERGYMODEL_A_PERF_DOMAIN_CPUS, + + __DEV_ENERGYMODEL_A_PERF_DOMAIN_MAX, + DEV_ENERGYMODEL_A_PERF_DOMAIN_MAX = (__DEV_ENERGYMODEL_A_PERF_DOMAIN_MAX - 1) +}; + +enum { + DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID = 1, + DEV_ENERGYMODEL_A_PERF_TABLE_PERF_STATE, + + __DEV_ENERGYMODEL_A_PERF_TABLE_MAX, + DEV_ENERGYMODEL_A_PERF_TABLE_MAX = (__DEV_ENERGYMODEL_A_PERF_TABLE_MAX - 1) +}; + +enum { + DEV_ENERGYMODEL_A_PERF_STATE_PAD = 1, + DEV_ENERGYMODEL_A_PERF_STATE_PERFORMANCE, + DEV_ENERGYMODEL_A_PERF_STATE_FREQUENCY, + DEV_ENERGYMODEL_A_PERF_STATE_POWER, + DEV_ENERGYMODEL_A_PERF_STATE_COST, + DEV_ENERGYMODEL_A_PERF_STATE_FLAGS, + + __DEV_ENERGYMODEL_A_PERF_STATE_MAX, + DEV_ENERGYMODEL_A_PERF_STATE_MAX = (__DEV_ENERGYMODEL_A_PERF_STATE_MAX - 1) +}; + +enum { + DEV_ENERGYMODEL_CMD_GET_PERF_DOMAINS = 1, + DEV_ENERGYMODEL_CMD_GET_PERF_TABLE, + DEV_ENERGYMODEL_CMD_PERF_DOMAIN_CREATED, + DEV_ENERGYMODEL_CMD_PERF_DOMAIN_UPDATED, + DEV_ENERGYMODEL_CMD_PERF_DOMAIN_DELETED, + + __DEV_ENERGYMODEL_CMD_MAX, + DEV_ENERGYMODEL_CMD_MAX = (__DEV_ENERGYMODEL_CMD_MAX - 1) +}; + +#define DEV_ENERGYMODEL_MCGRP_EVENT "event" + +#endif /* _UAPI_LINUX_DEV_ENERGYMODEL_H */ diff --git a/include/uapi/linux/energy_model.h b/include/uapi/linux/energy_model.h deleted file mode 100644 index 0bcad967854f..000000000000 --- a/include/uapi/linux/energy_model.h +++ /dev/null @@ -1,63 +0,0 @@ -/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ -/* Do not edit directly, auto-generated from: */ -/* Documentation/netlink/specs/em.yaml */ -/* YNL-GEN uapi header */ -/* To regenerate run: tools/net/ynl/ynl-regen.sh */ - -#ifndef _UAPI_LINUX_ENERGY_MODEL_H -#define _UAPI_LINUX_ENERGY_MODEL_H - -#define EM_FAMILY_NAME "em" -#define EM_FAMILY_VERSION 1 - -enum { - EM_A_PDS_PD = 1, - - __EM_A_PDS_MAX, - EM_A_PDS_MAX = (__EM_A_PDS_MAX - 1) -}; - -enum { - EM_A_PD_PAD = 1, - EM_A_PD_PD_ID, - EM_A_PD_FLAGS, - EM_A_PD_CPUS, - - __EM_A_PD_MAX, - EM_A_PD_MAX = (__EM_A_PD_MAX - 1) -}; - -enum { - EM_A_PD_TABLE_PD_ID = 1, - EM_A_PD_TABLE_PS, - - __EM_A_PD_TABLE_MAX, - EM_A_PD_TABLE_MAX = (__EM_A_PD_TABLE_MAX - 1) -}; - -enum { - EM_A_PS_PAD = 1, - EM_A_PS_PERFORMANCE, - EM_A_PS_FREQUENCY, - EM_A_PS_POWER, - EM_A_PS_COST, - EM_A_PS_FLAGS, - - __EM_A_PS_MAX, - EM_A_PS_MAX = (__EM_A_PS_MAX - 1) -}; - -enum { - EM_CMD_GET_PDS = 1, - EM_CMD_GET_PD_TABLE, - EM_CMD_PD_CREATED, - EM_CMD_PD_UPDATED, - EM_CMD_PD_DELETED, - - __EM_CMD_MAX, - EM_CMD_MAX = (__EM_CMD_MAX - 1) -}; - -#define EM_MCGRP_EVENT "event" - -#endif /* _UAPI_LINUX_ENERGY_MODEL_H */ diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c index 4b85da138a06..6f6238c465bb 100644 --- a/kernel/power/em_netlink.c +++ b/kernel/power/em_netlink.c @@ -12,27 +12,31 @@ #include #include #include -#include +#include #include "em_netlink.h" #include "em_netlink_autogen.h" -#define EM_A_PD_CPUS_LEN 256 +#define DEV_ENERGYMODEL_A_PERF_DOMAIN_CPUS_LEN 256 /*************************** Command encoding ********************************/ static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data) { - char cpus_buf[EM_A_PD_CPUS_LEN]; + char cpus_buf[DEV_ENERGYMODEL_A_PERF_DOMAIN_CPUS_LEN]; int *tot_msg_sz = data; int msg_sz, cpus_sz; cpus_sz = snprintf(cpus_buf, sizeof(cpus_buf), "%*pb", cpumask_pr_args(to_cpumask(pd->cpus))); - msg_sz = nla_total_size(0) + /* EM_A_PDS_PD */ - nla_total_size(sizeof(u32)) + /* EM_A_PD_PD_ID */ - nla_total_size_64bit(sizeof(u64)) + /* EM_A_PD_FLAGS */ - nla_total_size(cpus_sz); /* EM_A_PD_CPUS */ + msg_sz = nla_total_size(0) + + /* DEV_ENERGYMODEL_A_PERF_DOMAINS_PERF_DOMAIN */ + nla_total_size(sizeof(u32)) + + /* DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID */ + nla_total_size_64bit(sizeof(u64)) + + /* DEV_ENERGYMODEL_A_PERF_DOMAIN_FLAGS */ + nla_total_size(cpus_sz); + /* DEV_ENERGYMODEL_A_PERF_DOMAIN_CPUS */ *tot_msg_sz += nlmsg_total_size(genlmsg_msg_size(msg_sz)); return 0; @@ -40,23 +44,26 @@ static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data) static int __em_nl_get_pd(struct em_perf_domain *pd, void *data) { - char cpus_buf[EM_A_PD_CPUS_LEN]; + char cpus_buf[DEV_ENERGYMODEL_A_PERF_DOMAIN_CPUS_LEN]; struct sk_buff *msg = data; struct nlattr *entry; - entry = nla_nest_start(msg, EM_A_PDS_PD); + entry = nla_nest_start(msg, + DEV_ENERGYMODEL_A_PERF_DOMAINS_PERF_DOMAIN); if (!entry) goto out_cancel_nest; - if (nla_put_u32(msg, EM_A_PD_PD_ID, pd->id)) + if (nla_put_u32(msg, DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID, + pd->id)) goto out_cancel_nest; - if (nla_put_u64_64bit(msg, EM_A_PD_FLAGS, pd->flags, EM_A_PD_PAD)) + if (nla_put_u64_64bit(msg, DEV_ENERGYMODEL_A_PERF_DOMAIN_FLAGS, + pd->flags, DEV_ENERGYMODEL_A_PERF_DOMAIN_PAD)) goto out_cancel_nest; snprintf(cpus_buf, sizeof(cpus_buf), "%*pb", cpumask_pr_args(to_cpumask(pd->cpus))); - if (nla_put_string(msg, EM_A_PD_CPUS, cpus_buf)) + if (nla_put_string(msg, DEV_ENERGYMODEL_A_PERF_DOMAIN_CPUS, cpus_buf)) goto out_cancel_nest; nla_nest_end(msg, entry); @@ -69,7 +76,8 @@ out_cancel_nest: return -EMSGSIZE; } -int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info) +int dev_energymodel_nl_get_perf_domains_doit(struct sk_buff *skb, + struct genl_info *info) { struct sk_buff *msg; void *hdr; @@ -82,7 +90,7 @@ int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info) if (!msg) return -ENOMEM; - hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd); + hdr = genlmsg_put_reply(msg, info, &dev_energymodel_nl_family, 0, cmd); if (!hdr) goto out_free_msg; @@ -107,10 +115,10 @@ static struct em_perf_domain *__em_nl_get_pd_table_id(struct nlattr **attrs) struct em_perf_domain *pd; int id; - if (!attrs[EM_A_PD_TABLE_PD_ID]) + if (!attrs[DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID]) return NULL; - id = nla_get_u32(attrs[EM_A_PD_TABLE_PD_ID]); + id = nla_get_u32(attrs[DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID]); pd = em_perf_domain_get_by_id(id); return pd; } @@ -119,25 +127,34 @@ static int __em_nl_get_pd_table_size(const struct em_perf_domain *pd) { int id_sz, ps_sz; - id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ - ps_sz = nla_total_size(0) + /* EM_A_PD_TABLE_PS */ - nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_PERFORMANCE */ - nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_FREQUENCY */ - nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_POWER */ - nla_total_size_64bit(sizeof(u64)) + /* EM_A_PS_COST */ - nla_total_size_64bit(sizeof(u64)); /* EM_A_PS_FLAGS */ + id_sz = nla_total_size(sizeof(u32)); + /* DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID */ + ps_sz = nla_total_size(0) + + /* DEV_ENERGYMODEL_A_PERF_TABLE_PERF_STATE */ + nla_total_size_64bit(sizeof(u64)) + + /* DEV_ENERGYMODEL_A_PERF_STATE_PERFORMANCE */ + nla_total_size_64bit(sizeof(u64)) + + /* DEV_ENERGYMODEL_A_PERF_STATE_FREQUENCY */ + nla_total_size_64bit(sizeof(u64)) + + /* DEV_ENERGYMODEL_A_PERF_STATE_POWER */ + nla_total_size_64bit(sizeof(u64)) + + /* DEV_ENERGYMODEL_A_PERF_STATE_COST */ + nla_total_size_64bit(sizeof(u64)); + /* DEV_ENERGYMODEL_A_PERF_STATE_FLAGS */ ps_sz *= pd->nr_perf_states; return nlmsg_total_size(genlmsg_msg_size(id_sz + ps_sz)); } -static int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain *pd) +static +int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain *pd) { struct em_perf_state *table, *ps; struct nlattr *entry; int i; - if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) + if (nla_put_u32(msg, DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID, + pd->id)) goto out_err; rcu_read_lock(); @@ -146,24 +163,35 @@ static int __em_nl_get_pd_table(struct sk_buff *msg, const struct em_perf_domain for (i = 0; i < pd->nr_perf_states; i++) { ps = &table[i]; - entry = nla_nest_start(msg, EM_A_PD_TABLE_PS); + entry = nla_nest_start(msg, + DEV_ENERGYMODEL_A_PERF_TABLE_PERF_STATE); if (!entry) goto out_unlock_ps; - if (nla_put_u64_64bit(msg, EM_A_PS_PERFORMANCE, - ps->performance, EM_A_PS_PAD)) + if (nla_put_u64_64bit(msg, + DEV_ENERGYMODEL_A_PERF_STATE_PERFORMANCE, + ps->performance, + DEV_ENERGYMODEL_A_PERF_STATE_PAD)) goto out_cancel_ps_nest; - if (nla_put_u64_64bit(msg, EM_A_PS_FREQUENCY, - ps->frequency, EM_A_PS_PAD)) + if (nla_put_u64_64bit(msg, + DEV_ENERGYMODEL_A_PERF_STATE_FREQUENCY, + ps->frequency, + DEV_ENERGYMODEL_A_PERF_STATE_PAD)) goto out_cancel_ps_nest; - if (nla_put_u64_64bit(msg, EM_A_PS_POWER, - ps->power, EM_A_PS_PAD)) + if (nla_put_u64_64bit(msg, + DEV_ENERGYMODEL_A_PERF_STATE_POWER, + ps->power, + DEV_ENERGYMODEL_A_PERF_STATE_PAD)) goto out_cancel_ps_nest; - if (nla_put_u64_64bit(msg, EM_A_PS_COST, - ps->cost, EM_A_PS_PAD)) + if (nla_put_u64_64bit(msg, + DEV_ENERGYMODEL_A_PERF_STATE_COST, + ps->cost, + DEV_ENERGYMODEL_A_PERF_STATE_PAD)) goto out_cancel_ps_nest; - if (nla_put_u64_64bit(msg, EM_A_PS_FLAGS, - ps->flags, EM_A_PS_PAD)) + if (nla_put_u64_64bit(msg, + DEV_ENERGYMODEL_A_PERF_STATE_FLAGS, + ps->flags, + DEV_ENERGYMODEL_A_PERF_STATE_PAD)) goto out_cancel_ps_nest; nla_nest_end(msg, entry); @@ -179,7 +207,8 @@ out_err: return -EMSGSIZE; } -int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info) +int dev_energymodel_nl_get_perf_table_doit(struct sk_buff *skb, + struct genl_info *info) { int cmd = info->genlhdr->cmd; int msg_sz, ret = -EMSGSIZE; @@ -197,7 +226,7 @@ int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info) if (!msg) return -ENOMEM; - hdr = genlmsg_put_reply(msg, info, &em_nl_family, 0, cmd); + hdr = genlmsg_put_reply(msg, info, &dev_energymodel_nl_family, 0, cmd); if (!hdr) goto out_free_msg; @@ -221,7 +250,7 @@ static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type) int msg_sz, ret = -EMSGSIZE; void *hdr; - if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT)) + if (!genl_has_listeners(&dev_energymodel_nl_family, &init_net, DEV_ENERGYMODEL_NLGRP_EVENT)) return; msg_sz = __em_nl_get_pd_table_size(pd); @@ -230,7 +259,7 @@ static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type) if (!msg) return; - hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, ntf_type); + hdr = genlmsg_put(msg, 0, 0, &dev_energymodel_nl_family, 0, ntf_type); if (!hdr) goto out_free_msg; @@ -240,28 +269,28 @@ static void __em_notify_pd_table(const struct em_perf_domain *pd, int ntf_type) genlmsg_end(msg, hdr); - genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL); + genlmsg_multicast(&dev_energymodel_nl_family, msg, 0, + DEV_ENERGYMODEL_NLGRP_EVENT, GFP_KERNEL); return; out_free_msg: nlmsg_free(msg); - return; } void em_notify_pd_created(const struct em_perf_domain *pd) { - __em_notify_pd_table(pd, EM_CMD_PD_CREATED); + __em_notify_pd_table(pd, DEV_ENERGYMODEL_CMD_PERF_DOMAIN_CREATED); } void em_notify_pd_updated(const struct em_perf_domain *pd) { - __em_notify_pd_table(pd, EM_CMD_PD_UPDATED); + __em_notify_pd_table(pd, DEV_ENERGYMODEL_CMD_PERF_DOMAIN_UPDATED); } static int __em_notify_pd_deleted_size(const struct em_perf_domain *pd) { - int id_sz = nla_total_size(sizeof(u32)); /* EM_A_PD_TABLE_PD_ID */ + int id_sz = nla_total_size(sizeof(u32)); /* DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID */ return nlmsg_total_size(genlmsg_msg_size(id_sz)); } @@ -272,7 +301,8 @@ void em_notify_pd_deleted(const struct em_perf_domain *pd) void *hdr; int msg_sz; - if (!genl_has_listeners(&em_nl_family, &init_net, EM_NLGRP_EVENT)) + if (!genl_has_listeners(&dev_energymodel_nl_family, &init_net, + DEV_ENERGYMODEL_NLGRP_EVENT)) return; msg_sz = __em_notify_pd_deleted_size(pd); @@ -281,28 +311,29 @@ void em_notify_pd_deleted(const struct em_perf_domain *pd) if (!msg) return; - hdr = genlmsg_put(msg, 0, 0, &em_nl_family, 0, EM_CMD_PD_DELETED); + hdr = genlmsg_put(msg, 0, 0, &dev_energymodel_nl_family, 0, + DEV_ENERGYMODEL_CMD_PERF_DOMAIN_DELETED); if (!hdr) goto out_free_msg; - if (nla_put_u32(msg, EM_A_PD_TABLE_PD_ID, pd->id)) { + if (nla_put_u32(msg, DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID, + pd->id)) goto out_free_msg; - } genlmsg_end(msg, hdr); - genlmsg_multicast(&em_nl_family, msg, 0, EM_NLGRP_EVENT, GFP_KERNEL); + genlmsg_multicast(&dev_energymodel_nl_family, msg, 0, + DEV_ENERGYMODEL_NLGRP_EVENT, GFP_KERNEL); return; out_free_msg: nlmsg_free(msg); - return; } /**************************** Initialization *********************************/ static int __init em_netlink_init(void) { - return genl_register_family(&em_nl_family); + return genl_register_family(&dev_energymodel_nl_family); } postcore_initcall(em_netlink_init); diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c index ceb3b2bb6ebe..44acef0e7df2 100644 --- a/kernel/power/em_netlink_autogen.c +++ b/kernel/power/em_netlink_autogen.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) /* Do not edit directly, auto-generated from: */ -/* Documentation/netlink/specs/em.yaml */ +/* Documentation/netlink/specs/dev-energymodel.yaml */ /* YNL-GEN kernel source */ /* To regenerate run: tools/net/ynl/ynl-regen.sh */ @@ -9,41 +9,41 @@ #include "em_netlink_autogen.h" -#include +#include -/* EM_CMD_GET_PD_TABLE - do */ -static const struct nla_policy em_get_pd_table_nl_policy[EM_A_PD_TABLE_PD_ID + 1] = { - [EM_A_PD_TABLE_PD_ID] = { .type = NLA_U32, }, +/* DEV_ENERGYMODEL_CMD_GET_PERF_TABLE - do */ +static const struct nla_policy dev_energymodel_get_perf_table_nl_policy[DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID + 1] = { + [DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID] = { .type = NLA_U32, }, }; -/* Ops table for em */ -static const struct genl_split_ops em_nl_ops[] = { +/* Ops table for dev_energymodel */ +static const struct genl_split_ops dev_energymodel_nl_ops[] = { { - .cmd = EM_CMD_GET_PDS, - .doit = em_nl_get_pds_doit, + .cmd = DEV_ENERGYMODEL_CMD_GET_PERF_DOMAINS, + .doit = dev_energymodel_nl_get_perf_domains_doit, .flags = GENL_CMD_CAP_DO, }, { - .cmd = EM_CMD_GET_PD_TABLE, - .doit = em_nl_get_pd_table_doit, - .policy = em_get_pd_table_nl_policy, - .maxattr = EM_A_PD_TABLE_PD_ID, + .cmd = DEV_ENERGYMODEL_CMD_GET_PERF_TABLE, + .doit = dev_energymodel_nl_get_perf_table_doit, + .policy = dev_energymodel_get_perf_table_nl_policy, + .maxattr = DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID, .flags = GENL_CMD_CAP_DO, }, }; -static const struct genl_multicast_group em_nl_mcgrps[] = { - [EM_NLGRP_EVENT] = { "event", }, +static const struct genl_multicast_group dev_energymodel_nl_mcgrps[] = { + [DEV_ENERGYMODEL_NLGRP_EVENT] = { "event", }, }; -struct genl_family em_nl_family __ro_after_init = { - .name = EM_FAMILY_NAME, - .version = EM_FAMILY_VERSION, +struct genl_family dev_energymodel_nl_family __ro_after_init = { + .name = DEV_ENERGYMODEL_FAMILY_NAME, + .version = DEV_ENERGYMODEL_FAMILY_VERSION, .netnsok = true, .parallel_ops = true, .module = THIS_MODULE, - .split_ops = em_nl_ops, - .n_split_ops = ARRAY_SIZE(em_nl_ops), - .mcgrps = em_nl_mcgrps, - .n_mcgrps = ARRAY_SIZE(em_nl_mcgrps), + .split_ops = dev_energymodel_nl_ops, + .n_split_ops = ARRAY_SIZE(dev_energymodel_nl_ops), + .mcgrps = dev_energymodel_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(dev_energymodel_nl_mcgrps), }; diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h index 140ab548103c..f7e4bddcbd53 100644 --- a/kernel/power/em_netlink_autogen.h +++ b/kernel/power/em_netlink_autogen.h @@ -1,24 +1,26 @@ /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ /* Do not edit directly, auto-generated from: */ -/* Documentation/netlink/specs/em.yaml */ +/* Documentation/netlink/specs/dev-energymodel.yaml */ /* YNL-GEN kernel header */ /* To regenerate run: tools/net/ynl/ynl-regen.sh */ -#ifndef _LINUX_EM_GEN_H -#define _LINUX_EM_GEN_H +#ifndef _LINUX_DEV_ENERGYMODEL_GEN_H +#define _LINUX_DEV_ENERGYMODEL_GEN_H #include #include -#include +#include -int em_nl_get_pds_doit(struct sk_buff *skb, struct genl_info *info); -int em_nl_get_pd_table_doit(struct sk_buff *skb, struct genl_info *info); +int dev_energymodel_nl_get_perf_domains_doit(struct sk_buff *skb, + struct genl_info *info); +int dev_energymodel_nl_get_perf_table_doit(struct sk_buff *skb, + struct genl_info *info); enum { - EM_NLGRP_EVENT, + DEV_ENERGYMODEL_NLGRP_EVENT, }; -extern struct genl_family em_nl_family; +extern struct genl_family dev_energymodel_nl_family; -#endif /* _LINUX_EM_GEN_H */ +#endif /* _LINUX_DEV_ENERGYMODEL_GEN_H */ -- cgit v1.2.3 From 380ff27af25e49e2cb2ff8fd0ecd7c95be2976ee Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Thu, 8 Jan 2026 14:32:12 +0900 Subject: PM: EM: Add dump to get-perf-domains in the EM YNL spec Add dump to get-perf-domains, so that a user can fetch either information about a specific performance domain with do or information about all performance domains with dump. Share the reply format of do and dump using perf-domain-attrs, so remove perf-domains. The YNL spec, autogenerated files, and the do implementation are updated, and the dump implementation is added. Suggested-by: Donald Hunter Reviewed-by: Lukasz Luba Reviewed-by: Donald Hunter Signed-off-by: Changwoo Min Link: https://patch.msgid.link/20260108053212.642478-5-changwoo@igalia.com Signed-off-by: Rafael J. Wysocki --- Documentation/netlink/specs/dev-energymodel.yaml | 25 +++++---- include/uapi/linux/dev_energymodel.h | 7 --- kernel/power/em_netlink.c | 68 ++++++++++++++++++------ kernel/power/em_netlink_autogen.c | 16 +++++- kernel/power/em_netlink_autogen.h | 2 + 5 files changed, 80 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/Documentation/netlink/specs/dev-energymodel.yaml b/Documentation/netlink/specs/dev-energymodel.yaml index af8b8f72f722..11faabfdfbe8 100644 --- a/Documentation/netlink/specs/dev-energymodel.yaml +++ b/Documentation/netlink/specs/dev-energymodel.yaml @@ -42,16 +42,6 @@ definitions: missing real power information. attribute-sets: - - - name: perf-domains - doc: >- - Information on all the performance domains. - attributes: - - - name: perf-domain - type: nest - nested-attributes: perf-domain - multi-attr: true - name: perf-domain doc: >- @@ -133,12 +123,21 @@ operations: list: - name: get-perf-domains - attribute-set: perf-domains + attribute-set: perf-domain doc: Get the list of information for all performance domains. do: - reply: + request: attributes: - - perf-domain + - perf-domain-id + reply: + attributes: &perf-domain-attrs + - pad + - perf-domain-id + - flags + - cpus + dump: + reply: + attributes: *perf-domain-attrs - name: get-perf-table attribute-set: perf-table diff --git a/include/uapi/linux/dev_energymodel.h b/include/uapi/linux/dev_energymodel.h index 3399967e1f93..355d8885c9a0 100644 --- a/include/uapi/linux/dev_energymodel.h +++ b/include/uapi/linux/dev_energymodel.h @@ -36,13 +36,6 @@ enum dev_energymodel_perf_domain_flags { DEV_ENERGYMODEL_PERF_DOMAIN_FLAGS_PERF_DOMAIN_ARTIFICIAL = 4, }; -enum { - DEV_ENERGYMODEL_A_PERF_DOMAINS_PERF_DOMAIN = 1, - - __DEV_ENERGYMODEL_A_PERF_DOMAINS_MAX, - DEV_ENERGYMODEL_A_PERF_DOMAINS_MAX = (__DEV_ENERGYMODEL_A_PERF_DOMAINS_MAX - 1) -}; - enum { DEV_ENERGYMODEL_A_PERF_DOMAIN_PAD = 1, DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID, diff --git a/kernel/power/em_netlink.c b/kernel/power/em_netlink.c index b6edb018c65a..5a611d3950fd 100644 --- a/kernel/power/em_netlink.c +++ b/kernel/power/em_netlink.c @@ -18,6 +18,13 @@ #include "em_netlink_autogen.h" /*************************** Command encoding ********************************/ +struct dump_ctx { + int idx; + int start; + struct sk_buff *skb; + struct netlink_callback *cb; +}; + static int __em_nl_get_pd_size(struct em_perf_domain *pd, void *data) { int nr_cpus, msg_sz, cpus_sz; @@ -43,14 +50,8 @@ static int __em_nl_get_pd(struct em_perf_domain *pd, void *data) { struct sk_buff *msg = data; struct cpumask *cpumask; - struct nlattr *entry; int cpu; - entry = nla_nest_start(msg, - DEV_ENERGYMODEL_A_PERF_DOMAINS_PERF_DOMAIN); - if (!entry) - goto out_cancel_nest; - if (nla_put_u32(msg, DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID, pd->id)) goto out_cancel_nest; @@ -66,26 +67,50 @@ static int __em_nl_get_pd(struct em_perf_domain *pd, void *data) goto out_cancel_nest; } - nla_nest_end(msg, entry); - return 0; out_cancel_nest: - nla_nest_cancel(msg, entry); - return -EMSGSIZE; } +static int __em_nl_get_pd_for_dump(struct em_perf_domain *pd, void *data) +{ + const struct genl_info *info; + struct dump_ctx *ctx = data; + void *hdr; + int ret; + + if (ctx->idx++ < ctx->start) + return 0; + + info = genl_info_dump(ctx->cb); + hdr = genlmsg_iput(ctx->skb, info); + if (!hdr) { + genlmsg_cancel(ctx->skb, hdr); + return -EMSGSIZE; + } + + ret = __em_nl_get_pd(pd, ctx->skb); + genlmsg_end(ctx->skb, hdr); + return ret; +} + int dev_energymodel_nl_get_perf_domains_doit(struct sk_buff *skb, struct genl_info *info) { + int id, ret = -EMSGSIZE, msg_sz = 0; + int cmd = info->genlhdr->cmd; + struct em_perf_domain *pd; struct sk_buff *msg; void *hdr; - int cmd = info->genlhdr->cmd; - int ret = -EMSGSIZE, msg_sz = 0; - for_each_em_perf_domain(__em_nl_get_pd_size, &msg_sz); + if (!info->attrs[DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID]) + return -EINVAL; + id = nla_get_u32(info->attrs[DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID]); + pd = em_perf_domain_get_by_id(id); + + __em_nl_get_pd_size(pd, &msg_sz); msg = genlmsg_new(msg_sz, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -94,10 +119,9 @@ int dev_energymodel_nl_get_perf_domains_doit(struct sk_buff *skb, if (!hdr) goto out_free_msg; - ret = for_each_em_perf_domain(__em_nl_get_pd, msg); + ret = __em_nl_get_pd(pd, msg); if (ret) goto out_cancel_msg; - genlmsg_end(msg, hdr); return genlmsg_reply(msg, info); @@ -106,10 +130,22 @@ out_cancel_msg: genlmsg_cancel(msg, hdr); out_free_msg: nlmsg_free(msg); - return ret; } +int dev_energymodel_nl_get_perf_domains_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct dump_ctx ctx = { + .idx = 0, + .start = cb->args[0], + .skb = skb, + .cb = cb, + }; + + return for_each_em_perf_domain(__em_nl_get_pd_for_dump, &ctx); +} + static struct em_perf_domain *__em_nl_get_pd_table_id(struct nlattr **attrs) { struct em_perf_domain *pd; diff --git a/kernel/power/em_netlink_autogen.c b/kernel/power/em_netlink_autogen.c index 44acef0e7df2..fedd473e4244 100644 --- a/kernel/power/em_netlink_autogen.c +++ b/kernel/power/em_netlink_autogen.c @@ -11,6 +11,11 @@ #include +/* DEV_ENERGYMODEL_CMD_GET_PERF_DOMAINS - do */ +static const struct nla_policy dev_energymodel_get_perf_domains_nl_policy[DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID + 1] = { + [DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID] = { .type = NLA_U32, }, +}; + /* DEV_ENERGYMODEL_CMD_GET_PERF_TABLE - do */ static const struct nla_policy dev_energymodel_get_perf_table_nl_policy[DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID + 1] = { [DEV_ENERGYMODEL_A_PERF_TABLE_PERF_DOMAIN_ID] = { .type = NLA_U32, }, @@ -18,10 +23,17 @@ static const struct nla_policy dev_energymodel_get_perf_table_nl_policy[DEV_ENER /* Ops table for dev_energymodel */ static const struct genl_split_ops dev_energymodel_nl_ops[] = { + { + .cmd = DEV_ENERGYMODEL_CMD_GET_PERF_DOMAINS, + .doit = dev_energymodel_nl_get_perf_domains_doit, + .policy = dev_energymodel_get_perf_domains_nl_policy, + .maxattr = DEV_ENERGYMODEL_A_PERF_DOMAIN_PERF_DOMAIN_ID, + .flags = GENL_CMD_CAP_DO, + }, { .cmd = DEV_ENERGYMODEL_CMD_GET_PERF_DOMAINS, - .doit = dev_energymodel_nl_get_perf_domains_doit, - .flags = GENL_CMD_CAP_DO, + .dumpit = dev_energymodel_nl_get_perf_domains_dumpit, + .flags = GENL_CMD_CAP_DUMP, }, { .cmd = DEV_ENERGYMODEL_CMD_GET_PERF_TABLE, diff --git a/kernel/power/em_netlink_autogen.h b/kernel/power/em_netlink_autogen.h index f7e4bddcbd53..5caf2f7e18a5 100644 --- a/kernel/power/em_netlink_autogen.h +++ b/kernel/power/em_netlink_autogen.h @@ -14,6 +14,8 @@ int dev_energymodel_nl_get_perf_domains_doit(struct sk_buff *skb, struct genl_info *info); +int dev_energymodel_nl_get_perf_domains_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); int dev_energymodel_nl_get_perf_table_doit(struct sk_buff *skb, struct genl_info *info); -- cgit v1.2.3 From 6abbb8703aeeb645a681ab6ad155e0b450413787 Mon Sep 17 00:00:00 2001 From: Günther Noack Date: Sun, 11 Jan 2026 18:52:04 +0100 Subject: landlock: Clarify documentation for the IOCTL access right MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the description of the LANDLOCK_ACCESS_FS_IOCTL_DEV access right together with the file access rights. This group of access rights applies to files (in this case device files), and they can be added to file or directory inodes using landlock_add_rule(2). The check for that works the same for all file access rights, including LANDLOCK_ACCESS_FS_IOCTL_DEV. Invoking ioctl(2) on directory FDs can not currently be restricted with Landlock. Having it grouped separately in the documentation is a remnant from earlier revisions of the LANDLOCK_ACCESS_FS_IOCTL_DEV patch set. Link: https://lore.kernel.org/all/20260108.Thaex5ruach2@digikod.net/ Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20260111175203.6545-2-gnoack3000@gmail.com Signed-off-by: Mickaël Salaün --- include/uapi/linux/landlock.h | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index f030adc462ee..75fd7f5e6cc3 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -216,6 +216,23 @@ struct landlock_net_port_attr { * :manpage:`ftruncate(2)`, :manpage:`creat(2)`, or :manpage:`open(2)` with * ``O_TRUNC``. This access right is available since the third version of the * Landlock ABI. + * - %LANDLOCK_ACCESS_FS_IOCTL_DEV: Invoke :manpage:`ioctl(2)` commands on an opened + * character or block device. + * + * This access right applies to all `ioctl(2)` commands implemented by device + * drivers. However, the following common IOCTL commands continue to be + * invokable independent of the %LANDLOCK_ACCESS_FS_IOCTL_DEV right: + * + * * IOCTL commands targeting file descriptors (``FIOCLEX``, ``FIONCLEX``), + * * IOCTL commands targeting file descriptions (``FIONBIO``, ``FIOASYNC``), + * * IOCTL commands targeting file systems (``FIFREEZE``, ``FITHAW``, + * ``FIGETBSZ``, ``FS_IOC_GETFSUUID``, ``FS_IOC_GETFSSYSFSPATH``) + * * Some IOCTL commands which do not make sense when used with devices, but + * whose implementations are safe and return the right error codes + * (``FS_IOC_FIEMAP``, ``FICLONE``, ``FICLONERANGE``, ``FIDEDUPERANGE``) + * + * This access right is available since the fifth version of the Landlock + * ABI. * * Whether an opened file can be truncated with :manpage:`ftruncate(2)` or used * with `ioctl(2)` is determined during :manpage:`open(2)`, in the same way as @@ -275,26 +292,6 @@ struct landlock_net_port_attr { * If multiple requirements are not met, the ``EACCES`` error code takes * precedence over ``EXDEV``. * - * The following access right applies both to files and directories: - * - * - %LANDLOCK_ACCESS_FS_IOCTL_DEV: Invoke :manpage:`ioctl(2)` commands on an opened - * character or block device. - * - * This access right applies to all `ioctl(2)` commands implemented by device - * drivers. However, the following common IOCTL commands continue to be - * invokable independent of the %LANDLOCK_ACCESS_FS_IOCTL_DEV right: - * - * * IOCTL commands targeting file descriptors (``FIOCLEX``, ``FIONCLEX``), - * * IOCTL commands targeting file descriptions (``FIONBIO``, ``FIOASYNC``), - * * IOCTL commands targeting file systems (``FIFREEZE``, ``FITHAW``, - * ``FIGETBSZ``, ``FS_IOC_GETFSUUID``, ``FS_IOC_GETFSSYSFSPATH``) - * * Some IOCTL commands which do not make sense when used with devices, but - * whose implementations are safe and return the right error codes - * (``FS_IOC_FIEMAP``, ``FICLONE``, ``FICLONERANGE``, ``FIDEDUPERANGE``) - * - * This access right is available since the fifth version of the Landlock - * ABI. - * * .. warning:: * * It is currently not possible to restrict some file-related actions -- cgit v1.2.3 From 05f66cf5e7a5fc7c7227541f8a4a476037999916 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 26 Dec 2025 19:39:38 +0800 Subject: PCI: Provide pci_free_irq_vectors() stub 473b9f331718 ("rust: pci: fix build failure when CONFIG_PCI_MSI is disabled") fixed a build error by providing Rust helpers when CONFIG_PCI_MSI is not set. However the Rust helpers rely on pci_free_irq_vectors(), which is only available when CONFIG_PCI=y. When CONFIG_PCI is not set, there is already a stub for pci_alloc_irq_vectors(). Add a similar stub for pci_free_irq_vectors(). Fixes: 473b9f331718 ("rust: pci: fix build failure when CONFIG_PCI_MSI is disabled") Reported-by: FUJITA Tomonori Closes: https://lore.kernel.org/rust-for-linux/20251209014312.575940-1-fujita.tomonori@gmail.com/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202512220740.4Kexm4dW-lkp@intel.com/ Reported-by: Liang Jie Closes: https://lore.kernel.org/rust-for-linux/20251222034415.1384223-1-buaajxlj@163.com/ Signed-off-by: Boqun Feng Signed-off-by: Bjorn Helgaas Reviewed-by: Drew Fustini Reviewed-by: David Gow Reviewed-by: Joel Fernandes Reviewed-by: Danilo Krummrich Link: https://patch.msgid.link/20251226113938.52145-1-boqun.feng@gmail.com --- include/linux/pci.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 864775651c6f..b5cc0c2b9906 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -2210,6 +2210,10 @@ pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs, { return -ENOSPC; } + +static inline void pci_free_irq_vectors(struct pci_dev *dev) +{ +} #endif /* CONFIG_PCI */ /* Include architecture-dependent settings and functions */ -- cgit v1.2.3 From dfdf774656205515b2d6ad94fce63c7ccbe92d91 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 9 Jan 2026 10:29:06 +0100 Subject: net: airoha: Fix typo in airoha_ppe_setup_tc_block_cb definition Fix Typo in airoha_ppe_dev_setup_tc_block_cb routine definition when CONFIG_NET_AIROHA is not enabled. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202601090517.Fj6v501r-lkp@intel.com/ Fixes: f45fc18b6de04 ("net: airoha: Add airoha_ppe_dev struct definition") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20260109-airoha_ppe_dev_setup_tc_block_cb-typo-v1-1-282e8834a9f9@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/soc/airoha/airoha_offload.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/soc/airoha/airoha_offload.h b/include/linux/soc/airoha/airoha_offload.h index ab64ecdf39a0..d01ef4a6b3d7 100644 --- a/include/linux/soc/airoha/airoha_offload.h +++ b/include/linux/soc/airoha/airoha_offload.h @@ -52,8 +52,8 @@ static inline void airoha_ppe_put_dev(struct airoha_ppe_dev *dev) { } -static inline int airoha_ppe_setup_tc_block_cb(struct airoha_ppe_dev *dev, - void *type_data) +static inline int airoha_ppe_dev_setup_tc_block_cb(struct airoha_ppe_dev *dev, + void *type_data) { return -EOPNOTSUPP; } -- cgit v1.2.3 From ffe4ccd359d006eba559cb1a3c6113144b7fb38c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 7 Jan 2026 10:41:59 +0000 Subject: net: add net.core.qdisc_max_burst In blamed commit, I added a check against the temporary queue built in __dev_xmit_skb(). Idea was to drop packets early, before any spinlock was acquired. if (unlikely(defer_count > READ_ONCE(q->limit))) { kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); return NET_XMIT_DROP; } It turned out that HTB Qdisc has a zero q->limit. HTB limits packets on a per-class basis. Some of our tests became flaky. Add a new sysctl : net.core.qdisc_max_burst to control how many packets can be stored in the temporary lockless queue. Also add a new QDISC_BURST_DROP drop reason to better diagnose future issues. Thanks Neal ! Fixes: 100dfa74cad9 ("net: dev_queue_xmit() llist adoption") Reported-and-bisected-by: Neal Cardwell Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20260107104159.3669285-1-edumazet@google.com Signed-off-by: Paolo Abeni --- Documentation/admin-guide/sysctl/net.rst | 8 ++++++++ include/net/dropreason-core.h | 6 ++++++ include/net/hotdata.h | 1 + net/core/dev.c | 6 +++--- net/core/hotdata.c | 1 + net/core/sysctl_net_core.c | 7 +++++++ 6 files changed, 26 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 369a738a6819..91fa4ccd326c 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -303,6 +303,14 @@ netdev_max_backlog Maximum number of packets, queued on the INPUT side, when the interface receives packets faster than kernel can process them. +qdisc_max_burst +------------------ + +Maximum number of packets that can be temporarily stored before +reaching qdisc. + +Default: 1000 + netdev_rss_key -------------- diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 58d91ccc56e0..a7b7abd66e21 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -67,6 +67,7 @@ FN(TC_EGRESS) \ FN(SECURITY_HOOK) \ FN(QDISC_DROP) \ + FN(QDISC_BURST_DROP) \ FN(QDISC_OVERLIMIT) \ FN(QDISC_CONGESTED) \ FN(CAKE_FLOOD) \ @@ -374,6 +375,11 @@ enum skb_drop_reason { * failed to enqueue to current qdisc) */ SKB_DROP_REASON_QDISC_DROP, + /** + * @SKB_DROP_REASON_QDISC_BURST_DROP: dropped when net.core.qdisc_max_burst + * limit is hit. + */ + SKB_DROP_REASON_QDISC_BURST_DROP, /** * @SKB_DROP_REASON_QDISC_OVERLIMIT: dropped by qdisc when a qdisc * instance exceeds its total buffer size limit. diff --git a/include/net/hotdata.h b/include/net/hotdata.h index 4acec191c54a..6632b1aa7584 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -42,6 +42,7 @@ struct net_hotdata { int netdev_budget_usecs; int tstamp_prequeue; int max_backlog; + int qdisc_max_burst; int dev_tx_weight; int dev_rx_weight; int sysctl_max_skb_frags; diff --git a/net/core/dev.c b/net/core/dev.c index 9af9c3df452f..ccef685023c2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4203,8 +4203,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, do { if (first_n && !defer_count) { defer_count = atomic_long_inc_return(&q->defer_count); - if (unlikely(defer_count > READ_ONCE(q->limit))) { - kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); + if (unlikely(defer_count > READ_ONCE(net_hotdata.qdisc_max_burst))) { + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_BURST_DROP); return NET_XMIT_DROP; } } @@ -4222,7 +4222,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, ll_list = llist_del_all(&q->defer_list); /* There is a small race because we clear defer_count not atomically * with the prior llist_del_all(). This means defer_list could grow - * over q->limit. + * over qdisc_max_burst. */ atomic_long_set(&q->defer_count, 0); diff --git a/net/core/hotdata.c b/net/core/hotdata.c index dddd5c287cf0..a6db36580817 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -17,6 +17,7 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .tstamp_prequeue = 1, .max_backlog = 1000, + .qdisc_max_burst = 1000, .dev_tx_weight = 64, .dev_rx_weight = 64, .sysctl_max_skb_frags = MAX_SKB_FRAGS, diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 8d4decb2606f..05dd55cf8b58 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -429,6 +429,13 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "qdisc_max_burst", + .data = &net_hotdata.qdisc_max_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { .procname = "netdev_rss_key", .data = &netdev_rss_key, -- cgit v1.2.3 From 1e0a2ba7afb1b60f02599093d84b72ce62ad11c0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 13 Jan 2026 10:50:41 +0100 Subject: sched: Provide idle_rq() helper A fix for the dl_server 'requires' idle_cpu() usage, which made me note that it and available_idle_cpu() are extern function calls. And while idle_cpu() is used outside of kernel/sched/, available_idle_cpu() is not. This makes it hard to make idle_cpu() an inline helper, so provide idle_rq() and implement idle_cpu() and available_idle_cpu() using that. Signed-off-by: Peter Zijlstra (Intel) --- include/linux/sched.h | 1 - kernel/sched/sched.h | 22 ++++++++++++++++++++++ kernel/sched/syscalls.c | 30 +----------------------------- 3 files changed, 23 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index d395f2810fac..da0133524d08 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1874,7 +1874,6 @@ static inline int task_nice(const struct task_struct *p) extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); -extern int available_idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern void sched_set_fifo(struct task_struct *p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d30cca6870f5..e885a935b716 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1364,6 +1364,28 @@ static inline u32 sched_rng(void) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +static inline bool idle_rq(struct rq *rq) +{ + return rq->curr == rq->idle && !rq->nr_running && !rq->ttwu_pending; +} + +/** + * available_idle_cpu - is a given CPU idle for enqueuing work. + * @cpu: the CPU in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise. + */ +static inline bool available_idle_cpu(int cpu) +{ + if (!idle_rq(cpu_rq(cpu))) + return 0; + + if (vcpu_is_preempted(cpu)) + return 0; + + return 1; +} + #ifdef CONFIG_SCHED_PROXY_EXEC static inline void rq_set_donor(struct rq *rq, struct task_struct *t) { diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 0496dc29ed0f..cb337de679b8 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -180,35 +180,7 @@ int task_prio(const struct task_struct *p) */ int idle_cpu(int cpu) { - struct rq *rq = cpu_rq(cpu); - - if (rq->curr != rq->idle) - return 0; - - if (rq->nr_running) - return 0; - - if (rq->ttwu_pending) - return 0; - - return 1; -} - -/** - * available_idle_cpu - is a given CPU idle for enqueuing work. - * @cpu: the CPU in question. - * - * Return: 1 if the CPU is currently idle. 0 otherwise. - */ -int available_idle_cpu(int cpu) -{ - if (!idle_cpu(cpu)) - return 0; - - if (vcpu_is_preempted(cpu)) - return 0; - - return 1; + return idle_rq(cpu_rq(cpu)); } /** -- cgit v1.2.3 From 6626734dd2b151753e134730e27d17e64784c345 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jan 2026 15:46:37 +0000 Subject: mm_zone: Generalise has_managed_dma() It would be useful to be able to check for potential DMA pages beyond just ZONE_DMA - generalise the existing has_managed_dma() function to allow checking other zones too. Signed-off-by: Robin Murphy Acked-by: David Hildenbrand (Red Hat) Acked-by: Mike Rapoport (Microsoft) Tested-by: Vladimir Kondratiev Reviewed-by: Baoquan He Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/bd002d2351074e57be1ca08f03f333debac658fb.1768230104.git.robin.murphy@arm.com --- include/linux/mmzone.h | 9 +++++---- mm/page_alloc.c | 8 ++------ 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 75ef7c9f9307..fc5d6c88d2f0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1648,14 +1648,15 @@ static inline int is_highmem(const struct zone *zone) return is_highmem_idx(zone_idx(zone)); } -#ifdef CONFIG_ZONE_DMA -bool has_managed_dma(void); -#else +bool has_managed_zone(enum zone_type zone); static inline bool has_managed_dma(void) { +#ifdef CONFIG_ZONE_DMA + return has_managed_zone(ZONE_DMA); +#else return false; -} #endif +} #ifndef CONFIG_NUMA diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 822e05f1a964..36ccc85c5073 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7418,20 +7418,16 @@ bool put_page_back_buddy(struct page *page) } #endif -#ifdef CONFIG_ZONE_DMA -bool has_managed_dma(void) +bool has_managed_zone(enum zone_type zone) { struct pglist_data *pgdat; for_each_online_pgdat(pgdat) { - struct zone *zone = &pgdat->node_zones[ZONE_DMA]; - - if (managed_zone(zone)) + if (managed_zone(&pgdat->node_zones[zone])) return true; } return false; } -#endif /* CONFIG_ZONE_DMA */ #ifdef CONFIG_UNACCEPTED_MEMORY -- cgit v1.2.3 From e2fb7836b01747815f8bb94981c35f2688afb120 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 19 Dec 2025 08:40:04 +0700 Subject: mm: describe @flags parameter in memalloc_flags_save() Patch series "mm kernel-doc fixes". Here are kernel-doc fixes for mm subsystem. I'm also including textsearch fix since there's currently no maintainer for include/linux/textsearch.h (get_maintainer.pl only shows LKML). This patch (of 4): Sphinx reports kernel-doc warning: WARNING: ./include/linux/sched/mm.h:332 function parameter 'flags' not described in 'memalloc_flags_save' Describe @flags to fix it. Link: https://lkml.kernel.org/r/20251219014006.16328-2-bagasdotme@gmail.com Link: https://lkml.kernel.org/r/20251219014006.16328-3-bagasdotme@gmail.com Signed-off-by: Bagas Sanjaya Fixes: 3f6d5e6a468d ("mm: introduce memalloc_flags_{save,restore}") Acked-by: David Hildenbrand (Red Hat) Acked-by: Harry Yoo Signed-off-by: Andrew Morton --- include/linux/sched/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 0e1d73955fa5..95d0040df584 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -325,6 +325,7 @@ static inline void might_alloc(gfp_t gfp_mask) /** * memalloc_flags_save - Add a PF_* flag to current->flags, save old value + * @flags: Flags to add. * * This allows PF_* flags to be conveniently added, irrespective of current * value, and then the old version restored with memalloc_flags_restore(). -- cgit v1.2.3 From f26528478bb102c28e7ac0cbfc8ec8185afdafc7 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 19 Dec 2025 08:40:05 +0700 Subject: textsearch: describe @list member in ts_ops search Sphinx reports kernel-doc warning: WARNING: ./include/linux/textsearch.h:49 struct member 'list' not described in 'ts_ops' Describe @list member to fix it. Link: https://lkml.kernel.org/r/20251219014006.16328-4-bagasdotme@gmail.com Fixes: 2de4ff7bd658 ("[LIB]: Textsearch infrastructure.") Signed-off-by: Bagas Sanjaya Cc: Thomas Graf Cc: "David S. Miller" Signed-off-by: Andrew Morton --- include/linux/textsearch.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/textsearch.h b/include/linux/textsearch.h index 6673e4d4ac2e..4933777404d6 100644 --- a/include/linux/textsearch.h +++ b/include/linux/textsearch.h @@ -35,6 +35,7 @@ struct ts_state * @get_pattern: return head of pattern * @get_pattern_len: return length of pattern * @owner: module reference to algorithm + * @list: list to search */ struct ts_ops { -- cgit v1.2.3 From 6cfab50e1440fde19af7c614aacd85e11aa4dcea Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Fri, 19 Dec 2025 08:40:07 +0700 Subject: mm, kfence: describe @slab parameter in __kfence_obj_info() Sphinx reports kernel-doc warning: WARNING: ./include/linux/kfence.h:220 function parameter 'slab' not described in '__kfence_obj_info' Fix it by describing @slab parameter. Link: https://lkml.kernel.org/r/20251219014006.16328-6-bagasdotme@gmail.com Fixes: 2dfe63e61cc3 ("mm, kfence: support kmem_dump_obj() for KFENCE objects") Signed-off-by: Bagas Sanjaya Acked-by: Marco Elver Acked-by: David Hildenbrand (Red Hat) Acked-by: Harry Yoo Signed-off-by: Andrew Morton --- include/linux/kfence.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 0ad1ddbb8b99..e5822f6e7f27 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -211,6 +211,7 @@ struct kmem_obj_info; * __kfence_obj_info() - fill kmem_obj_info struct * @kpp: kmem_obj_info to be filled * @object: the object + * @slab: the slab * * Return: * * false - not a KFENCE object -- cgit v1.2.3 From e561383a39ed6e5c85a0b2369720743b694327ae Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 31 Dec 2025 16:03:09 +0800 Subject: powerpc/watchdog: add support for hardlockup_sys_info sysctl Commit a9af76a78760 ("watchdog: add sys_info sysctls to dump sys info on system lockup") adds 'hardlock_sys_info' systcl knob for general kernel watchdog to control what kinds of system debug info to be dumped on hardlockup. Add similar support in powerpc watchdog code to make the sysctl knob more general, which also fixes a compiling warning in general watchdog code reported by 0day bot. Link: https://lkml.kernel.org/r/20251231080309.39642-1-feng.tang@linux.alibaba.com Fixes: a9af76a78760 ("watchdog: add sys_info sysctls to dump sys info on system lockup") Signed-off-by: Feng Tang Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202512030920.NFKtekA7-lkp@intel.com/ Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- arch/powerpc/kernel/watchdog.c | 15 ++++++++++----- include/linux/nmi.h | 1 + kernel/watchdog.c | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 2429cb1c7baa..764001deb060 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -235,7 +236,11 @@ static void watchdog_smp_panic(int cpu) pr_emerg("CPU %d TB:%lld, last SMP heartbeat TB:%lld (%lldms ago)\n", cpu, tb, last_reset, tb_to_ns(tb - last_reset) / 1000000); - if (!sysctl_hardlockup_all_cpu_backtrace) { + if (sysctl_hardlockup_all_cpu_backtrace || + (hardlockup_si_mask & SYS_INFO_ALL_BT)) { + trigger_allbutcpu_cpu_backtrace(cpu); + cpumask_clear(&wd_smp_cpus_ipi); + } else { /* * Try to trigger the stuck CPUs, unless we are going to * get a backtrace on all of them anyway. @@ -244,11 +249,9 @@ static void watchdog_smp_panic(int cpu) smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); __cpumask_clear_cpu(c, &wd_smp_cpus_ipi); } - } else { - trigger_allbutcpu_cpu_backtrace(cpu); - cpumask_clear(&wd_smp_cpus_ipi); } + sys_info(hardlockup_si_mask & ~SYS_INFO_ALL_BT); if (hardlockup_panic) nmi_panic(NULL, "Hard LOCKUP"); @@ -415,9 +418,11 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt) xchg(&__wd_nmi_output, 1); // see wd_lockup_ipi - if (sysctl_hardlockup_all_cpu_backtrace) + if (sysctl_hardlockup_all_cpu_backtrace || + (hardlockup_si_mask & SYS_INFO_ALL_BT)) trigger_allbutcpu_cpu_backtrace(cpu); + sys_info(hardlockup_si_mask & ~SYS_INFO_ALL_BT); if (hardlockup_panic) nmi_panic(regs, "Hard LOCKUP"); diff --git a/include/linux/nmi.h b/include/linux/nmi.h index cf3c6ab408aa..207156f2143c 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -83,6 +83,7 @@ static inline void reset_hung_task_detector(void) { } #if defined(CONFIG_HARDLOCKUP_DETECTOR) extern void hardlockup_detector_disable(void); extern unsigned int hardlockup_panic; +extern unsigned long hardlockup_si_mask; #else static inline void hardlockup_detector_disable(void) {} #endif diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 0685e3a8aa0a..366122f4a0f8 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -71,7 +71,7 @@ unsigned int __read_mostly hardlockup_panic = * hard lockup is detected, it could be task, memory, lock etc. * Refer include/linux/sys_info.h for detailed bit definition. */ -static unsigned long hardlockup_si_mask; +unsigned long hardlockup_si_mask; #ifdef CONFIG_SYSFS -- cgit v1.2.3 From 69c88a6a49cfe1fd6bd5c1166d02a7dd29de9569 Mon Sep 17 00:00:00 2001 From: "Anirudh Rayabharam (Microsoft)" Date: Mon, 5 Jan 2026 12:28:36 +0000 Subject: mshv: add definitions for arm64 gpa intercepts Add definitions required for handling GPA intercepts on arm64. Signed-off-by: Anirudh Rayabharam (Microsoft) Reviewed-by: Stanislav Kinsburskii Signed-off-by: Wei Liu --- include/hyperv/hvhdk.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'include') diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h index 469186df7826..08965970c17d 100644 --- a/include/hyperv/hvhdk.h +++ b/include/hyperv/hvhdk.h @@ -800,6 +800,53 @@ struct hv_x64_memory_intercept_message { u8 instruction_bytes[16]; } __packed; +#if IS_ENABLED(CONFIG_ARM64) +union hv_arm64_vp_execution_state { + u16 as_uint16; + struct { + u16 cpl:2; /* Exception Level (EL) */ + u16 debug_active:1; + u16 interruption_pending:1; + u16 vtl:4; + u16 virtualization_fault_active:1; + u16 reserved:7; + } __packed; +}; + +struct hv_arm64_intercept_message_header { + u32 vp_index; + u8 instruction_length; + u8 intercept_access_type; + union hv_arm64_vp_execution_state execution_state; + u64 pc; + u64 cpsr; +} __packed; + +union hv_arm64_memory_access_info { + u8 as_uint8; + struct { + u8 gva_valid:1; + u8 gva_gpa_valid:1; + u8 hypercall_output_pending:1; + u8 reserved:5; + } __packed; +}; + +struct hv_arm64_memory_intercept_message { + struct hv_arm64_intercept_message_header header; + u32 cache_type; /* enum hv_cache_type */ + u8 instruction_byte_count; + union hv_arm64_memory_access_info memory_access_info; + u16 reserved1; + u8 instruction_bytes[4]; + u32 reserved2; + u64 guest_virtual_address; + u64 guest_physical_address; + u64 syndrome; +} __packed; + +#endif /* CONFIG_ARM64 */ + /* * Dispatch state for the VP communicated by the hypervisor to the * VP-dispatching thread in the root on return from HVCALL_DISPATCH_VP. -- cgit v1.2.3 From 4650ff58a1b9ee68b2d3a207047998dd42e939b2 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 9 Jan 2026 15:41:33 +0100 Subject: Revert "can: raw: instantly reject unsupported CAN frames" This reverts commit 1a620a723853a0f49703c317d52dc6b9602cbaa8 and its follow-up fixes for the introduced dependency issues. commit 1a620a723853 ("can: raw: instantly reject unsupported CAN frames") commit cb2dc6d2869a ("can: Kconfig: select CAN driver infrastructure by default") commit 6abd4577bccc ("can: fix build dependency") commit 5a5aff6338c0 ("can: fix build dependency") The entire problem was caused by the requirement that a new network layer feature needed to know about the protocol capabilities of the CAN devices. Instead of accessing CAN device internal data structures which caused the dependency problems a better approach has been developed which makes use of CAN specific ml_priv data which is accessible from both sides. Cc: Marc Kleine-Budde Cc: Arnd Bergmann Cc: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20260109144135.8495-2-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- drivers/net/can/Kconfig | 7 ++++-- drivers/net/can/Makefile | 2 +- drivers/net/can/dev/Makefile | 5 ++-- include/linux/can/dev.h | 7 ------ net/can/raw.c | 54 +++++++------------------------------------- 5 files changed, 17 insertions(+), 58 deletions(-) (limited to 'include') diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig index cfaea6178a71..e15e320db476 100644 --- a/drivers/net/can/Kconfig +++ b/drivers/net/can/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only menuconfig CAN_DEV - bool "CAN Device Drivers" + tristate "CAN Device Drivers" default y depends on CAN help @@ -17,7 +17,10 @@ menuconfig CAN_DEV virtual ones. If you own such devices or plan to use the virtual CAN interfaces to develop applications, say Y here. -if CAN_DEV && CAN + To compile as a module, choose M here: the module will be called + can-dev. + +if CAN_DEV config CAN_VCAN tristate "Virtual Local CAN Interface (vcan)" diff --git a/drivers/net/can/Makefile b/drivers/net/can/Makefile index 37e2f1a2faec..d7bc10a6b8ea 100644 --- a/drivers/net/can/Makefile +++ b/drivers/net/can/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_CAN_VCAN) += vcan.o obj-$(CONFIG_CAN_VXCAN) += vxcan.o obj-$(CONFIG_CAN_SLCAN) += slcan/ -obj-$(CONFIG_CAN_DEV) += dev/ +obj-y += dev/ obj-y += esd/ obj-y += rcar/ obj-y += rockchip/ diff --git a/drivers/net/can/dev/Makefile b/drivers/net/can/dev/Makefile index 64226acf0f3d..633687d6b6c0 100644 --- a/drivers/net/can/dev/Makefile +++ b/drivers/net/can/dev/Makefile @@ -1,8 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_CAN) += can-dev.o +obj-$(CONFIG_CAN_DEV) += can-dev.o + +can-dev-y += skb.o -can-dev-$(CONFIG_CAN_DEV) += skb.o can-dev-$(CONFIG_CAN_CALC_BITTIMING) += calc_bittiming.o can-dev-$(CONFIG_CAN_NETLINK) += bittiming.o can-dev-$(CONFIG_CAN_NETLINK) += dev.o diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index f6416a56e95d..52c8be5c160e 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -111,14 +111,7 @@ struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, void free_candev(struct net_device *dev); /* a candev safe wrapper around netdev_priv */ -#if IS_ENABLED(CONFIG_CAN_NETLINK) struct can_priv *safe_candev_priv(struct net_device *dev); -#else -static inline struct can_priv *safe_candev_priv(struct net_device *dev) -{ - return NULL; -} -#endif int open_candev(struct net_device *dev); void close_candev(struct net_device *dev); diff --git a/net/can/raw.c b/net/can/raw.c index be1ef7cf4204..f36a83d3447c 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -892,58 +892,20 @@ static void raw_put_canxl_vcid(struct raw_sock *ro, struct sk_buff *skb) } } -static inline bool raw_dev_cc_enabled(struct net_device *dev, - struct can_priv *priv) +static unsigned int raw_check_txframe(struct raw_sock *ro, struct sk_buff *skb, int mtu) { - /* The CANXL-only mode disables error-signalling on the CAN bus - * which is needed to send CAN CC/FD frames - */ - if (priv) - return !can_dev_in_xl_only_mode(priv); - - /* virtual CAN interfaces always support CAN CC */ - return true; -} - -static inline bool raw_dev_fd_enabled(struct net_device *dev, - struct can_priv *priv) -{ - /* check FD ctrlmode on real CAN interfaces */ - if (priv) - return (priv->ctrlmode & CAN_CTRLMODE_FD); - - /* check MTU for virtual CAN FD interfaces */ - return (READ_ONCE(dev->mtu) >= CANFD_MTU); -} - -static inline bool raw_dev_xl_enabled(struct net_device *dev, - struct can_priv *priv) -{ - /* check XL ctrlmode on real CAN interfaces */ - if (priv) - return (priv->ctrlmode & CAN_CTRLMODE_XL); - - /* check MTU for virtual CAN XL interfaces */ - return can_is_canxl_dev_mtu(READ_ONCE(dev->mtu)); -} - -static unsigned int raw_check_txframe(struct raw_sock *ro, struct sk_buff *skb, - struct net_device *dev) -{ - struct can_priv *priv = safe_candev_priv(dev); - - /* Classical CAN */ - if (can_is_can_skb(skb) && raw_dev_cc_enabled(dev, priv)) + /* Classical CAN -> no checks for flags and device capabilities */ + if (can_is_can_skb(skb)) return CAN_MTU; - /* CAN FD */ + /* CAN FD -> needs to be enabled and a CAN FD or CAN XL device */ if (ro->fd_frames && can_is_canfd_skb(skb) && - raw_dev_fd_enabled(dev, priv)) + (mtu == CANFD_MTU || can_is_canxl_dev_mtu(mtu))) return CANFD_MTU; - /* CAN XL */ + /* CAN XL -> needs to be enabled and a CAN XL device */ if (ro->xl_frames && can_is_canxl_skb(skb) && - raw_dev_xl_enabled(dev, priv)) + can_is_canxl_dev_mtu(mtu)) return CANXL_MTU; return 0; @@ -999,7 +961,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) err = -EINVAL; /* check for valid CAN (CC/FD/XL) frame content */ - txmtu = raw_check_txframe(ro, skb, dev); + txmtu = raw_check_txframe(ro, skb, READ_ONCE(dev->mtu)); if (!txmtu) goto free_skb; -- cgit v1.2.3 From 166e87329ce6f1eaa3475ba2d14ed30e54727c0d Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 9 Jan 2026 15:41:34 +0100 Subject: can: propagate CAN device capabilities via ml_priv Commit 1a620a723853 ("can: raw: instantly reject unsupported CAN frames") caused a sequence of dependency and linker fixes. Instead of accessing CAN device internal data structures which caused the dependency problems this patch introduces capability information into the CAN specific ml_priv data which is accessible from both sides. With this change the CAN network layer can check the required features and the decoupling of the driver layer and network layer is restored. Fixes: 1a620a723853 ("can: raw: instantly reject unsupported CAN frames") Cc: Marc Kleine-Budde Cc: Arnd Bergmann Cc: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20260109144135.8495-3-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev/dev.c | 27 +++++++++++++++++++++++++++ drivers/net/can/dev/netlink.c | 1 + drivers/net/can/vcan.c | 15 +++++++++++++++ drivers/net/can/vxcan.c | 15 +++++++++++++++ include/linux/can/can-ml.h | 24 ++++++++++++++++++++++++ include/linux/can/dev.h | 1 + 6 files changed, 83 insertions(+) (limited to 'include') diff --git a/drivers/net/can/dev/dev.c b/drivers/net/can/dev/dev.c index 091f30e94c61..7ab9578f5b89 100644 --- a/drivers/net/can/dev/dev.c +++ b/drivers/net/can/dev/dev.c @@ -375,6 +375,32 @@ void can_set_default_mtu(struct net_device *dev) } } +void can_set_cap_info(struct net_device *dev) +{ + struct can_priv *priv = netdev_priv(dev); + u32 can_cap; + + if (can_dev_in_xl_only_mode(priv)) { + /* XL only mode => no CC/FD capability */ + can_cap = CAN_CAP_XL; + } else { + /* mixed mode => CC + FD/XL capability */ + can_cap = CAN_CAP_CC; + + if (priv->ctrlmode & CAN_CTRLMODE_FD) + can_cap |= CAN_CAP_FD; + + if (priv->ctrlmode & CAN_CTRLMODE_XL) + can_cap |= CAN_CAP_XL; + } + + if (priv->ctrlmode & (CAN_CTRLMODE_LISTENONLY | + CAN_CTRLMODE_RESTRICTED)) + can_cap |= CAN_CAP_RO; + + can_set_cap(dev, can_cap); +} + /* helper to define static CAN controller features at device creation time */ int can_set_static_ctrlmode(struct net_device *dev, u32 static_mode) { @@ -390,6 +416,7 @@ int can_set_static_ctrlmode(struct net_device *dev, u32 static_mode) /* override MTU which was set by default in can_setup()? */ can_set_default_mtu(dev); + can_set_cap_info(dev); return 0; } diff --git a/drivers/net/can/dev/netlink.c b/drivers/net/can/dev/netlink.c index d6b0e686fb11..0498198a4696 100644 --- a/drivers/net/can/dev/netlink.c +++ b/drivers/net/can/dev/netlink.c @@ -377,6 +377,7 @@ static int can_ctrlmode_changelink(struct net_device *dev, } can_set_default_mtu(dev); + can_set_cap_info(dev); return 0; } diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c index fdc662aea279..76e6b7b5c6a1 100644 --- a/drivers/net/can/vcan.c +++ b/drivers/net/can/vcan.c @@ -130,6 +130,19 @@ static netdev_tx_t vcan_tx(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } +static void vcan_set_cap_info(struct net_device *dev) +{ + u32 can_cap = CAN_CAP_CC; + + if (dev->mtu > CAN_MTU) + can_cap |= CAN_CAP_FD; + + if (dev->mtu >= CANXL_MIN_MTU) + can_cap |= CAN_CAP_XL; + + can_set_cap(dev, can_cap); +} + static int vcan_change_mtu(struct net_device *dev, int new_mtu) { /* Do not allow changing the MTU while running */ @@ -141,6 +154,7 @@ static int vcan_change_mtu(struct net_device *dev, int new_mtu) return -EINVAL; WRITE_ONCE(dev->mtu, new_mtu); + vcan_set_cap_info(dev); return 0; } @@ -162,6 +176,7 @@ static void vcan_setup(struct net_device *dev) dev->tx_queue_len = 0; dev->flags = IFF_NOARP; can_set_ml_priv(dev, netdev_priv(dev)); + vcan_set_cap_info(dev); /* set flags according to driver capabilities */ if (echo) diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c index b2c19f8c5f8e..f14c6f02b662 100644 --- a/drivers/net/can/vxcan.c +++ b/drivers/net/can/vxcan.c @@ -125,6 +125,19 @@ static int vxcan_get_iflink(const struct net_device *dev) return iflink; } +static void vxcan_set_cap_info(struct net_device *dev) +{ + u32 can_cap = CAN_CAP_CC; + + if (dev->mtu > CAN_MTU) + can_cap |= CAN_CAP_FD; + + if (dev->mtu >= CANXL_MIN_MTU) + can_cap |= CAN_CAP_XL; + + can_set_cap(dev, can_cap); +} + static int vxcan_change_mtu(struct net_device *dev, int new_mtu) { /* Do not allow changing the MTU while running */ @@ -136,6 +149,7 @@ static int vxcan_change_mtu(struct net_device *dev, int new_mtu) return -EINVAL; WRITE_ONCE(dev->mtu, new_mtu); + vxcan_set_cap_info(dev); return 0; } @@ -167,6 +181,7 @@ static void vxcan_setup(struct net_device *dev) can_ml = netdev_priv(dev) + ALIGN(sizeof(struct vxcan_priv), NETDEV_ALIGN); can_set_ml_priv(dev, can_ml); + vxcan_set_cap_info(dev); } /* forward declaration for rtnl_create_link() */ diff --git a/include/linux/can/can-ml.h b/include/linux/can/can-ml.h index 8afa92d15a66..1e99fda2b380 100644 --- a/include/linux/can/can-ml.h +++ b/include/linux/can/can-ml.h @@ -46,6 +46,12 @@ #include #include +/* exposed CAN device capabilities for network layer */ +#define CAN_CAP_CC BIT(0) /* CAN CC aka Classical CAN */ +#define CAN_CAP_FD BIT(1) /* CAN FD */ +#define CAN_CAP_XL BIT(2) /* CAN XL */ +#define CAN_CAP_RO BIT(3) /* read-only mode (LISTEN/RESTRICTED) */ + #define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS) #define CAN_EFF_RCV_HASH_BITS 10 #define CAN_EFF_RCV_ARRAY_SZ (1 << CAN_EFF_RCV_HASH_BITS) @@ -64,6 +70,7 @@ struct can_ml_priv { #ifdef CAN_J1939 struct j1939_priv *j1939_priv; #endif + u32 can_cap; }; static inline struct can_ml_priv *can_get_ml_priv(struct net_device *dev) @@ -77,4 +84,21 @@ static inline void can_set_ml_priv(struct net_device *dev, netdev_set_ml_priv(dev, ml_priv, ML_PRIV_CAN); } +static inline bool can_cap_enabled(struct net_device *dev, u32 cap) +{ + struct can_ml_priv *can_ml = can_get_ml_priv(dev); + + if (!can_ml) + return false; + + return (can_ml->can_cap & cap); +} + +static inline void can_set_cap(struct net_device *dev, u32 cap) +{ + struct can_ml_priv *can_ml = can_get_ml_priv(dev); + + can_ml->can_cap = cap; +} + #endif /* CAN_ML_H */ diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 52c8be5c160e..6d0710d6f571 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -116,6 +116,7 @@ struct can_priv *safe_candev_priv(struct net_device *dev); int open_candev(struct net_device *dev); void close_candev(struct net_device *dev); void can_set_default_mtu(struct net_device *dev); +void can_set_cap_info(struct net_device *dev); int __must_check can_set_static_ctrlmode(struct net_device *dev, u32 static_mode); int can_hwtstamp_get(struct net_device *netdev, -- cgit v1.2.3 From a995fe1a3aa78b7d06cc1cc7b6b8436c5e93b07f Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Wed, 7 Jan 2026 11:35:05 +0100 Subject: rust: driver: drop device private data post unbind Currently, the driver's device private data is allocated and initialized from driver core code called from bus abstractions after the driver's probe() callback returned the corresponding initializer. Similarly, the driver's device private data is dropped within the remove() callback of bus abstractions after calling the remove() callback of the corresponding driver. However, commit 6f61a2637abe ("rust: device: introduce Device::drvdata()") introduced an accessor for the driver's device private data for a Device, i.e. a device that is currently bound to a driver. Obviously, this is in conflict with dropping the driver's device private data in remove(), since a device can not be considered to be fully unbound after remove() has finished: We also have to consider registrations guarded by devres - such as IRQ or class device registrations - which are torn down after remove() in devres_release_all(). Thus, it can happen that, for instance, a class device or IRQ callback still calls Device::drvdata(), which then runs concurrently to remove() (which sets dev->driver_data to NULL and drops the driver's device private data), before devres_release_all() started to tear down the corresponding registration. This is because devres guarded registrations can, as expected, access the corresponding Device that defines their scope. In C it simply is the driver's responsibility to ensure that its device private data is freed after e.g. an IRQ registration is unregistered. Typically, C drivers achieve this by allocating their device private data with e.g. devm_kzalloc() before doing anything else, i.e. before e.g. registering an IRQ with devm_request_threaded_irq(), relying on the reverse order cleanup of devres. Technically, we could do something similar in Rust. However, the resulting code would be pretty messy: In Rust we have to differentiate between allocated but uninitialized memory and initialized memory in the type system. Thus, we would need to somehow keep track of whether the driver's device private data object has been initialized (i.e. probe() was successful and returned a valid initializer for this memory) and conditionally call the destructor of the corresponding object when it is freed. This is because we'd need to allocate and register the memory of the driver's device private data *before* it is initialized by the initializer returned by the driver's probe() callback, because the driver could already register devres guarded registrations within probe() outside of the driver's device private data initializer. Luckily there is a much simpler solution: Instead of dropping the driver's device private data at the end of remove(), we just drop it after the device has been fully unbound, i.e. after all devres callbacks have been processed. For this, we introduce a new post_unbind() callback private to the driver-core, i.e. the callback is neither exposed to drivers, nor to bus abstractions. This way, the driver-core code can simply continue to conditionally allocate the memory for the driver's device private data when the driver's initializer is returned from probe() - no change needed - and drop it when the driver-core code receives the post_unbind() callback. Closes: https://lore.kernel.org/all/DEZMS6Y4A7XE.XE7EUBT5SJFJ@kernel.org/ Fixes: 6f61a2637abe ("rust: device: introduce Device::drvdata()") Acked-by: Alice Ryhl Acked-by: Greg Kroah-Hartman Acked-by: Igor Korotin Link: https://patch.msgid.link/20260107103511.570525-7-dakr@kernel.org [ Remove #ifdef CONFIG_RUST, rename post_unbind() to post_unbind_rust(). - Danilo] Signed-off-by: Danilo Krummrich --- drivers/base/dd.c | 2 ++ include/linux/device/driver.h | 9 +++++++++ rust/kernel/auxiliary.rs | 4 ++-- rust/kernel/device.rs | 20 +++++++++++--------- rust/kernel/driver.rs | 36 +++++++++++++++++++++++++++++++++++- rust/kernel/i2c.rs | 4 ++-- rust/kernel/pci.rs | 4 ++-- rust/kernel/platform.rs | 4 ++-- rust/kernel/usb.rs | 4 ++-- 9 files changed, 67 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 349f31bedfa1..bea8da5f8a3a 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -548,6 +548,8 @@ static DEVICE_ATTR_RW(state_synced); static void device_unbind_cleanup(struct device *dev) { devres_release_all(dev); + if (dev->driver->p_cb.post_unbind_rust) + dev->driver->p_cb.post_unbind_rust(dev); arch_teardown_dma_ops(dev); kfree(dev->dma_range_map); dev->dma_range_map = NULL; diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index cd8e0f0a634b..bbc67ec513ed 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -85,6 +85,8 @@ enum probe_type { * uevent. * @p: Driver core's private data, no one other than the driver * core can touch this. + * @p_cb: Callbacks private to the driver core; no one other than the + * driver core is allowed to touch this. * * The device driver-model tracks all of the drivers known to the system. * The main reason for this tracking is to enable the driver core to match @@ -119,6 +121,13 @@ struct device_driver { void (*coredump) (struct device *dev); struct driver_private *p; + struct { + /* + * Called after remove() and after all devres entries have been + * processed. This is a Rust only callback. + */ + void (*post_unbind_rust)(struct device *dev); + } p_cb; }; diff --git a/rust/kernel/auxiliary.rs b/rust/kernel/auxiliary.rs index 17574aa5066f..be76f11aecb7 100644 --- a/rust/kernel/auxiliary.rs +++ b/rust/kernel/auxiliary.rs @@ -96,9 +96,9 @@ impl Adapter { // SAFETY: `remove_callback` is only ever called after a successful call to // `probe_callback`, hence it's guaranteed that `Device::set_drvdata()` has been called // and stored a `Pin>`. - let data = unsafe { adev.as_ref().drvdata_obtain::() }; + let data = unsafe { adev.as_ref().drvdata_borrow::() }; - T::unbind(adev, data.as_ref()); + T::unbind(adev, data); } } diff --git a/rust/kernel/device.rs b/rust/kernel/device.rs index 71b200df0f40..031720bf5d8c 100644 --- a/rust/kernel/device.rs +++ b/rust/kernel/device.rs @@ -232,30 +232,32 @@ impl Device { /// /// # Safety /// - /// - Must only be called once after a preceding call to [`Device::set_drvdata`]. /// - The type `T` must match the type of the `ForeignOwnable` previously stored by /// [`Device::set_drvdata`]. - pub unsafe fn drvdata_obtain(&self) -> Pin> { + pub(crate) unsafe fn drvdata_obtain(&self) -> Option>> { // SAFETY: By the type invariants, `self.as_raw()` is a valid pointer to a `struct device`. let ptr = unsafe { bindings::dev_get_drvdata(self.as_raw()) }; // SAFETY: By the type invariants, `self.as_raw()` is a valid pointer to a `struct device`. unsafe { bindings::dev_set_drvdata(self.as_raw(), core::ptr::null_mut()) }; + if ptr.is_null() { + return None; + } + // SAFETY: - // - By the safety requirements of this function, `ptr` comes from a previous call to - // `into_foreign()`. + // - If `ptr` is not NULL, it comes from a previous call to `into_foreign()`. // - `dev_get_drvdata()` guarantees to return the same pointer given to `dev_set_drvdata()` // in `into_foreign()`. - unsafe { Pin::>::from_foreign(ptr.cast()) } + Some(unsafe { Pin::>::from_foreign(ptr.cast()) }) } /// Borrow the driver's private data bound to this [`Device`]. /// /// # Safety /// - /// - Must only be called after a preceding call to [`Device::set_drvdata`] and before - /// [`Device::drvdata_obtain`]. + /// - Must only be called after a preceding call to [`Device::set_drvdata`] and before the + /// device is fully unbound. /// - The type `T` must match the type of the `ForeignOwnable` previously stored by /// [`Device::set_drvdata`]. pub unsafe fn drvdata_borrow(&self) -> Pin<&T> { @@ -271,7 +273,7 @@ impl Device { /// # Safety /// /// - Must only be called after a preceding call to [`Device::set_drvdata`] and before - /// [`Device::drvdata_obtain`]. + /// the device is fully unbound. /// - The type `T` must match the type of the `ForeignOwnable` previously stored by /// [`Device::set_drvdata`]. unsafe fn drvdata_unchecked(&self) -> Pin<&T> { @@ -320,7 +322,7 @@ impl Device { // SAFETY: // - The above check of `dev_get_drvdata()` guarantees that we are called after - // `set_drvdata()` and before `drvdata_obtain()`. + // `set_drvdata()`. // - We've just checked that the type of the driver's private data is in fact `T`. Ok(unsafe { self.drvdata_unchecked() }) } diff --git a/rust/kernel/driver.rs b/rust/kernel/driver.rs index ba1ca1f7a7e2..bee3ae21a27b 100644 --- a/rust/kernel/driver.rs +++ b/rust/kernel/driver.rs @@ -177,7 +177,39 @@ unsafe impl Sync for Registration {} // any thread, so `Registration` is `Send`. unsafe impl Send for Registration {} -impl Registration { +impl Registration { + extern "C" fn post_unbind_callback(dev: *mut bindings::device) { + // SAFETY: The driver core only ever calls the post unbind callback with a valid pointer to + // a `struct device`. + // + // INVARIANT: `dev` is valid for the duration of the `post_unbind_callback()`. + let dev = unsafe { &*dev.cast::>() }; + + // `remove()` and all devres callbacks have been completed at this point, hence drop the + // driver's device private data. + // + // SAFETY: By the safety requirements of the `Driver` trait, `T::DriverData` is the + // driver's device private data type. + drop(unsafe { dev.drvdata_obtain::() }); + } + + /// Attach generic `struct device_driver` callbacks. + fn callbacks_attach(drv: &Opaque) { + let ptr = drv.get().cast::(); + + // SAFETY: + // - `drv.get()` yields a valid pointer to `Self::DriverType`. + // - Adding `DEVICE_DRIVER_OFFSET` yields the address of the embedded `struct device_driver` + // as guaranteed by the safety requirements of the `Driver` trait. + let base = unsafe { ptr.add(T::DEVICE_DRIVER_OFFSET) }; + + // CAST: `base` points to the offset of the embedded `struct device_driver`. + let base = base.cast::(); + + // SAFETY: It is safe to set the fields of `struct device_driver` on initialization. + unsafe { (*base).p_cb.post_unbind_rust = Some(Self::post_unbind_callback) }; + } + /// Creates a new instance of the registration object. pub fn new(name: &'static CStr, module: &'static ThisModule) -> impl PinInit { try_pin_init!(Self { @@ -189,6 +221,8 @@ impl Registration { // just been initialised above, so it's also valid for read. let drv = unsafe { &*(ptr as *const Opaque) }; + Self::callbacks_attach(drv); + // SAFETY: `drv` is guaranteed to be pinned until `T::unregister`. unsafe { T::register(drv, name, module) } }), diff --git a/rust/kernel/i2c.rs b/rust/kernel/i2c.rs index e86242227081..39b0a9a207fd 100644 --- a/rust/kernel/i2c.rs +++ b/rust/kernel/i2c.rs @@ -178,9 +178,9 @@ impl Adapter { // SAFETY: `remove_callback` is only ever called after a successful call to // `probe_callback`, hence it's guaranteed that `I2cClient::set_drvdata()` has been called // and stored a `Pin>`. - let data = unsafe { idev.as_ref().drvdata_obtain::() }; + let data = unsafe { idev.as_ref().drvdata_borrow::() }; - T::unbind(idev, data.as_ref()); + T::unbind(idev, data); } extern "C" fn shutdown_callback(idev: *mut bindings::i2c_client) { diff --git a/rust/kernel/pci.rs b/rust/kernel/pci.rs index 590723dcb5ae..bea76ca9c3da 100644 --- a/rust/kernel/pci.rs +++ b/rust/kernel/pci.rs @@ -123,9 +123,9 @@ impl Adapter { // SAFETY: `remove_callback` is only ever called after a successful call to // `probe_callback`, hence it's guaranteed that `Device::set_drvdata()` has been called // and stored a `Pin>`. - let data = unsafe { pdev.as_ref().drvdata_obtain::() }; + let data = unsafe { pdev.as_ref().drvdata_borrow::() }; - T::unbind(pdev, data.as_ref()); + T::unbind(pdev, data); } } diff --git a/rust/kernel/platform.rs b/rust/kernel/platform.rs index b8a681df9ddc..35a5813ffb33 100644 --- a/rust/kernel/platform.rs +++ b/rust/kernel/platform.rs @@ -101,9 +101,9 @@ impl Adapter { // SAFETY: `remove_callback` is only ever called after a successful call to // `probe_callback`, hence it's guaranteed that `Device::set_drvdata()` has been called // and stored a `Pin>`. - let data = unsafe { pdev.as_ref().drvdata_obtain::() }; + let data = unsafe { pdev.as_ref().drvdata_borrow::() }; - T::unbind(pdev, data.as_ref()); + T::unbind(pdev, data); } } diff --git a/rust/kernel/usb.rs b/rust/kernel/usb.rs index 4cf4bb1705b5..67ce5c85c619 100644 --- a/rust/kernel/usb.rs +++ b/rust/kernel/usb.rs @@ -103,9 +103,9 @@ impl Adapter { // SAFETY: `disconnect_callback` is only ever called after a successful call to // `probe_callback`, hence it's guaranteed that `Device::set_drvdata()` has been called // and stored a `Pin>`. - let data = unsafe { dev.drvdata_obtain::() }; + let data = unsafe { dev.drvdata_borrow::() }; - T::disconnect(intf, data.as_ref()); + T::disconnect(intf, data); } } -- cgit v1.2.3 From 10d28cffb3f6ec7ad67f0a4cd32c2afa92909452 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Wed, 3 Dec 2025 16:24:38 +0000 Subject: comedi: Fix getting range information for subdevices 16 to 255 The `COMEDI_RANGEINFO` ioctl does not work properly for subdevice indices above 15. Currently, the only in-tree COMEDI drivers that support more than 16 subdevices are the "8255" driver and the "comedi_bond" driver. Making the ioctl work for subdevice indices up to 255 is achievable. It needs minor changes to the handling of the `COMEDI_RANGEINFO` and `COMEDI_CHANINFO` ioctls that should be mostly harmless to user-space, apart from making them less broken. Details follow... The `COMEDI_RANGEINFO` ioctl command gets the list of supported ranges (usually with units of volts or milliamps) for a COMEDI subdevice or channel. (Only some subdevices have per-channel range tables, indicated by the `SDF_RANGETYPE` flag in the subdevice information.) It uses a `range_type` value and a user-space pointer, both supplied by user-space, but the `range_type` value should match what was obtained using the `COMEDI_CHANINFO` ioctl (if the subdevice has per-channel range tables) or `COMEDI_SUBDINFO` ioctl (if the subdevice uses a single range table for all channels). Bits 15 to 0 of the `range_type` value contain the length of the range table, which is the only part that user-space should care about (so it can use a suitably sized buffer to fetch the range table). Bits 23 to 16 store the channel index, which is assumed to be no more than 255 if the subdevice has per-channel range tables, and is set to 0 if the subdevice has a single range table. For `range_type` values produced by the `COMEDI_SUBDINFO` ioctl, bits 31 to 24 contain the subdevice index, which is assumed to be no more than 255. But for `range_type` values produced by the `COMEDI_CHANINFO` ioctl, bits 27 to 24 contain the subdevice index, which is assumed to be no more than 15, and bits 31 to 28 contain the COMEDI device's minor device number for some unknown reason lost in the mists of time. The `COMEDI_RANGEINFO` ioctl extract the length from bits 15 to 0 of the user-supplied `range_type` value, extracts the channel index from bits 23 to 16 (only used if the subdevice has per-channel range tables), extracts the subdevice index from bits 27 to 24, and ignores bits 31 to 28. So for subdevice indices 16 to 255, the `COMEDI_SUBDINFO` or `COMEDI_CHANINFO` ioctl will report a `range_type` value that doesn't work with the `COMEDI_RANGEINFO` ioctl. It will either get the range table for the subdevice index modulo 16, or will fail with `-EINVAL`. To fix this, always use bits 31 to 24 of the `range_type` value to hold the subdevice index (assumed to be no more than 255). This affects the `COMEDI_CHANINFO` and `COMEDI_RANGEINFO` ioctls. There should not be anything in user-space that depends on the old, broken usage, although it may now see different values in bits 31 to 28 of the `range_type` values reported by the `COMEDI_CHANINFO` ioctl for subdevices that have per-channel subdevices. User-space should not be trying to decode bits 31 to 16 of the `range_type` values anyway. Fixes: ed9eccbe8970 ("Staging: add comedi core") Cc: stable@vger.kernel.org #5.17+ Signed-off-by: Ian Abbott Link: https://patch.msgid.link/20251203162438.176841-1-abbotti@mev.co.uk Signed-off-by: Greg Kroah-Hartman --- drivers/comedi/comedi_fops.c | 2 +- drivers/comedi/range.c | 2 +- include/uapi/linux/comedi.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/comedi/comedi_fops.c b/drivers/comedi/comedi_fops.c index 657c98cd723e..2c3eb9e89571 100644 --- a/drivers/comedi/comedi_fops.c +++ b/drivers/comedi/comedi_fops.c @@ -1155,7 +1155,7 @@ static int do_chaninfo_ioctl(struct comedi_device *dev, for (i = 0; i < s->n_chan; i++) { int x; - x = (dev->minor << 28) | (it->subdev << 24) | (i << 16) | + x = (it->subdev << 24) | (i << 16) | (s->range_table_list[i]->length); if (put_user(x, it->rangelist + i)) return -EFAULT; diff --git a/drivers/comedi/range.c b/drivers/comedi/range.c index 8f43cf88d784..5b8f662365e3 100644 --- a/drivers/comedi/range.c +++ b/drivers/comedi/range.c @@ -52,7 +52,7 @@ int do_rangeinfo_ioctl(struct comedi_device *dev, const struct comedi_lrange *lr; struct comedi_subdevice *s; - subd = (it->range_type >> 24) & 0xf; + subd = (it->range_type >> 24) & 0xff; chan = (it->range_type >> 16) & 0xff; if (!dev->attached) diff --git a/include/uapi/linux/comedi.h b/include/uapi/linux/comedi.h index 7314e5ee0a1e..798ec9a39e12 100644 --- a/include/uapi/linux/comedi.h +++ b/include/uapi/linux/comedi.h @@ -640,7 +640,7 @@ struct comedi_chaninfo { /** * struct comedi_rangeinfo - used to retrieve the range table for a channel - * @range_type: Encodes subdevice index (bits 27:24), channel index + * @range_type: Encodes subdevice index (bits 31:24), channel index * (bits 23:16) and range table length (bits 15:0). * @range_ptr: Pointer to array of @struct comedi_krange to be filled * in with the range table for the channel or subdevice. -- cgit v1.2.3 From cd16edba1c6a24af138e1a5ded2711231fffa99f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Dec 2025 11:19:10 +0100 Subject: ext4: fix ext4_tune_sb_params padding The padding at the end of struct ext4_tune_sb_params is architecture specific and in particular is different between x86-32 and x86-64, since the __u64 member only enforces struct alignment on the latter. This shows up as a new warning when test-building the headers with -Wpadded: include/linux/ext4.h:144:1: error: padding struct size to alignment boundary with 4 bytes [-Werror=padded] All members inside the structure are naturally aligned, so the only difference here is the amount of padding at the end. Make the padding explicit, to have a consistent sizeof(struct ext4_tune_sb_params) of 232 on all architectures and avoid adding compat ioctl handling for EXT4_IOC_GET_TUNE_SB_PARAM/EXT4_IOC_SET_TUNE_SB_PARAM. This is an ABI break on x86-32 but hopefully this can go into 6.18.y early enough as a fixup so no actual users will be affected. Alternatively, the kernel could handle the ioctl commands for both sizes (232 and 228 bytes) on all architectures. Fixes: 04a91570ac67 ("ext4: implemet new ioctls to set and get superblock parameters") Signed-off-by: Arnd Bergmann Reviewed-by: Jan Kara Link: https://patch.msgid.link/20251204101914.1037148-1-arnd@kernel.org Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- include/uapi/linux/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/ext4.h b/include/uapi/linux/ext4.h index 411dcc1e4a35..9c683991c32f 100644 --- a/include/uapi/linux/ext4.h +++ b/include/uapi/linux/ext4.h @@ -139,7 +139,7 @@ struct ext4_tune_sb_params { __u32 clear_feature_incompat_mask; __u32 clear_feature_ro_compat_mask; __u8 mount_opts[64]; - __u8 pad[64]; + __u8 pad[68]; }; #define EXT4_TUNE_FL_ERRORS_BEHAVIOR 0x00000001 -- cgit v1.2.3 From 2c28769a51deb6022d7fbd499987e237a01dd63a Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 14 Jan 2026 22:03:23 +0000 Subject: rxrpc: Fix recvmsg() unconditional requeue If rxrpc_recvmsg() fails because MSG_DONTWAIT was specified but the call at the front of the recvmsg queue already has its mutex locked, it requeues the call - whether or not the call is already queued. The call may be on the queue because MSG_PEEK was also passed and so the call was not dequeued or because the I/O thread requeued it. The unconditional requeue may then corrupt the recvmsg queue, leading to things like UAFs or refcount underruns. Fix this by only requeuing the call if it isn't already on the queue - and moving it to the front if it is already queued. If we don't queue it, we have to put the ref we obtained by dequeuing it. Also, MSG_PEEK doesn't dequeue the call so shouldn't call rxrpc_notify_socket() for the call if we didn't use up all the data on the queue, so fix that also. Fixes: 540b1c48c37a ("rxrpc: Fix deadlock between call creation and sendmsg/recvmsg") Reported-by: Faith Reported-by: Pumpkin Chang Signed-off-by: David Howells Acked-by: Marc Dionne cc: Nir Ohfeld cc: Willy Tarreau cc: Simon Horman cc: linux-afs@lists.infradead.org cc: stable@kernel.org Link: https://patch.msgid.link/95163.1768428203@warthog.procyon.org.uk Signed-off-by: Jakub Kicinski --- include/trace/events/rxrpc.h | 4 ++++ net/rxrpc/recvmsg.c | 19 +++++++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index de6f6d25767c..869f97c9bf73 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -322,6 +322,7 @@ EM(rxrpc_call_put_kernel, "PUT kernel ") \ EM(rxrpc_call_put_poke, "PUT poke ") \ EM(rxrpc_call_put_recvmsg, "PUT recvmsg ") \ + EM(rxrpc_call_put_recvmsg_peek_nowait, "PUT peek-nwt") \ EM(rxrpc_call_put_release_recvmsg_q, "PUT rls-rcmq") \ EM(rxrpc_call_put_release_sock, "PUT rls-sock") \ EM(rxrpc_call_put_release_sock_tba, "PUT rls-sk-a") \ @@ -340,6 +341,9 @@ EM(rxrpc_call_see_input, "SEE input ") \ EM(rxrpc_call_see_notify_released, "SEE nfy-rlsd") \ EM(rxrpc_call_see_recvmsg, "SEE recvmsg ") \ + EM(rxrpc_call_see_recvmsg_requeue, "SEE recv-rqu") \ + EM(rxrpc_call_see_recvmsg_requeue_first, "SEE recv-rqF") \ + EM(rxrpc_call_see_recvmsg_requeue_move, "SEE recv-rqM") \ EM(rxrpc_call_see_release, "SEE release ") \ EM(rxrpc_call_see_userid_exists, "SEE u-exists") \ EM(rxrpc_call_see_waiting_call, "SEE q-conn ") \ diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 7fa7e77f6bb9..e1f7513a46db 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -518,7 +518,8 @@ try_again: if (rxrpc_call_has_failed(call)) goto call_failed; - if (!skb_queue_empty(&call->recvmsg_queue)) + if (!(flags & MSG_PEEK) && + !skb_queue_empty(&call->recvmsg_queue)) rxrpc_notify_socket(call); goto not_yet_complete; @@ -549,11 +550,21 @@ error_unlock_call: error_requeue_call: if (!(flags & MSG_PEEK)) { spin_lock_irq(&rx->recvmsg_lock); - list_add(&call->recvmsg_link, &rx->recvmsg_q); - spin_unlock_irq(&rx->recvmsg_lock); + if (list_empty(&call->recvmsg_link)) { + list_add(&call->recvmsg_link, &rx->recvmsg_q); + rxrpc_see_call(call, rxrpc_call_see_recvmsg_requeue); + spin_unlock_irq(&rx->recvmsg_lock); + } else if (list_is_first(&call->recvmsg_link, &rx->recvmsg_q)) { + spin_unlock_irq(&rx->recvmsg_lock); + rxrpc_put_call(call, rxrpc_call_see_recvmsg_requeue_first); + } else { + list_move(&call->recvmsg_link, &rx->recvmsg_q); + spin_unlock_irq(&rx->recvmsg_lock); + rxrpc_put_call(call, rxrpc_call_see_recvmsg_requeue_move); + } trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_requeue, 0); } else { - rxrpc_put_call(call, rxrpc_call_put_recvmsg); + rxrpc_put_call(call, rxrpc_call_put_recvmsg_peek_nowait); } error_no_call: release_sock(&rx->sk); -- cgit v1.2.3 From 6ac433f8b2590b09ca00863d218665729ac985f7 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 24 Dec 2025 12:33:57 -0500 Subject: mm: rename cpu_bitmap field to flexible_array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cpu_bitmap flexible array now contains more than just the cpu_bitmap. In preparation for changing the static mm_struct definitions to cover for the additional space required, change the cpu_bitmap type from "unsigned long" to "char", require an unsigned long alignment of the flexible array, and rename the field from "cpu_bitmap" to "flexible_array". Introduce the MM_STRUCT_FLEXIBLE_ARRAY_INIT macro to statically initialize the flexible array. This covers the init_mm and efi_mm static definitions. This is a preparation step for fixing the missing mm_cid size for static mm_struct definitions. Link: https://lkml.kernel.org/r/20251224173358.647691-3-mathieu.desnoyers@efficios.com Fixes: af7f588d8f73 ("sched: Introduce per-memory-map concurrency ID") Signed-off-by: Mathieu Desnoyers Reviewed-by: Thomas Gleixner Cc: Mark Brown Cc: Aboorva Devarajan Cc: Al Viro Cc: Baolin Wang Cc: Christan König Cc: Christian Brauner Cc: Christoph Lameter Cc: David Hildenbrand Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: "Liam R . Howlett" Cc: Lorenzo Stoakes Cc: Martin Liu Cc: Masami Hiramatsu Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: "Paul E. McKenney" Cc: Roman Gushchin Cc: SeongJae Park Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sweet Tea Dorminy Cc: Tejun Heo Cc: Vlastimil Babka Cc: Wei Yang Cc: Yu Zhao Cc: Peter Zijlstra (Intel) Cc: Signed-off-by: Andrew Morton --- drivers/firmware/efi/efi.c | 2 +- include/linux/mm_types.h | 13 +++++++++---- mm/init-mm.c | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index f5ff6e84a9b7..17b5f3415465 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -74,10 +74,10 @@ struct mm_struct efi_mm = { .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), .user_ns = &init_user_ns, - .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, #ifdef CONFIG_SCHED_MM_CID .mm_cid.lock = __RAW_SPIN_LOCK_UNLOCKED(efi_mm.mm_cid.lock), #endif + .flexible_array = MM_STRUCT_FLEXIBLE_ARRAY_INIT, }; struct workqueue_struct *efi_rts_wq; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 42af2292951d..110b319a2ffb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1329,7 +1329,7 @@ struct mm_struct { * The mm_cpumask needs to be at the end of mm_struct, because it * is dynamically sized based on nr_cpu_ids. */ - unsigned long cpu_bitmap[]; + char flexible_array[] __aligned(__alignof__(unsigned long)); }; /* Copy value to the first system word of mm flags, non-atomically. */ @@ -1366,19 +1366,24 @@ static inline void __mm_flags_set_mask_bits_word(struct mm_struct *mm, MT_FLAGS_USE_RCU) extern struct mm_struct init_mm; +#define MM_STRUCT_FLEXIBLE_ARRAY_INIT \ +{ \ + [0 ... sizeof(cpumask_t)-1] = 0 \ +} + /* Pointer magic because the dynamic array size confuses some compilers. */ static inline void mm_init_cpumask(struct mm_struct *mm) { unsigned long cpu_bitmap = (unsigned long)mm; - cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap); + cpu_bitmap += offsetof(struct mm_struct, flexible_array); cpumask_clear((struct cpumask *)cpu_bitmap); } /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) { - return (struct cpumask *)&mm->cpu_bitmap; + return (struct cpumask *)&mm->flexible_array; } #ifdef CONFIG_LRU_GEN @@ -1469,7 +1474,7 @@ static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm) { unsigned long bitmap = (unsigned long)mm; - bitmap += offsetof(struct mm_struct, cpu_bitmap); + bitmap += offsetof(struct mm_struct, flexible_array); /* Skip cpu_bitmap */ bitmap += cpumask_size(); return (struct cpumask *)bitmap; diff --git a/mm/init-mm.c b/mm/init-mm.c index a514f8ce47e3..c5556bb9d5f0 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -47,7 +47,7 @@ struct mm_struct init_mm = { #ifdef CONFIG_SCHED_MM_CID .mm_cid.lock = __RAW_SPIN_LOCK_UNLOCKED(init_mm.mm_cid.lock), #endif - .cpu_bitmap = CPU_BITS_NONE, + .flexible_array = MM_STRUCT_FLEXIBLE_ARRAY_INIT, INIT_MM_CONTEXT(init_mm) }; -- cgit v1.2.3 From be31340a4cc259340044b7fc4f7e97f58c74ee8e Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 24 Dec 2025 12:33:58 -0500 Subject: mm: take into account mm_cid size for mm_struct static definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both init_mm and efi_mm static definitions need to make room for the 2 mm_cid cpumasks. This fixes possible out-of-bounds accesses to init_mm and efi_mm. Add a space between # and define for the mm_alloc_cid() definition to make it consistent with the coding style used in the rest of this header file. Link: https://lkml.kernel.org/r/20251224173358.647691-4-mathieu.desnoyers@efficios.com Fixes: af7f588d8f73 ("sched: Introduce per-memory-map concurrency ID") Signed-off-by: Mathieu Desnoyers Reviewed-by: Thomas Gleixner Cc: Mark Brown Cc: Aboorva Devarajan Cc: Al Viro Cc: Baolin Wang Cc: Christan König Cc: Christian Brauner Cc: Christoph Lameter Cc: David Hildenbrand Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: "Liam R . Howlett" Cc: Lorenzo Stoakes Cc: Martin Liu Cc: Masami Hiramatsu Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: "Paul E. McKenney" Cc: Roman Gushchin Cc: SeongJae Park Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sweet Tea Dorminy Cc: Tejun Heo Cc: Vlastimil Babka Cc: Wei Yang Cc: Yu Zhao Cc: Peter Zijlstra (Intel) Cc: Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 110b319a2ffb..aa4639888f89 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1368,7 +1368,7 @@ extern struct mm_struct init_mm; #define MM_STRUCT_FLEXIBLE_ARRAY_INIT \ { \ - [0 ... sizeof(cpumask_t)-1] = 0 \ + [0 ... sizeof(cpumask_t) + MM_CID_STATIC_SIZE - 1] = 0 \ } /* Pointer magic because the dynamic array size confuses some compilers. */ @@ -1500,7 +1500,7 @@ static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct * mm_init_cid(mm, p); return 0; } -#define mm_alloc_cid(...) alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__)) +# define mm_alloc_cid(...) alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__)) static inline void mm_destroy_cid(struct mm_struct *mm) { @@ -1514,6 +1514,8 @@ static inline unsigned int mm_cid_size(void) return cpumask_size() + bitmap_size(num_possible_cpus()); } +/* Use 2 * NR_CPUS as worse case for static allocation. */ +# define MM_CID_STATIC_SIZE (2 * sizeof(cpumask_t)) #else /* CONFIG_SCHED_MM_CID */ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { } static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; } @@ -1522,6 +1524,7 @@ static inline unsigned int mm_cid_size(void) { return 0; } +# define MM_CID_STATIC_SIZE 0 #endif /* CONFIG_SCHED_MM_CID */ struct mmu_gather; -- cgit v1.2.3 From f9a49aa302a05e91ca01f69031cb79a0ea33031f Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 5 Jan 2026 13:17:27 -0800 Subject: fs/writeback: skip AS_NO_DATA_INTEGRITY mappings in wait_sb_inodes() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Above the while() loop in wait_sb_inodes(), we document that we must wait for all pages under writeback for data integrity. Consequently, if a mapping, like fuse, traditionally does not have data integrity semantics, there is no need to wait at all; we can simply skip these inodes. This restores fuse back to prior behavior where syncs are no-ops. This fixes a user regression where if a system is running a faulty fuse server that does not reply to issued write requests, this causes wait_sb_inodes() to wait forever. Link: https://lkml.kernel.org/r/20260105211737.4105620-2-joannelkoong@gmail.com Fixes: 0c58a97f919c ("fuse: remove tmp folio for writebacks and internal rb tree") Signed-off-by: Joanne Koong Reported-by: Athul Krishna Reported-by: J. Neuschäfer Reviewed-by: Bernd Schubert Tested-by: J. Neuschäfer Cc: Alexander Viro Cc: Bernd Schubert Cc: Bonaccorso Salvatore Cc: Christian Brauner Cc: David Hildenbrand Cc: Jan Kara Cc: "Liam R. Howlett" Cc: Lorenzo Stoakes Cc: "Matthew Wilcox (Oracle)" Cc: Michal Hocko Cc: Mike Rapoport Cc: Miklos Szeredi Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- fs/fs-writeback.c | 7 ++++++- fs/fuse/file.c | 4 +++- include/linux/pagemap.h | 11 +++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6800886c4d10..baa2f2141146 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2750,8 +2750,13 @@ static void wait_sb_inodes(struct super_block *sb) * The mapping can appear untagged while still on-list since we * do not have the mapping lock. Skip it here, wb completion * will remove it. + * + * If the mapping does not have data integrity semantics, + * there's no need to wait for the writeout to complete, as the + * mapping cannot guarantee that data is persistently stored. */ - if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) + if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK) || + mapping_no_data_integrity(mapping)) continue; spin_unlock_irq(&sb->s_inode_wblist_lock); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 01bc894e9c2b..3b2a171e652f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3200,8 +3200,10 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) inode->i_fop = &fuse_file_operations; inode->i_data.a_ops = &fuse_file_aops; - if (fc->writeback_cache) + if (fc->writeback_cache) { mapping_set_writeback_may_deadlock_on_reclaim(&inode->i_data); + mapping_set_no_data_integrity(&inode->i_data); + } INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 31a848485ad9..ec442af3f886 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -210,6 +210,7 @@ enum mapping_flags { AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9, AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't account usage to user cgroups */ + AS_NO_DATA_INTEGRITY = 11, /* no data integrity guarantees */ /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, AS_FOLIO_ORDER_MIN = 16, @@ -345,6 +346,16 @@ static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct addres return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } +static inline void mapping_set_no_data_integrity(struct address_space *mapping) +{ + set_bit(AS_NO_DATA_INTEGRITY, &mapping->flags); +} + +static inline bool mapping_no_data_integrity(const struct address_space *mapping) +{ + return test_bit(AS_NO_DATA_INTEGRITY, &mapping->flags); +} + static inline gfp_t mapping_gfp_mask(const struct address_space *mapping) { return mapping->gfp_mask; -- cgit v1.2.3 From 50b359896fe55d0443ed550e1fabba71d242031a Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Sun, 18 Jan 2026 09:51:15 +0200 Subject: wifi: cfg80211: ignore link disabled flag from userspace When the AP has an advertised TID to Link Mapping (TTLM) it shall include the element in the association response. As such, when this element is present it needs to be used for the currently dormant links. See Draft P802.11REVmf_D1.0 section 35.3.7.2.3 ("Negotiation of TTLM") for the details. The flag is also not usable in case userspace wants to specify a negotiated TTLM during association. Note that for the link reconfiguration case, mac80211 did not use the information. Draft P802.11REVmf_D1.0 states in section 35.3.6.4 ("Link reconfiguration to the setup links) that we "shall operate with all the TIDs mapped to the newly added links ..." All this means that the flag is not needed. The implementation should parse the information from the association response. Signed-off-by: Benjamin Berg Reviewed-by: Johannes Berg Reviewed-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20260118093904.754e057896a5.Ifd06f5ef839a93bfd54d0593dc932870f95f3242@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 --- include/uapi/linux/nl80211.h | 5 +++-- net/wireless/nl80211.c | 10 ---------- 3 files changed, 3 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 899f267b7cf9..2900202588a5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3221,8 +3221,6 @@ struct cfg80211_auth_request { * if this is %NULL for a link, that link is not requested * @elems: extra elements for the per-STA profile for this link * @elems_len: length of the elements - * @disabled: If set this link should be included during association etc. but it - * should not be used until enabled by the AP MLD. * @error: per-link error code, must be <= 0. If there is an error, then the * operation as a whole must fail. */ @@ -3230,7 +3228,6 @@ struct cfg80211_assoc_link { struct cfg80211_bss *bss; const u8 *elems; size_t elems_len; - bool disabled; int error; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 8134f10e4e6c..8433bac48112 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2880,8 +2880,9 @@ enum nl80211_commands { * index. If the userspace includes more RNR elements than number of * MBSSID elements then these will be added in every EMA beacon. * - * @NL80211_ATTR_MLO_LINK_DISABLED: Flag attribute indicating that the link is - * disabled. + * @NL80211_ATTR_MLO_LINK_DISABLED: Unused. It was used to indicate that a link + * is disabled during association. However, the AP will send the + * information by including a TTLM in the association response. * * @NL80211_ATTR_BSS_DUMP_INCLUDE_USE_DATA: Include BSS usage data, i.e. * include BSSes that can only be used in restricted scenarios and/or diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c961cd42a832..03efd45c007f 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -12241,9 +12241,6 @@ static int nl80211_process_links(struct cfg80211_registered_device *rdev, return -EINVAL; } } - - links[link_id].disabled = - nla_get_flag(attrs[NL80211_ATTR_MLO_LINK_DISABLED]); } return 0; @@ -12423,13 +12420,6 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) goto free; } - if (req.links[req.link_id].disabled) { - GENL_SET_ERR_MSG(info, - "cannot have assoc link disabled"); - err = -EINVAL; - goto free; - } - if (info->attrs[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS]) req.ext_mld_capa_ops = nla_get_u16(info->attrs[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS]); -- cgit v1.2.3 From ca1a47cd3f5f4c46ca188b1c9a27af87d1ab2216 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 23 Dec 2025 22:40:34 +0100 Subject: mm/hugetlb: fix hugetlb_pmd_shared() Patch series "mm/hugetlb: fixes for PMD table sharing (incl. using mmu_gather)", v3. One functional fix, one performance regression fix, and two related comment fixes. I cleaned up my prototype I recently shared [1] for the performance fix, deferring most of the cleanups I had in the prototype to a later point. While doing that I identified the other things. The goal of this patch set is to be backported to stable trees "fairly" easily. At least patch #1 and #4. Patch #1 fixes hugetlb_pmd_shared() not detecting any sharing Patch #2 + #3 are simple comment fixes that patch #4 interacts with. Patch #4 is a fix for the reported performance regression due to excessive IPI broadcasts during fork()+exit(). The last patch is all about TLB flushes, IPIs and mmu_gather. Read: complicated There are plenty of cleanups in the future to be had + one reasonable optimization on x86. But that's all out of scope for this series. Runtime tested, with a focus on fixing the performance regression using the original reproducer [2] on x86. This patch (of 4): We switched from (wrongly) using the page count to an independent shared count. Now, shared page tables have a refcount of 1 (excluding speculative references) and instead use ptdesc->pt_share_count to identify sharing. We didn't convert hugetlb_pmd_shared(), so right now, we would never detect a shared PMD table as such, because sharing/unsharing no longer touches the refcount of a PMD table. Page migration, like mbind() or migrate_pages() would allow for migrating folios mapped into such shared PMD tables, even though the folios are not exclusive. In smaps we would account them as "private" although they are "shared", and we would be wrongly setting the PM_MMAP_EXCLUSIVE in the pagemap interface. Fix it by properly using ptdesc_pmd_is_shared() in hugetlb_pmd_shared(). Link: https://lkml.kernel.org/r/20251223214037.580860-1-david@kernel.org Link: https://lkml.kernel.org/r/20251223214037.580860-2-david@kernel.org Link: https://lore.kernel.org/all/8cab934d-4a56-44aa-b641-bfd7e23bd673@kernel.org/ [1] Link: https://lore.kernel.org/all/8cab934d-4a56-44aa-b641-bfd7e23bd673@kernel.org/ [2] Fixes: 59d9094df3d7 ("mm: hugetlb: independent PMD page table shared count") Signed-off-by: David Hildenbrand (Red Hat) Reviewed-by: Rik van Riel Reviewed-by: Lance Yang Tested-by: Lance Yang Reviewed-by: Harry Yoo Tested-by: Laurence Oberman Reviewed-by: Lorenzo Stoakes Acked-by: Oscar Salvador Cc: Liu Shixin Cc: Uschakow, Stanislav" Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 019a1c5281e4..03c8725efa28 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1326,7 +1326,7 @@ static inline __init void hugetlb_cma_reserve(int order) #ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static inline bool hugetlb_pmd_shared(pte_t *pte) { - return page_count(virt_to_page(pte)) > 1; + return ptdesc_pmd_is_shared(virt_to_ptdesc(pte)); } #else static inline bool hugetlb_pmd_shared(pte_t *pte) -- cgit v1.2.3 From 8ce720d5bd91e9dc16db3604aa4b1bf76770a9a1 Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Red Hat)" Date: Tue, 23 Dec 2025 22:40:37 +0100 Subject: mm/hugetlb: fix excessive IPI broadcasts when unsharing PMD tables using mmu_gather As reported, ever since commit 1013af4f585f ("mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race") we can end up in some situations where we perform so many IPI broadcasts when unsharing hugetlb PMD page tables that it severely regresses some workloads. In particular, when we fork()+exit(), or when we munmap() a large area backed by many shared PMD tables, we perform one IPI broadcast per unshared PMD table. There are two optimizations to be had: (1) When we process (unshare) multiple such PMD tables, such as during exit(), it is sufficient to send a single IPI broadcast (as long as we respect locking rules) instead of one per PMD table. Locking prevents that any of these PMD tables could get reused before we drop the lock. (2) When we are not the last sharer (> 2 users including us), there is no need to send the IPI broadcast. The shared PMD tables cannot become exclusive (fully unshared) before an IPI will be broadcasted by the last sharer. Concurrent GUP-fast could walk into a PMD table just before we unshared it. It could then succeed in grabbing a page from the shared page table even after munmap() etc succeeded (and supressed an IPI). But there is not difference compared to GUP-fast just sleeping for a while after grabbing the page and re-enabling IRQs. Most importantly, GUP-fast will never walk into page tables that are no-longer shared, because the last sharer will issue an IPI broadcast. (if ever required, checking whether the PUD changed in GUP-fast after grabbing the page like we do in the PTE case could handle this) So let's rework PMD sharing TLB flushing + IPI sync to use the mmu_gather infrastructure so we can implement these optimizations and demystify the code at least a bit. Extend the mmu_gather infrastructure to be able to deal with our special hugetlb PMD table sharing implementation. To make initialization of the mmu_gather easier when working on a single VMA (in particular, when dealing with hugetlb), provide tlb_gather_mmu_vma(). We'll consolidate the handling for (full) unsharing of PMD tables in tlb_unshare_pmd_ptdesc() and tlb_flush_unshared_tables(), and track in "struct mmu_gather" whether we had (full) unsharing of PMD tables. Because locking is very special (concurrent unsharing+reuse must be prevented), we disallow deferring flushing to tlb_finish_mmu() and instead require an explicit earlier call to tlb_flush_unshared_tables(). From hugetlb code, we call huge_pmd_unshare_flush() where we make sure that the expected lock protecting us from concurrent unsharing+reuse is still held. Check with a VM_WARN_ON_ONCE() in tlb_finish_mmu() that tlb_flush_unshared_tables() was properly called earlier. Document it all properly. Notes about tlb_remove_table_sync_one() interaction with unsharing: There are two fairly tricky things: (1) tlb_remove_table_sync_one() is a NOP on architectures without CONFIG_MMU_GATHER_RCU_TABLE_FREE. Here, the assumption is that the previous TLB flush would send an IPI to all relevant CPUs. Careful: some architectures like x86 only send IPIs to all relevant CPUs when tlb->freed_tables is set. The relevant architectures should be selecting MMU_GATHER_RCU_TABLE_FREE, but x86 might not do that in stable kernels and it might have been problematic before this patch. Also, the arch flushing behavior (independent of IPIs) is different when tlb->freed_tables is set. Do we have to enlighten them to also take care of tlb->unshared_tables? So far we didn't care, so hopefully we are fine. Of course, we could be setting tlb->freed_tables as well, but that might then unnecessarily flush too much, because the semantics of tlb->freed_tables are a bit fuzzy. This patch changes nothing in this regard. (2) tlb_remove_table_sync_one() is not a NOP on architectures with CONFIG_MMU_GATHER_RCU_TABLE_FREE that actually don't need a sync. Take x86 as an example: in the common case (!pv, !X86_FEATURE_INVLPGB) we still issue IPIs during TLB flushes and don't actually need the second tlb_remove_table_sync_one(). This optimized can be implemented on top of this, by checking e.g., in tlb_remove_table_sync_one() whether we really need IPIs. But as described in (1), it really must honor tlb->freed_tables then to send IPIs to all relevant CPUs. Notes on TLB flushing changes: (1) Flushing for non-shared PMD tables We're converting from flush_hugetlb_tlb_range() to tlb_remove_huge_tlb_entry(). Given that we properly initialize the MMU gather in tlb_gather_mmu_vma() to be hugetlb aware, similar to __unmap_hugepage_range(), that should be fine. (2) Flushing for shared PMD tables We're converting from various things (flush_hugetlb_tlb_range(), tlb_flush_pmd_range(), flush_tlb_range()) to tlb_flush_pmd_range(). tlb_flush_pmd_range() achieves the same that tlb_remove_huge_tlb_entry() would achieve in these scenarios. Note that tlb_remove_huge_tlb_entry() also calls __tlb_remove_tlb_entry(), however that is only implemented on powerpc, which does not support PMD table sharing. Similar to (1), tlb_gather_mmu_vma() should make sure that TLB flushing keeps on working as expected. Further, note that the ptdesc_pmd_pts_dec() in huge_pmd_share() is not a concern, as we are holding the i_mmap_lock the whole time, preventing concurrent unsharing. That ptdesc_pmd_pts_dec() usage will be removed separately as a cleanup later. There are plenty more cleanups to be had, but they have to wait until this is fixed. [david@kernel.org: fix kerneldoc] Link: https://lkml.kernel.org/r/f223dd74-331c-412d-93fc-69e360a5006c@kernel.org Link: https://lkml.kernel.org/r/20251223214037.580860-5-david@kernel.org Fixes: 1013af4f585f ("mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race") Signed-off-by: David Hildenbrand (Red Hat) Reported-by: Uschakow, Stanislav" Closes: https://lore.kernel.org/all/4d3878531c76479d9f8ca9789dc6485d@amazon.de/ Tested-by: Laurence Oberman Acked-by: Harry Yoo Reviewed-by: Lorenzo Stoakes Cc: Lance Yang Cc: Liu Shixin Cc: Oscar Salvador Cc: Rik van Riel Cc: Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 77 ++++++++++++++++++++++++++++- include/linux/hugetlb.h | 15 ++++-- include/linux/mm_types.h | 1 + mm/hugetlb.c | 123 +++++++++++++++++++++++++++------------------- mm/mmu_gather.c | 33 +++++++++++++ mm/rmap.c | 25 +++++++--- 6 files changed, 208 insertions(+), 66 deletions(-) (limited to 'include') diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 1fff717cae51..4d679d2a206b 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -46,7 +46,8 @@ * * The mmu_gather API consists of: * - * - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_finish_mmu() + * - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_gather_mmu_vma() / + * tlb_finish_mmu() * * start and finish a mmu_gather * @@ -364,6 +365,20 @@ struct mmu_gather { unsigned int vma_huge : 1; unsigned int vma_pfn : 1; + /* + * Did we unshare (unmap) any shared page tables? For now only + * used for hugetlb PMD table sharing. + */ + unsigned int unshared_tables : 1; + + /* + * Did we unshare any page tables such that they are now exclusive + * and could get reused+modified by the new owner? When setting this + * flag, "unshared_tables" will be set as well. For now only used + * for hugetlb PMD table sharing. + */ + unsigned int fully_unshared_tables : 1; + unsigned int batch_count; #ifndef CONFIG_MMU_GATHER_NO_GATHER @@ -400,6 +415,7 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb) tlb->cleared_pmds = 0; tlb->cleared_puds = 0; tlb->cleared_p4ds = 0; + tlb->unshared_tables = 0; /* * Do not reset mmu_gather::vma_* fields here, we do not * call into tlb_start_vma() again to set them if there is an @@ -484,7 +500,7 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) * these bits. */ if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds || - tlb->cleared_puds || tlb->cleared_p4ds)) + tlb->cleared_puds || tlb->cleared_p4ds || tlb->unshared_tables)) return; tlb_flush(tlb); @@ -773,6 +789,63 @@ static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) } #endif +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING +static inline void tlb_unshare_pmd_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt, + unsigned long addr) +{ + /* + * The caller must make sure that concurrent unsharing + exclusive + * reuse is impossible until tlb_flush_unshared_tables() was called. + */ + VM_WARN_ON_ONCE(!ptdesc_pmd_is_shared(pt)); + ptdesc_pmd_pts_dec(pt); + + /* Clearing a PUD pointing at a PMD table with PMD leaves. */ + tlb_flush_pmd_range(tlb, addr & PUD_MASK, PUD_SIZE); + + /* + * If the page table is now exclusively owned, we fully unshared + * a page table. + */ + if (!ptdesc_pmd_is_shared(pt)) + tlb->fully_unshared_tables = true; + tlb->unshared_tables = true; +} + +static inline void tlb_flush_unshared_tables(struct mmu_gather *tlb) +{ + /* + * As soon as the caller drops locks to allow for reuse of + * previously-shared tables, these tables could get modified and + * even reused outside of hugetlb context, so we have to make sure that + * any page table walkers (incl. TLB, GUP-fast) are aware of that + * change. + * + * Even if we are not fully unsharing a PMD table, we must + * flush the TLB for the unsharer now. + */ + if (tlb->unshared_tables) + tlb_flush_mmu_tlbonly(tlb); + + /* + * Similarly, we must make sure that concurrent GUP-fast will not + * walk previously-shared page tables that are getting modified+reused + * elsewhere. So broadcast an IPI to wait for any concurrent GUP-fast. + * + * We only perform this when we are the last sharer of a page table, + * as the IPI will reach all CPUs: any GUP-fast. + * + * Note that on configs where tlb_remove_table_sync_one() is a NOP, + * the expectation is that the tlb_flush_mmu_tlbonly() would have issued + * required IPIs already for us. + */ + if (tlb->fully_unshared_tables) { + tlb_remove_table_sync_one(); + tlb->fully_unshared_tables = false; + } +} +#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ + #endif /* CONFIG_MMU */ #endif /* _ASM_GENERIC__TLB_H */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 03c8725efa28..e51b8ef0cebd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -240,8 +240,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); unsigned long hugetlb_mask_last_page(struct hstate *h); -int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep); +int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); +void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end); @@ -300,13 +301,17 @@ static inline struct address_space *hugetlb_folio_mapping_lock_write( return NULL; } -static inline int huge_pmd_unshare(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline int huge_pmd_unshare(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return 0; } +static inline void huge_pmd_unshare_flush(struct mmu_gather *tlb, + struct vm_area_struct *vma) +{ +} + static inline void adjust_range_if_pmd_sharing_possible( struct vm_area_struct *vma, unsigned long *start, unsigned long *end) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index aa4639888f89..78950eb8926d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1530,6 +1530,7 @@ static inline unsigned int mm_cid_size(void) struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +void tlb_gather_mmu_vma(struct mmu_gather *tlb, struct vm_area_struct *vma); extern void tlb_finish_mmu(struct mmu_gather *tlb); struct vm_fault; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 67131aa24d77..a1832da0f623 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5112,7 +5112,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, unsigned long last_addr_mask; pte_t *src_pte, *dst_pte; struct mmu_notifier_range range; - bool shared_pmd = false; + struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr, old_end); @@ -5122,6 +5122,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, * range. */ flush_cache_range(vma, range.start, range.end); + tlb_gather_mmu_vma(&tlb, vma); mmu_notifier_invalidate_range_start(&range); last_addr_mask = hugetlb_mask_last_page(h); @@ -5138,8 +5139,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte))) continue; - if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) { - shared_pmd = true; + if (huge_pmd_unshare(&tlb, vma, old_addr, src_pte)) { old_addr |= last_addr_mask; new_addr |= last_addr_mask; continue; @@ -5150,15 +5150,16 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, break; move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz); + tlb_remove_huge_tlb_entry(h, &tlb, src_pte, old_addr); } - if (shared_pmd) - flush_hugetlb_tlb_range(vma, range.start, range.end); - else - flush_hugetlb_tlb_range(vma, old_end - len, old_end); + tlb_flush_mmu_tlbonly(&tlb); + huge_pmd_unshare_flush(&tlb, vma); + mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); hugetlb_vma_unlock_write(vma); + tlb_finish_mmu(&tlb); return len + old_addr - old_end; } @@ -5177,7 +5178,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long sz = huge_page_size(h); bool adjust_reservation; unsigned long last_addr_mask; - bool force_flush = false; WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); @@ -5200,10 +5200,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, } ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, vma, address, ptep)) { + if (huge_pmd_unshare(tlb, vma, address, ptep)) { spin_unlock(ptl); - tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); - force_flush = true; address |= last_addr_mask; continue; } @@ -5319,14 +5317,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, } tlb_end_vma(tlb, vma); - /* - * There is nothing protecting a previously-shared page table that we - * unshared through huge_pmd_unshare() from getting freed after we - * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare() - * succeeded, flush the range corresponding to the pud. - */ - if (force_flush) - tlb_flush_mmu_tlbonly(tlb); + huge_pmd_unshare_flush(tlb, vma); } void __hugetlb_zap_begin(struct vm_area_struct *vma, @@ -6425,11 +6416,11 @@ long hugetlb_change_protection(struct vm_area_struct *vma, pte_t pte; struct hstate *h = hstate_vma(vma); long pages = 0, psize = huge_page_size(h); - bool shared_pmd = false; struct mmu_notifier_range range; unsigned long last_addr_mask; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + struct mmu_gather tlb; /* * In the case of shared PMDs, the area to flush could be beyond @@ -6442,6 +6433,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, BUG_ON(address >= end); flush_cache_range(vma, range.start, range.end); + tlb_gather_mmu_vma(&tlb, vma); mmu_notifier_invalidate_range_start(&range); hugetlb_vma_lock_write(vma); @@ -6468,7 +6460,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, } } ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, vma, address, ptep)) { + if (huge_pmd_unshare(&tlb, vma, address, ptep)) { /* * When uffd-wp is enabled on the vma, unshare * shouldn't happen at all. Warn about it if it @@ -6477,7 +6469,6 @@ long hugetlb_change_protection(struct vm_area_struct *vma, WARN_ON_ONCE(uffd_wp || uffd_wp_resolve); pages++; spin_unlock(ptl); - shared_pmd = true; address |= last_addr_mask; continue; } @@ -6538,22 +6529,16 @@ long hugetlb_change_protection(struct vm_area_struct *vma, pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; + tlb_remove_huge_tlb_entry(h, &tlb, ptep, address); } next: spin_unlock(ptl); cond_resched(); } - /* - * There is nothing protecting a previously-shared page table that we - * unshared through huge_pmd_unshare() from getting freed after we - * release i_mmap_rwsem, so flush the TLB now. If huge_pmd_unshare() - * succeeded, flush the range corresponding to the pud. - */ - if (shared_pmd) - flush_hugetlb_tlb_range(vma, range.start, range.end); - else - flush_hugetlb_tlb_range(vma, start, end); + + tlb_flush_mmu_tlbonly(&tlb); + huge_pmd_unshare_flush(&tlb, vma); /* * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are * downgrading page table protection not changing it to point to a new @@ -6564,6 +6549,7 @@ next: i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb); return pages > 0 ? (pages << h->order) : pages; } @@ -6920,18 +6906,27 @@ out: return pte; } -/* - * unmap huge page backed by shared pte. +/** + * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users + * @tlb: the current mmu_gather. + * @vma: the vma covering the pmd table. + * @addr: the address we are trying to unshare. + * @ptep: pointer into the (pmd) page table. + * + * Called with the page table lock held, the i_mmap_rwsem held in write mode + * and the hugetlb vma lock held in write mode. * - * Called with page table lock held. + * Note: The caller must call huge_pmd_unshare_flush() before dropping the + * i_mmap_rwsem. * - * returns: 1 successfully unmapped a shared pte page - * 0 the underlying pte page is not shared, or it is the last user + * Returns: 1 if it was a shared PMD table and it got unmapped, or 0 if it + * was not a shared PMD table. */ -int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { unsigned long sz = huge_page_size(hstate_vma(vma)); + struct mm_struct *mm = vma->vm_mm; pgd_t *pgd = pgd_offset(mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); @@ -6943,18 +6938,36 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, i_mmap_assert_write_locked(vma->vm_file->f_mapping); hugetlb_vma_assert_locked(vma); pud_clear(pud); - /* - * Once our caller drops the rmap lock, some other process might be - * using this page table as a normal, non-hugetlb page table. - * Wait for pending gup_fast() in other threads to finish before letting - * that happen. - */ - tlb_remove_table_sync_one(); - ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep)); + + tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr); + mm_dec_nr_pmds(mm); return 1; } +/* + * huge_pmd_unshare_flush - Complete a sequence of huge_pmd_unshare() calls + * @tlb: the current mmu_gather. + * @vma: the vma covering the pmd table. + * + * Perform necessary TLB flushes or IPI broadcasts to synchronize PMD table + * unsharing with concurrent page table walkers. + * + * This function must be called after a sequence of huge_pmd_unshare() + * calls while still holding the i_mmap_rwsem. + */ +void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ + /* + * We must synchronize page table unsharing such that nobody will + * try reusing a previously-shared page table while it might still + * be in use by previous sharers (TLB, GUP_fast). + */ + i_mmap_assert_write_locked(vma->vm_file->f_mapping); + + tlb_flush_unshared_tables(tlb); +} + #else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, @@ -6963,12 +6976,16 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, return NULL; } -int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { return 0; } +void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ +} + void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { @@ -7235,6 +7252,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; + struct mmu_gather tlb; unsigned long address; spinlock_t *ptl; pte_t *ptep; @@ -7246,6 +7264,8 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, return; flush_cache_range(vma, start, end); + tlb_gather_mmu_vma(&tlb, vma); + /* * No need to call adjust_range_if_pmd_sharing_possible(), because * we have already done the PUD_SIZE alignment. @@ -7264,10 +7284,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); - huge_pmd_unshare(mm, vma, address, ptep); + huge_pmd_unshare(&tlb, vma, address, ptep); spin_unlock(ptl); } - flush_hugetlb_tlb_range(vma, start, end); + huge_pmd_unshare_flush(&tlb, vma); if (take_locks) { i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); @@ -7277,6 +7297,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, * Documentation/mm/mmu_notifier.rst. */ mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb); } /* diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 247e3f9db6c7..7468ec388455 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -426,6 +427,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, #endif tlb->vma_pfn = 0; + tlb->fully_unshared_tables = 0; __tlb_reset_range(tlb); inc_tlb_flush_pending(tlb->mm); } @@ -459,6 +461,31 @@ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) __tlb_gather_mmu(tlb, mm, true); } +/** + * tlb_gather_mmu_vma - initialize an mmu_gather structure for operating on a + * single VMA + * @tlb: the mmu_gather structure to initialize + * @vma: the vm_area_struct + * + * Called to initialize an (on-stack) mmu_gather structure for operating on + * a single VMA. In contrast to tlb_gather_mmu(), calling this function will + * not require another call to tlb_start_vma(). In contrast to tlb_start_vma(), + * this function will *not* call flush_cache_range(). + * + * For hugetlb VMAs, this function will also initialize the mmu_gather + * page_size accordingly, not requiring a separate call to + * tlb_change_page_size(). + * + */ +void tlb_gather_mmu_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ + tlb_gather_mmu(tlb, vma->vm_mm); + tlb_update_vma_flags(tlb, vma); + if (is_vm_hugetlb_page(vma)) + /* All entries have the same size. */ + tlb_change_page_size(tlb, huge_page_size(hstate_vma(vma))); +} + /** * tlb_finish_mmu - finish an mmu_gather structure * @tlb: the mmu_gather structure to finish @@ -468,6 +495,12 @@ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) */ void tlb_finish_mmu(struct mmu_gather *tlb) { + /* + * We expect an earlier huge_pmd_unshare_flush() call to sort this out, + * due to complicated locking requirements with page table unsharing. + */ + VM_WARN_ON_ONCE(tlb->fully_unshared_tables); + /* * If there are parallel threads are doing PTE changes on same range * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB diff --git a/mm/rmap.c b/mm/rmap.c index 748f48727a16..7b9879ef442d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -76,7 +76,7 @@ #include #include -#include +#include #define CREATE_TRACE_POINTS #include @@ -2008,13 +2008,17 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * if unsuccessful. */ if (!anon) { + struct mmu_gather tlb; + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) goto walk_abort; - if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { + + tlb_gather_mmu_vma(&tlb, vma); + if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) { hugetlb_vma_unlock_write(vma); - flush_tlb_range(vma, - range.start, range.end); + huge_pmd_unshare_flush(&tlb, vma); + tlb_finish_mmu(&tlb); /* * The PMD table was unmapped, * consequently unmapping the folio. @@ -2022,6 +2026,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, goto walk_done; } hugetlb_vma_unlock_write(vma); + tlb_finish_mmu(&tlb); } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); if (pte_dirty(pteval)) @@ -2398,17 +2403,20 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * fail if unsuccessful. */ if (!anon) { + struct mmu_gather tlb; + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) { page_vma_mapped_walk_done(&pvmw); ret = false; break; } - if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { - hugetlb_vma_unlock_write(vma); - flush_tlb_range(vma, - range.start, range.end); + tlb_gather_mmu_vma(&tlb, vma); + if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) { + hugetlb_vma_unlock_write(vma); + huge_pmd_unshare_flush(&tlb, vma); + tlb_finish_mmu(&tlb); /* * The PMD table was unmapped, * consequently unmapping the folio. @@ -2417,6 +2425,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, break; } hugetlb_vma_unlock_write(vma); + tlb_finish_mmu(&tlb); } /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); -- cgit v1.2.3 From 35e247032606f06c2f19d90a6562bc315206b7a7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 14 Jan 2026 11:00:06 +0000 Subject: mm: do not copy page tables unnecessarily for VM_UFFD_WP Commit ab04b530e7e8 ("mm: introduce copy-on-fork VMAs and make VM_MAYBE_GUARD one") aggregates flags checks in vma_needs_copy(), including VM_UFFD_WP. However in doing so, it incorrectly performed this check against src_vma. This check was done on the assumption that all relevant flags are copied upon fork. However the userfaultfd logic is very innovative in that it implements custom logic on fork in dup_userfaultfd(), including a rather well hidden case where lacking UFFD_FEATURE_EVENT_FORK causes VM_UFFD_WP to not be propagated to the destination VMA. And indeed, vma_needs_copy(), prior to this patch, did check this property on dst_vma, not src_vma. Since all the other relevant flags are copied on fork, we can simply fix this by checking against dst_vma. While we're here, we fix a comment against VM_COPY_ON_FORK (noting that it did indeed already reference dst_vma) to make it abundantly clear that we must check against the destination VMA. Link: https://lkml.kernel.org/r/20260114110006.1047071-1-lorenzo.stoakes@oracle.com Fixes: ab04b530e7e8 ("mm: introduce copy-on-fork VMAs and make VM_MAYBE_GUARD one") Signed-off-by: Lorenzo Stoakes Reported-by: Chris Mason Closes: https://lore.kernel.org/all/20260113231257.3002271-1-clm@meta.com/ Acked-by: David Hildenbrand (Red Hat) Acked-by: Pedro Falcato Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++++- mm/memory.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6f959d8ca4b4..f0d5be9dc736 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -608,7 +608,11 @@ enum { /* * Flags which should result in page tables being copied on fork. These are * flags which indicate that the VMA maps page tables which cannot be - * reconsistuted upon page fault, so necessitate page table copying upon + * reconsistuted upon page fault, so necessitate page table copying upon fork. + * + * Note that these flags should be compared with the DESTINATION VMA not the + * source, as VM_UFFD_WP may not be propagated to destination, while all other + * flags will be. * * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be * reasonably reconstructed on page fault. diff --git a/mm/memory.c b/mm/memory.c index a0822b564cc0..da360a6eb8a4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1465,7 +1465,11 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, static bool vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { - if (src_vma->vm_flags & VM_COPY_ON_FORK) + /* + * We check against dst_vma as while sane VMA flags will have been + * copied, VM_UFFD_WP may be set only on dst_vma. + */ + if (dst_vma->vm_flags & VM_COPY_ON_FORK) return true; /* * The presence of an anon_vma indicates an anonymous VMA has page -- cgit v1.2.3 From f5f2bad67a45cd1ef6f5b727da104694a81b3666 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Jan 2026 08:31:49 +0100 Subject: block: make the new blkzoned UAPI constants discoverable The Linux 6.19 merge window added the new BLKREPORTZONESV2 ioctl, and with it the new BLK_ZONE_REP_CACHED and BLK_ZONE_COND_ACTIVE constants. The two constants are defined as part of enums, which makes it very painful for userspace to discover if they are present in the installed system headers. Use the #define to the same name trick to make them trivially discoverable using CPP directives. Fixes: 0bf0e2e46668 ("block: track zone conditions") Fixes: b30ffcdc0c15 ("block: introduce BLKREPORTZONESV2 ioctl") Reported-by: Andrey Albershteyn Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blkzoned.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index e33f02703350..663836120966 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -81,7 +81,8 @@ enum blk_zone_cond { BLK_ZONE_COND_FULL = 0xE, BLK_ZONE_COND_OFFLINE = 0xF, - BLK_ZONE_COND_ACTIVE = 0xFF, + BLK_ZONE_COND_ACTIVE = 0xFF, /* added in Linux 6.19 */ +#define BLK_ZONE_COND_ACTIVE BLK_ZONE_COND_ACTIVE }; /** @@ -100,7 +101,8 @@ enum blk_zone_report_flags { BLK_ZONE_REP_CAPACITY = (1U << 0), /* Input flags */ - BLK_ZONE_REP_CACHED = (1U << 31), + BLK_ZONE_REP_CACHED = (1U << 31), /* added in Linux 6.19 */ +#define BLK_ZONE_REP_CACHED BLK_ZONE_REP_CACHED }; /** -- cgit v1.2.3 From bdcdf968be314b6fc8835b99fb4519e7619671e6 Mon Sep 17 00:00:00 2001 From: Thomas Hellström Date: Wed, 21 Jan 2026 10:10:47 +0100 Subject: drm, drm/xe: Fix xe userptr in the absence of CONFIG_DEVICE_PRIVATE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CONFIG_DEVICE_PRIVATE is not selected by default by some distros, for example Fedora, and that leads to a regression in the xe driver since userptr support gets compiled out. It turns out that DRM_GPUSVM, which is needed for xe userptr support compiles also without CONFIG_DEVICE_PRIVATE, but doesn't compile without CONFIG_ZONE_DEVICE. Exclude the drm_pagemap files from compilation with !CONFIG_ZONE_DEVICE, and remove the CONFIG_DEVICE_PRIVATE dependency from CONFIG_DRM_GPUSVM and the xe driver's selection of it, re-enabling xe userptr for those configs. v2: - Don't compile the drm_pagemap files unless CONFIG_ZONE_DEVICE is set. - Adjust the drm_pagemap.h header accordingly. Fixes: 9e9787414882 ("drm/xe/userptr: replace xe_hmm with gpusvm") Cc: Matthew Auld Cc: Himal Prasad Ghimiray Cc: Thomas Hellström Cc: Matthew Brost Cc: "Thomas Hellström" Cc: Rodrigo Vivi Cc: dri-devel@lists.freedesktop.org Cc: # v6.18+ Signed-off-by: Thomas Hellström Reviewed-by: Matthew Auld Acked-by: Maarten Lankhorst Link: https://patch.msgid.link/20260121091048.41371-2-thomas.hellstrom@linux.intel.com (cherry picked from commit 1e372b246199ca7a35f930177fea91b557dac16e) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/Kconfig | 2 +- drivers/gpu/drm/Makefile | 4 +++- drivers/gpu/drm/xe/Kconfig | 2 +- include/drm/drm_pagemap.h | 19 +++++++++++++++++-- 4 files changed, 22 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig index 7e6bc0b3a589..ed85d0ceee3b 100644 --- a/drivers/gpu/drm/Kconfig +++ b/drivers/gpu/drm/Kconfig @@ -210,7 +210,7 @@ config DRM_GPUVM config DRM_GPUSVM tristate - depends on DRM && DEVICE_PRIVATE + depends on DRM select HMM_MIRROR select MMU_NOTIFIER help diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile index 0e1c668b46d2..d26191717428 100644 --- a/drivers/gpu/drm/Makefile +++ b/drivers/gpu/drm/Makefile @@ -108,8 +108,10 @@ obj-$(CONFIG_DRM_EXEC) += drm_exec.o obj-$(CONFIG_DRM_GPUVM) += drm_gpuvm.o drm_gpusvm_helper-y := \ - drm_gpusvm.o\ + drm_gpusvm.o +drm_gpusvm_helper-$(CONFIG_ZONE_DEVICE) += \ drm_pagemap.o + obj-$(CONFIG_DRM_GPUSVM) += drm_gpusvm_helper.o obj-$(CONFIG_DRM_BUDDY) += drm_buddy.o diff --git a/drivers/gpu/drm/xe/Kconfig b/drivers/gpu/drm/xe/Kconfig index 4b288eb3f5b0..c34be1be155b 100644 --- a/drivers/gpu/drm/xe/Kconfig +++ b/drivers/gpu/drm/xe/Kconfig @@ -39,7 +39,7 @@ config DRM_XE select DRM_TTM select DRM_TTM_HELPER select DRM_EXEC - select DRM_GPUSVM if !UML && DEVICE_PRIVATE + select DRM_GPUSVM if !UML select DRM_GPUVM select DRM_SCHED select MMU_NOTIFIER diff --git a/include/drm/drm_pagemap.h b/include/drm/drm_pagemap.h index 70a7991f784f..eb29e5309f0a 100644 --- a/include/drm/drm_pagemap.h +++ b/include/drm/drm_pagemap.h @@ -209,6 +209,19 @@ struct drm_pagemap_devmem_ops { struct dma_fence *pre_migrate_fence); }; +#if IS_ENABLED(CONFIG_ZONE_DEVICE) + +struct drm_pagemap *drm_pagemap_page_to_dpagemap(struct page *page); + +#else + +static inline struct drm_pagemap *drm_pagemap_page_to_dpagemap(struct page *page) +{ + return NULL; +} + +#endif /* IS_ENABLED(CONFIG_ZONE_DEVICE) */ + /** * struct drm_pagemap_devmem - Structure representing a GPU SVM device memory allocation * @@ -233,6 +246,8 @@ struct drm_pagemap_devmem { struct dma_fence *pre_migrate_fence; }; +#if IS_ENABLED(CONFIG_ZONE_DEVICE) + int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation, struct mm_struct *mm, unsigned long start, unsigned long end, @@ -243,8 +258,6 @@ int drm_pagemap_evict_to_ram(struct drm_pagemap_devmem *devmem_allocation); const struct dev_pagemap_ops *drm_pagemap_pagemap_ops_get(void); -struct drm_pagemap *drm_pagemap_page_to_dpagemap(struct page *page); - void drm_pagemap_devmem_init(struct drm_pagemap_devmem *devmem_allocation, struct device *dev, struct mm_struct *mm, const struct drm_pagemap_devmem_ops *ops, @@ -256,4 +269,6 @@ int drm_pagemap_populate_mm(struct drm_pagemap *dpagemap, struct mm_struct *mm, unsigned long timeslice_ms); +#endif /* IS_ENABLED(CONFIG_ZONE_DEVICE) */ + #endif -- cgit v1.2.3