From 6fe437cfe2cdc797b03f63b338a13fac96ed6a08 Mon Sep 17 00:00:00 2001 From: Levi Yun Date: Tue, 3 Dec 2024 14:31:08 +0000 Subject: firmware: arm_ffa: Fix the race around setting ffa_dev->properties Currently, ffa_dev->properties is set after the ffa_device_register() call return in ffa_setup_partitions(). This could potentially result in a race where the partition's properties is accessed while probing struct ffa_device before it is set. Update the ffa_device_register() to receive ffa_partition_info so all the data from the partition information received from the firmware can be updated into the struct ffa_device before the calling device_register() in ffa_device_register(). Fixes: e781858488b9 ("firmware: arm_ffa: Add initial FFA bus support for device enumeration") Signed-off-by: Levi Yun Message-Id: <20241203143109.1030514-2-yeoreum.yun@arm.com> Signed-off-by: Sudeep Holla --- include/linux/arm_ffa.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index a28e2a6a13d0..74169dd0f659 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -166,9 +166,12 @@ static inline void *ffa_dev_get_drvdata(struct ffa_device *fdev) return dev_get_drvdata(&fdev->dev); } +struct ffa_partition_info; + #if IS_REACHABLE(CONFIG_ARM_FFA_TRANSPORT) -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_ops *ops); +struct ffa_device * +ffa_device_register(const struct ffa_partition_info *part_info, + const struct ffa_ops *ops); void ffa_device_unregister(struct ffa_device *ffa_dev); int ffa_driver_register(struct ffa_driver *driver, struct module *owner, const char *mod_name); @@ -176,9 +179,9 @@ void ffa_driver_unregister(struct ffa_driver *driver); bool ffa_device_is_valid(struct ffa_device *ffa_dev); #else -static inline -struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, - const struct ffa_ops *ops) +static inline struct ffa_device * +ffa_device_register(const struct ffa_partition_info *part_info, + const struct ffa_ops *ops) { return NULL; } -- cgit v1.2.3 From 790fb9956eead785b720ccc0851f09a5ca3a093e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 2 Dec 2024 09:20:04 -0800 Subject: linux/dmaengine.h: fix a few kernel-doc warnings The comment block for "Interleaved Transfer Request" should not begin with "/**" since it is not in kernel-doc format. Fix doc name for enum sum_check_flags. Fix all (4) missing struct member warnings. Use "Warning:" for one "Note:" in enum dma_desc_metadata_mode since scripts/kernel-doc does not allow more than one Note: per function or identifier description. This leaves around 49 kernel-doc warnings like: include/linux/dmaengine.h:43: warning: Enum value 'DMA_OUT_OF_ORDER' not described in enum 'dma_status' and another scripts/kernel-doc problem with it not being able to parse some typedefs. Fixes: b14dab792dee ("DMAEngine: Define interleaved transfer request api") Fixes: ad283ea4a3ce ("async_tx: add sum check flags") Fixes: 272420214d26 ("dmaengine: Add DMA_CTRL_REUSE") Fixes: f067025bc676 ("dmaengine: add support to provide error result from a DMA transation") Fixes: d38a8c622a1b ("dmaengine: prepare for generic 'unmap' data") Fixes: 5878853fc938 ("dmaengine: Add API function dmaengine_prep_peripheral_dma_vec()") Signed-off-by: Randy Dunlap Cc: Dan Williams Cc: Dave Jiang Cc: Paul Cercueil Cc: Nuno Sa Cc: Vinod Koul Cc: dmaengine@vger.kernel.org Link: https://lore.kernel.org/r/20241202172004.76020-1-rdunlap@infradead.org Signed-off-by: Vinod Koul --- include/linux/dmaengine.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index b137fdb56093..346251bf1026 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -84,7 +84,7 @@ enum dma_transfer_direction { DMA_TRANS_NONE, }; -/** +/* * Interleaved Transfer Request * ---------------------------- * A chunk is collection of contiguous bytes to be transferred. @@ -223,7 +223,7 @@ enum sum_check_bits { }; /** - * enum pq_check_flags - result of async_{xor,pq}_zero_sum operations + * enum sum_check_flags - result of async_{xor,pq}_zero_sum operations * @SUM_CHECK_P_RESULT - 1 if xor zero sum error, 0 otherwise * @SUM_CHECK_Q_RESULT - 1 if reed-solomon zero sum error, 0 otherwise */ @@ -286,7 +286,7 @@ typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t; * pointer to the engine's metadata area * 4. Read out the metadata from the pointer * - * Note: the two mode is not compatible and clients must use one mode for a + * Warning: the two modes are not compatible and clients must use one mode for a * descriptor. */ enum dma_desc_metadata_mode { @@ -594,9 +594,13 @@ struct dma_descriptor_metadata_ops { * @phys: physical address of the descriptor * @chan: target channel for this operation * @tx_submit: accept the descriptor, assign ordered cookie and mark the + * @desc_free: driver's callback function to free a resusable descriptor + * after completion * descriptor pending. To be pushed on .issue_pending() call * @callback: routine to call after this operation is complete + * @callback_result: error result from a DMA transaction * @callback_param: general parameter to pass to the callback routine + * @unmap: hook for generic DMA unmap data * @desc_metadata_mode: core managed metadata mode to protect mixed use of * DESC_METADATA_CLIENT or DESC_METADATA_ENGINE. Otherwise * DESC_METADATA_NONE @@ -827,6 +831,9 @@ struct dma_filter { * @device_prep_dma_memset: prepares a memset operation * @device_prep_dma_memset_sg: prepares a memset operation over a scatter list * @device_prep_dma_interrupt: prepares an end of chain interrupt operation + * @device_prep_peripheral_dma_vec: prepares a scatter-gather DMA transfer, + * where the address and size of each segment is located in one entry of + * the dma_vec array. * @device_prep_slave_sg: prepares a slave dma operation * @device_prep_dma_cyclic: prepare a cyclic dma operation suitable for audio. * The function takes a buffer of size buf_len. The callback function will -- cgit v1.2.3 From dcbef0798eb825cd584f7a93f62bed63f7fbbfc9 Mon Sep 17 00:00:00 2001 From: Lizhi Hou Date: Wed, 18 Sep 2024 11:10:22 -0700 Subject: dmaengine: amd: qdma: Remove using the private get and set dma_ops APIs The get_dma_ops and set_dma_ops APIs were never for driver to use. Remove these calls from QDMA driver. Instead, pass the DMA device pointer from the qdma_platdata structure. Fixes: 73d5fc92a11c ("dmaengine: amd: qdma: Add AMD QDMA driver") Signed-off-by: Lizhi Hou Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240918181022.2155715-1-lizhi.hou@amd.com Signed-off-by: Vinod Koul --- drivers/dma/amd/qdma/qdma.c | 28 ++++++++++++---------------- include/linux/platform_data/amd_qdma.h | 2 ++ 2 files changed, 14 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/amd/qdma/qdma.c b/drivers/dma/amd/qdma/qdma.c index 6d9079458fe9..66f00ad67351 100644 --- a/drivers/dma/amd/qdma/qdma.c +++ b/drivers/dma/amd/qdma/qdma.c @@ -7,9 +7,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -492,18 +492,9 @@ static int qdma_device_verify(struct qdma_device *qdev) static int qdma_device_setup(struct qdma_device *qdev) { - struct device *dev = &qdev->pdev->dev; u32 ring_sz = QDMA_DEFAULT_RING_SIZE; int ret = 0; - while (dev && get_dma_ops(dev)) - dev = dev->parent; - if (!dev) { - qdma_err(qdev, "dma device not found"); - return -EINVAL; - } - set_dma_ops(&qdev->pdev->dev, get_dma_ops(dev)); - ret = qdma_setup_fmap_context(qdev); if (ret) { qdma_err(qdev, "Failed setup fmap context"); @@ -548,11 +539,12 @@ static void qdma_free_queue_resources(struct dma_chan *chan) { struct qdma_queue *queue = to_qdma_queue(chan); struct qdma_device *qdev = queue->qdev; - struct device *dev = qdev->dma_dev.dev; + struct qdma_platdata *pdata; qdma_clear_queue_context(queue); vchan_free_chan_resources(&queue->vchan); - dma_free_coherent(dev, queue->ring_size * QDMA_MM_DESC_SIZE, + pdata = dev_get_platdata(&qdev->pdev->dev); + dma_free_coherent(pdata->dma_dev, queue->ring_size * QDMA_MM_DESC_SIZE, queue->desc_base, queue->dma_desc_base); } @@ -565,6 +557,7 @@ static int qdma_alloc_queue_resources(struct dma_chan *chan) struct qdma_queue *queue = to_qdma_queue(chan); struct qdma_device *qdev = queue->qdev; struct qdma_ctxt_sw_desc desc; + struct qdma_platdata *pdata; size_t size; int ret; @@ -572,8 +565,9 @@ static int qdma_alloc_queue_resources(struct dma_chan *chan) if (ret) return ret; + pdata = dev_get_platdata(&qdev->pdev->dev); size = queue->ring_size * QDMA_MM_DESC_SIZE; - queue->desc_base = dma_alloc_coherent(qdev->dma_dev.dev, size, + queue->desc_base = dma_alloc_coherent(pdata->dma_dev, size, &queue->dma_desc_base, GFP_KERNEL); if (!queue->desc_base) { @@ -588,7 +582,7 @@ static int qdma_alloc_queue_resources(struct dma_chan *chan) if (ret) { qdma_err(qdev, "Failed to setup SW desc ctxt for %s", chan->name); - dma_free_coherent(qdev->dma_dev.dev, size, queue->desc_base, + dma_free_coherent(pdata->dma_dev, size, queue->desc_base, queue->dma_desc_base); return ret; } @@ -948,8 +942,9 @@ static int qdma_init_error_irq(struct qdma_device *qdev) static int qdmam_alloc_qintr_rings(struct qdma_device *qdev) { - u32 ctxt[QDMA_CTXT_REGMAP_LEN]; + struct qdma_platdata *pdata = dev_get_platdata(&qdev->pdev->dev); struct device *dev = &qdev->pdev->dev; + u32 ctxt[QDMA_CTXT_REGMAP_LEN]; struct qdma_intr_ring *ring; struct qdma_ctxt_intr intr_ctxt; u32 vector; @@ -969,7 +964,8 @@ static int qdmam_alloc_qintr_rings(struct qdma_device *qdev) ring->msix_id = qdev->err_irq_idx + i + 1; ring->ridx = i; ring->color = 1; - ring->base = dmam_alloc_coherent(dev, QDMA_INTR_RING_SIZE, + ring->base = dmam_alloc_coherent(pdata->dma_dev, + QDMA_INTR_RING_SIZE, &ring->dev_base, GFP_KERNEL); if (!ring->base) { qdma_err(qdev, "Failed to alloc intr ring %d", i); diff --git a/include/linux/platform_data/amd_qdma.h b/include/linux/platform_data/amd_qdma.h index 576d952f97ed..967a6ef31cf9 100644 --- a/include/linux/platform_data/amd_qdma.h +++ b/include/linux/platform_data/amd_qdma.h @@ -26,11 +26,13 @@ struct dma_slave_map; * @max_mm_channels: Maximum number of MM DMA channels in each direction * @device_map: DMA slave map * @irq_index: The index of first IRQ + * @dma_dev: The device pointer for dma operations */ struct qdma_platdata { u32 max_mm_channels; u32 irq_index; struct dma_slave_map *device_map; + struct device *dma_dev; }; #endif /* _PLATDATA_AMD_QDMA_H */ -- cgit v1.2.3 From e05feab22fd7dabcd6d272c4e2401ec1acdfdb9b Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Tue, 3 Dec 2024 15:45:37 +0200 Subject: RDMA/mlx5: Enforce same type port association for multiport RoCE Different core device types such as PFs and VFs shouldn't be affiliated together since they have different capabilities, fix that by enforcing type check before doing the affiliation. Fixes: 32f69e4be269 ("{net, IB}/mlx5: Manage port association for multiport RoCE") Reviewed-by: Mark Bloch Signed-off-by: Patrisious Haddad Link: https://patch.msgid.link/88699500f690dff1c1852c1ddb71f8a1cc8b956e.1733233480.git.leonro@nvidia.com Reviewed-by: Mateusz Polchlopek Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 6 ++++-- include/linux/mlx5/driver.h | 6 ++++++ 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index bc7930d0c564..c2314797afc9 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3639,7 +3639,8 @@ static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list, list) { if (dev->sys_image_guid == mpi->sys_image_guid && - (mlx5_core_native_port_num(mpi->mdev) - 1) == i) { + (mlx5_core_native_port_num(mpi->mdev) - 1) == i && + mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) { bound = mlx5_ib_bind_slave_port(dev, mpi); } @@ -4785,7 +4786,8 @@ static int mlx5r_mp_probe(struct auxiliary_device *adev, mutex_lock(&mlx5_ib_multiport_mutex); list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) { - if (dev->sys_image_guid == mpi->sys_image_guid) + if (dev->sys_image_guid == mpi->sys_image_guid && + mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) bound = mlx5_ib_bind_slave_port(dev, mpi); if (bound) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fc7e6153b73d..4f9e6f6dbaab 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1202,6 +1202,12 @@ static inline bool mlx5_core_is_vf(const struct mlx5_core_dev *dev) return dev->coredev_type == MLX5_COREDEV_VF; } +static inline bool mlx5_core_same_coredev_type(const struct mlx5_core_dev *dev1, + const struct mlx5_core_dev *dev2) +{ + return dev1->coredev_type == dev2->coredev_type; +} + static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev) { return dev->caps.embedded_cpu; -- cgit v1.2.3 From 07a756a49f4b4290b49ea46e089cbe6f79ff8d26 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 6 Nov 2024 07:42:47 -0800 Subject: Drivers: hv: util: Avoid accessing a ringbuffer not initialized yet If the KVP (or VSS) daemon starts before the VMBus channel's ringbuffer is fully initialized, we can hit the panic below: hv_utils: Registering HyperV Utility Driver hv_vmbus: registering driver hv_utils ... BUG: kernel NULL pointer dereference, address: 0000000000000000 CPU: 44 UID: 0 PID: 2552 Comm: hv_kvp_daemon Tainted: G E 6.11.0-rc3+ #1 RIP: 0010:hv_pkt_iter_first+0x12/0xd0 Call Trace: ... vmbus_recvpacket hv_kvp_onchannelcallback vmbus_on_event tasklet_action_common tasklet_action handle_softirqs irq_exit_rcu sysvec_hyperv_stimer0 asm_sysvec_hyperv_stimer0 ... kvp_register_done hvt_op_read vfs_read ksys_read __x64_sys_read This can happen because the KVP/VSS channel callback can be invoked even before the channel is fully opened: 1) as soon as hv_kvp_init() -> hvutil_transport_init() creates /dev/vmbus/hv_kvp, the kvp daemon can open the device file immediately and register itself to the driver by writing a message KVP_OP_REGISTER1 to the file (which is handled by kvp_on_msg() ->kvp_handle_handshake()) and reading the file for the driver's response, which is handled by hvt_op_read(), which calls hvt->on_read(), i.e. kvp_register_done(). 2) the problem with kvp_register_done() is that it can cause the channel callback to be called even before the channel is fully opened, and when the channel callback is starting to run, util_probe()-> vmbus_open() may have not initialized the ringbuffer yet, so the callback can hit the panic of NULL pointer dereference. To reproduce the panic consistently, we can add a "ssleep(10)" for KVP in __vmbus_open(), just before the first hv_ringbuffer_init(), and then we unload and reload the driver hv_utils, and run the daemon manually within the 10 seconds. Fix the panic by reordering the steps in util_probe() so the char dev entry used by the KVP or VSS daemon is not created until after vmbus_open() has completed. This reordering prevents the race condition from happening. Reported-by: Dexuan Cui Fixes: e0fa3e5e7df6 ("Drivers: hv: utils: fix a race on userspace daemons registration") Cc: stable@vger.kernel.org Signed-off-by: Michael Kelley Acked-by: Wei Liu Link: https://lore.kernel.org/r/20241106154247.2271-3-mhklinux@outlook.com Signed-off-by: Wei Liu Message-ID: <20241106154247.2271-3-mhklinux@outlook.com> --- drivers/hv/hv_kvp.c | 6 ++++++ drivers/hv/hv_snapshot.c | 6 ++++++ drivers/hv/hv_util.c | 9 +++++++++ drivers/hv/hyperv_vmbus.h | 2 ++ include/linux/hyperv.h | 1 + 5 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c index 29e01247a087..7400a5a4d2bd 100644 --- a/drivers/hv/hv_kvp.c +++ b/drivers/hv/hv_kvp.c @@ -767,6 +767,12 @@ hv_kvp_init(struct hv_util_service *srv) */ kvp_transaction.state = HVUTIL_DEVICE_INIT; + return 0; +} + +int +hv_kvp_init_transport(void) +{ hvt = hvutil_transport_init(kvp_devname, CN_KVP_IDX, CN_KVP_VAL, kvp_on_msg, kvp_on_reset); if (!hvt) diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c index 86d87486ed40..bde637a96c37 100644 --- a/drivers/hv/hv_snapshot.c +++ b/drivers/hv/hv_snapshot.c @@ -389,6 +389,12 @@ hv_vss_init(struct hv_util_service *srv) */ vss_transaction.state = HVUTIL_DEVICE_INIT; + return 0; +} + +int +hv_vss_init_transport(void) +{ hvt = hvutil_transport_init(vss_devname, CN_VSS_IDX, CN_VSS_VAL, vss_on_msg, vss_on_reset); if (!hvt) { diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index 370722220134..36ee89c0358b 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -141,6 +141,7 @@ static struct hv_util_service util_heartbeat = { static struct hv_util_service util_kvp = { .util_cb = hv_kvp_onchannelcallback, .util_init = hv_kvp_init, + .util_init_transport = hv_kvp_init_transport, .util_pre_suspend = hv_kvp_pre_suspend, .util_pre_resume = hv_kvp_pre_resume, .util_deinit = hv_kvp_deinit, @@ -149,6 +150,7 @@ static struct hv_util_service util_kvp = { static struct hv_util_service util_vss = { .util_cb = hv_vss_onchannelcallback, .util_init = hv_vss_init, + .util_init_transport = hv_vss_init_transport, .util_pre_suspend = hv_vss_pre_suspend, .util_pre_resume = hv_vss_pre_resume, .util_deinit = hv_vss_deinit, @@ -611,6 +613,13 @@ static int util_probe(struct hv_device *dev, if (ret) goto error; + if (srv->util_init_transport) { + ret = srv->util_init_transport(); + if (ret) { + vmbus_close(dev->channel); + goto error; + } + } return 0; error: diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index d2856023d53c..52cb744b4d7f 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -370,12 +370,14 @@ void vmbus_on_event(unsigned long data); void vmbus_on_msg_dpc(unsigned long data); int hv_kvp_init(struct hv_util_service *srv); +int hv_kvp_init_transport(void); void hv_kvp_deinit(void); int hv_kvp_pre_suspend(void); int hv_kvp_pre_resume(void); void hv_kvp_onchannelcallback(void *context); int hv_vss_init(struct hv_util_service *srv); +int hv_vss_init_transport(void); void hv_vss_deinit(void); int hv_vss_pre_suspend(void); int hv_vss_pre_resume(void); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 22c22fb91042..02a226bcf0ed 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1559,6 +1559,7 @@ struct hv_util_service { void *channel; void (*util_cb)(void *); int (*util_init)(struct hv_util_service *); + int (*util_init_transport)(void); void (*util_deinit)(void); int (*util_pre_suspend)(void); int (*util_pre_resume)(void); -- cgit v1.2.3 From 51d20d1dacbec589d459e11fc88fbca419f84a99 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 9 Dec 2024 19:42:40 +0800 Subject: iomap: fix zero padding data issue in concurrent append writes During concurrent append writes to XFS filesystem, zero padding data may appear in the file after power failure. This happens due to imprecise disk size updates when handling write completion. Consider this scenario with concurrent append writes same file: Thread 1: Thread 2: ------------ ----------- write [A, A+B] update inode size to A+B submit I/O [A, A+BS] write [A+B, A+B+C] update inode size to A+B+C After reboot: 1) with A+B+C < A+BS, the file has zero padding in range [A+B, A+B+C] |< Block Size (BS) >| |DDDDDDDDDDDDDDDD0000000000000000| ^ ^ ^ A A+B A+B+C (EOF) 2) with A+B+C > A+BS, the file has zero padding in range [A+B, A+BS] |< Block Size (BS) >|< Block Size (BS) >| |DDDDDDDDDDDDDDDD0000000000000000|00000000000000000000000000000000| ^ ^ ^ ^ A A+B A+BS A+B+C (EOF) D = Valid Data 0 = Zero Padding The issue stems from disk size being set to min(io_offset + io_size, inode->i_size) at I/O completion. Since io_offset+io_size is block size granularity, it may exceed the actual valid file data size. In the case of concurrent append writes, inode->i_size may be larger than the actual range of valid file data written to disk, leading to inaccurate disk size updates. This patch modifies the meaning of io_size to represent the size of valid data within EOF in an ioend. If the ioend spans beyond i_size, io_size will be trimmed to provide the file with more accurate size information. This is particularly useful for on-disk size updates at completion time. After this change, ioends that span i_size will not grow or merge with other ioends in concurrent scenarios. However, these cases that need growth/merging rarely occur and it seems no noticeable performance impact. Although rounding up io_size could enable ioend growth/merging in these scenarios, we decided to keep the code simple after discussion [1]. Another benefit is that it makes the xfs_ioend_is_append() check more accurate, which can reduce unnecessary end bio callbacks of xfs_end_bio() in certain scenarios, such as repeated writes at the file tail without extending the file size. Link [1]: https://patchwork.kernel.org/project/xfs/patch/20241113091907.56937-1-leo.lilong@huawei.com Fixes: ae259a9c8593 ("fs: introduce iomap infrastructure") # goes further back than this Signed-off-by: Long Li Link: https://lore.kernel.org/r/20241209114241.3725722-3-leo.lilong@huawei.com Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/iomap.h | 2 +- 2 files changed, 46 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index bcc7831d03af..54dc27d92781 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1794,7 +1794,52 @@ new_ioend: if (ifs) atomic_add(len, &ifs->write_bytes_pending); + + /* + * Clamp io_offset and io_size to the incore EOF so that ondisk + * file size updates in the ioend completion are byte-accurate. + * This avoids recovering files with zeroed tail regions when + * writeback races with appending writes: + * + * Thread 1: Thread 2: + * ------------ ----------- + * write [A, A+B] + * update inode size to A+B + * submit I/O [A, A+BS] + * write [A+B, A+B+C] + * update inode size to A+B+C + * + * + * + * After reboot: + * 1) with A+B+C < A+BS, the file has zero padding in range + * [A+B, A+B+C] + * + * |< Block Size (BS) >| + * |DDDDDDDDDDDD0000000000000| + * ^ ^ ^ + * A A+B A+B+C + * (EOF) + * + * 2) with A+B+C > A+BS, the file has zero padding in range + * [A+B, A+BS] + * + * |< Block Size (BS) >|< Block Size (BS) >| + * |DDDDDDDDDDDD0000000000000|00000000000000000000000000| + * ^ ^ ^ ^ + * A A+B A+BS A+B+C + * (EOF) + * + * D = Valid Data + * 0 = Zero Padding + * + * Note that this defeats the ability to chain the ioends of + * appending writes. + */ wpc->ioend->io_size += len; + if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos) + wpc->ioend->io_size = end_pos - wpc->ioend->io_offset; + wbc_account_cgroup_owner(wbc, folio, len); return 0; } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 5675af6b740c..75bf54e76f3b 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -335,7 +335,7 @@ struct iomap_ioend { u16 io_type; u16 io_flags; /* IOMAP_F_* */ struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of the extent */ + size_t io_size; /* size of data within eof */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ struct bio io_bio; /* MUST BE LAST! */ -- cgit v1.2.3 From 0ef8047b737d7480a5d4c46d956e97c190f13050 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 29 Nov 2024 16:15:54 +0100 Subject: x86/static-call: provide a way to do very early static-call updates Add static_call_update_early() for updating static-call targets in very early boot. This will be needed for support of Xen guest type specific hypercall functions. This is part of XSA-466 / CVE-2024-53241. Reported-by: Andrew Cooper Signed-off-by: Juergen Gross Co-developed-by: Peter Zijlstra Co-developed-by: Josh Poimboeuf --- arch/x86/include/asm/static_call.h | 15 +++++++++++++++ arch/x86/include/asm/sync_core.h | 6 +++--- arch/x86/kernel/static_call.c | 9 +++++++++ include/linux/compiler.h | 37 ++++++++++++++++++++++++++----------- include/linux/static_call.h | 1 + kernel/static_call_inline.c | 2 +- 6 files changed, 55 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h index 125c407e2abe..41502bd2afd6 100644 --- a/arch/x86/include/asm/static_call.h +++ b/arch/x86/include/asm/static_call.h @@ -65,4 +65,19 @@ extern bool __static_call_fixup(void *tramp, u8 op, void *dest); +extern void __static_call_update_early(void *tramp, void *func); + +#define static_call_update_early(name, _func) \ +({ \ + typeof(&STATIC_CALL_TRAMP(name)) __F = (_func); \ + if (static_call_initialized) { \ + __static_call_update(&STATIC_CALL_KEY(name), \ + STATIC_CALL_TRAMP_ADDR(name), __F);\ + } else { \ + WRITE_ONCE(STATIC_CALL_KEY(name).func, _func); \ + __static_call_update_early(STATIC_CALL_TRAMP_ADDR(name),\ + __F); \ + } \ +}) + #endif /* _ASM_STATIC_CALL_H */ diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h index ab7382f92aff..96bda43538ee 100644 --- a/arch/x86/include/asm/sync_core.h +++ b/arch/x86/include/asm/sync_core.h @@ -8,7 +8,7 @@ #include #ifdef CONFIG_X86_32 -static inline void iret_to_self(void) +static __always_inline void iret_to_self(void) { asm volatile ( "pushfl\n\t" @@ -19,7 +19,7 @@ static inline void iret_to_self(void) : ASM_CALL_CONSTRAINT : : "memory"); } #else -static inline void iret_to_self(void) +static __always_inline void iret_to_self(void) { unsigned int tmp; @@ -55,7 +55,7 @@ static inline void iret_to_self(void) * Like all of Linux's memory ordering operations, this is a * compiler barrier as well. */ -static inline void sync_core(void) +static __always_inline void sync_core(void) { /* * The SERIALIZE instruction is the most straightforward way to diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index 4eefaac64c6c..9eed0c144dad 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -172,6 +172,15 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) } EXPORT_SYMBOL_GPL(arch_static_call_transform); +noinstr void __static_call_update_early(void *tramp, void *func) +{ + BUG_ON(system_state != SYSTEM_BOOTING); + BUG_ON(!early_boot_irqs_disabled); + BUG_ON(static_call_initialized); + __text_gen_insn(tramp, JMP32_INSN_OPCODE, tramp, func, JMP32_INSN_SIZE); + sync_core(); +} + #ifdef CONFIG_MITIGATION_RETHUNK /* * This is called by apply_returns() to fix up static call trampolines, diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 469a64dd6495..240c632c5b95 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -216,28 +216,43 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* __KERNEL__ */ +/** + * offset_to_ptr - convert a relative memory offset to an absolute pointer + * @off: the address of the 32-bit offset value + */ +static inline void *offset_to_ptr(const int *off) +{ + return (void *)((unsigned long)off + *off); +} + +#endif /* __ASSEMBLY__ */ + +#ifdef CONFIG_64BIT +#define ARCH_SEL(a,b) a +#else +#define ARCH_SEL(a,b) b +#endif + /* * Force the compiler to emit 'sym' as a symbol, so that we can reference * it from inline assembler. Necessary in case 'sym' could be inlined * otherwise, or eliminated entirely due to lack of references that are * visible to the compiler. */ -#define ___ADDRESSABLE(sym, __attrs) \ - static void * __used __attrs \ +#define ___ADDRESSABLE(sym, __attrs) \ + static void * __used __attrs \ __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym; + #define __ADDRESSABLE(sym) \ ___ADDRESSABLE(sym, __section(".discard.addressable")) -/** - * offset_to_ptr - convert a relative memory offset to an absolute pointer - * @off: the address of the 32-bit offset value - */ -static inline void *offset_to_ptr(const int *off) -{ - return (void *)((unsigned long)off + *off); -} +#define __ADDRESSABLE_ASM(sym) \ + .pushsection .discard.addressable,"aw"; \ + .align ARCH_SEL(8,4); \ + ARCH_SEL(.quad, .long) __stringify(sym); \ + .popsection; -#endif /* __ASSEMBLY__ */ +#define __ADDRESSABLE_ASM_STR(sym) __stringify(__ADDRESSABLE_ASM(sym)) #ifdef __CHECKER__ #define __BUILD_BUG_ON_ZERO_MSG(e, msg) (0) diff --git a/include/linux/static_call.h b/include/linux/static_call.h index 141e6b176a1b..785980af8972 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -138,6 +138,7 @@ #ifdef CONFIG_HAVE_STATIC_CALL #include +extern int static_call_initialized; /* * Either @site or @tramp can be NULL. */ diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c index 5259cda486d0..bb7d066a7c39 100644 --- a/kernel/static_call_inline.c +++ b/kernel/static_call_inline.c @@ -15,7 +15,7 @@ extern struct static_call_site __start_static_call_sites[], extern struct static_call_tramp_key __start_static_call_tramp_key[], __stop_static_call_tramp_key[]; -static int static_call_initialized; +int static_call_initialized; /* * Must be called before early_initcall() to be effective. -- cgit v1.2.3 From 239d87327dcd361b0098038995f8908f3296864f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 12 Dec 2024 17:28:06 -0800 Subject: fortify: Hide run-time copy size from value range tracking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC performs value range tracking for variables as a way to provide better diagnostics. One place this is regularly seen is with warnings associated with bounds-checking, e.g. -Wstringop-overflow, -Wstringop-overread, -Warray-bounds, etc. In order to keep the signal-to-noise ratio high, warnings aren't emitted when a value range spans the entire value range representable by a given variable. For example: unsigned int len; char dst[8]; ... memcpy(dst, src, len); If len's value is unknown, it has the full "unsigned int" range of [0, UINT_MAX], and GCC's compile-time bounds checks against memcpy() will be ignored. However, when a code path has been able to narrow the range: if (len > 16) return; memcpy(dst, src, len); Then the range will be updated for the execution path. Above, len is now [0, 16] when reading memcpy(), so depending on other optimizations, we might see a -Wstringop-overflow warning like: error: '__builtin_memcpy' writing between 9 and 16 bytes into region of size 8 [-Werror=stringop-overflow] When building with CONFIG_FORTIFY_SOURCE, the fortified run-time bounds checking can appear to narrow value ranges of lengths for memcpy(), depending on how the compiler constructs the execution paths during optimization passes, due to the checks against the field sizes. For example: if (p_size_field != SIZE_MAX && p_size != p_size_field && p_size_field < size) As intentionally designed, these checks only affect the kernel warnings emitted at run-time and do not block the potentially overflowing memcpy(), so GCC thinks it needs to produce a warning about the resulting value range that might be reaching the memcpy(). We have seen this manifest a few times now, with the most recent being with cpumasks: In function ‘bitmap_copy’, inlined from ‘cpumask_copy’ at ./include/linux/cpumask.h:839:2, inlined from ‘__padata_set_cpumasks’ at kernel/padata.c:730:2: ./include/linux/fortify-string.h:114:33: error: ‘__builtin_memcpy’ reading between 257 and 536870904 bytes from a region of size 256 [-Werror=stringop-overread] 114 | #define __underlying_memcpy __builtin_memcpy | ^ ./include/linux/fortify-string.h:633:9: note: in expansion of macro ‘__underlying_memcpy’ 633 | __underlying_##op(p, q, __fortify_size); \ | ^~~~~~~~~~~~~ ./include/linux/fortify-string.h:678:26: note: in expansion of macro ‘__fortify_memcpy_chk’ 678 | #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ | ^~~~~~~~~~~~~~~~~~~~ ./include/linux/bitmap.h:259:17: note: in expansion of macro ‘memcpy’ 259 | memcpy(dst, src, len); | ^~~~~~ kernel/padata.c: In function ‘__padata_set_cpumasks’: kernel/padata.c:713:48: note: source object ‘pcpumask’ of size [0, 256] 713 | cpumask_var_t pcpumask, | ~~~~~~~~~~~~~~^~~~~~~~ This warning is _not_ emitted when CONFIG_FORTIFY_SOURCE is disabled, and with the recent -fdiagnostics-details we can confirm the origin of the warning is due to FORTIFY's bounds checking: ../include/linux/bitmap.h:259:17: note: in expansion of macro 'memcpy' 259 | memcpy(dst, src, len); | ^~~~~~ '__padata_set_cpumasks': events 1-2 ../include/linux/fortify-string.h:613:36: 612 | if (p_size_field != SIZE_MAX && | ~~~~~~~~~~~~~~~~~~~~~~~~~~~ 613 | p_size != p_size_field && p_size_field < size) | ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~ | | | (1) when the condition is evaluated to false | (2) when the condition is evaluated to true '__padata_set_cpumasks': event 3 114 | #define __underlying_memcpy __builtin_memcpy | ^ | | | (3) out of array bounds here Note that the cpumask warning started appearing since bitmap functions were recently marked __always_inline in commit ed8cd2b3bd9f ("bitmap: Switch from inline to __always_inline"), which allowed GCC to gain visibility into the variables as they passed through the FORTIFY implementation. In order to silence these false positives but keep otherwise deterministic compile-time warnings intact, hide the length variable from GCC with OPTIMIZE_HIDE_VAR() before calling the builtin memcpy. Additionally add a comment about why all the macro args have copies with const storage. Reported-by: "Thomas Weißschuh" Closes: https://lore.kernel.org/all/db7190c8-d17f-4a0d-bc2f-5903c79f36c2@t-8ch.de/ Reported-by: Nilay Shroff Closes: https://lore.kernel.org/all/20241112124127.1666300-1-nilay@linux.ibm.com/ Tested-by: Nilay Shroff Acked-by: Yury Norov Acked-by: Greg Kroah-Hartman Signed-off-by: Kees Cook --- include/linux/fortify-string.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 0d99bf11d260..e4ce1cae03bf 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -616,6 +616,12 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, return false; } +/* + * To work around what seems to be an optimizer bug, the macro arguments + * need to have const copies or the values end up changed by the time they + * reach fortify_warn_once(). See commit 6f7630b1b5bc ("fortify: Capture + * __bos() results in const temp vars") for more details. + */ #define __fortify_memcpy_chk(p, q, size, p_size, q_size, \ p_size_field, q_size_field, op) ({ \ const size_t __fortify_size = (size_t)(size); \ @@ -623,6 +629,8 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, const size_t __q_size = (q_size); \ const size_t __p_size_field = (p_size_field); \ const size_t __q_size_field = (q_size_field); \ + /* Keep a mutable version of the size for the final copy. */ \ + size_t __copy_size = __fortify_size; \ fortify_warn_once(fortify_memcpy_chk(__fortify_size, __p_size, \ __q_size, __p_size_field, \ __q_size_field, FORTIFY_FUNC_ ##op), \ @@ -630,7 +638,11 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, __fortify_size, \ "field \"" #p "\" at " FILE_LINE, \ __p_size_field); \ - __underlying_##op(p, q, __fortify_size); \ + /* Hide only the run-time size from value range tracking to */ \ + /* silence compile-time false positive bounds warnings. */ \ + if (!__builtin_constant_p(__copy_size)) \ + OPTIMIZER_HIDE_VAR(__copy_size); \ + __underlying_##op(p, q, __copy_size); \ }) /* -- cgit v1.2.3 From 020b40f3562495f3c703a283ece145ffec19e82d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Dec 2024 08:21:46 -0700 Subject: io_uring: make ctx->timeout_lock a raw spinlock Chase reports that their tester complaints about a locking context mismatch: ============================= [ BUG: Invalid wait context ] 6.13.0-rc1-gf137f14b7ccb-dirty #9 Not tainted ----------------------------- syz.1.25198/182604 is trying to lock: ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: spin_lock_irq include/linux/spinlock.h:376 [inline] ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: io_match_task_safe io_uring/io_uring.c:218 [inline] ffff88805e66a358 (&ctx->timeout_lock){-.-.}-{3:3}, at: io_match_task_safe+0x187/0x250 io_uring/io_uring.c:204 other info that might help us debug this: context-{5:5} 1 lock held by syz.1.25198/182604: #0: ffff88802b7d48c0 (&acct->lock){+.+.}-{2:2}, at: io_acct_cancel_pending_work+0x2d/0x6b0 io_uring/io-wq.c:1049 stack backtrace: CPU: 0 UID: 0 PID: 182604 Comm: syz.1.25198 Not tainted 6.13.0-rc1-gf137f14b7ccb-dirty #9 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x82/0xd0 lib/dump_stack.c:120 print_lock_invalid_wait_context kernel/locking/lockdep.c:4826 [inline] check_wait_context kernel/locking/lockdep.c:4898 [inline] __lock_acquire+0x883/0x3c80 kernel/locking/lockdep.c:5176 lock_acquire.part.0+0x11b/0x370 kernel/locking/lockdep.c:5849 __raw_spin_lock_irq include/linux/spinlock_api_smp.h:119 [inline] _raw_spin_lock_irq+0x36/0x50 kernel/locking/spinlock.c:170 spin_lock_irq include/linux/spinlock.h:376 [inline] io_match_task_safe io_uring/io_uring.c:218 [inline] io_match_task_safe+0x187/0x250 io_uring/io_uring.c:204 io_acct_cancel_pending_work+0xb8/0x6b0 io_uring/io-wq.c:1052 io_wq_cancel_pending_work io_uring/io-wq.c:1074 [inline] io_wq_cancel_cb+0xb0/0x390 io_uring/io-wq.c:1112 io_uring_try_cancel_requests+0x15e/0xd70 io_uring/io_uring.c:3062 io_uring_cancel_generic+0x6ec/0x8c0 io_uring/io_uring.c:3140 io_uring_files_cancel include/linux/io_uring.h:20 [inline] do_exit+0x494/0x27a0 kernel/exit.c:894 do_group_exit+0xb3/0x250 kernel/exit.c:1087 get_signal+0x1d77/0x1ef0 kernel/signal.c:3017 arch_do_signal_or_restart+0x79/0x5b0 arch/x86/kernel/signal.c:337 exit_to_user_mode_loop kernel/entry/common.c:111 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:329 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] syscall_exit_to_user_mode+0x150/0x2a0 kernel/entry/common.c:218 do_syscall_64+0xd8/0x250 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x77/0x7f which is because io_uring has ctx->timeout_lock nesting inside the io-wq acct lock, the latter of which is used from inside the scheduler and hence is a raw spinlock, while the former is a "normal" spinlock and can hence be sleeping on PREEMPT_RT. Change ctx->timeout_lock to be a raw spinlock to solve this nesting dependency on PREEMPT_RT=y. Reported-by: chase xd Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 10 +++++----- io_uring/timeout.c | 40 ++++++++++++++++++++-------------------- 3 files changed, 26 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 011860ade268..fd4cdb0860a2 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -345,7 +345,7 @@ struct io_ring_ctx { /* timeouts */ struct { - spinlock_t timeout_lock; + raw_spinlock_t timeout_lock; struct list_head timeout_list; struct list_head ltimeout_list; unsigned cq_last_tm_flush; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 06ff41484e29..605625e932eb 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -215,9 +215,9 @@ bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, struct io_ring_ctx *ctx = head->ctx; /* protect against races with linked timeouts */ - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); matched = io_match_linked(head); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } else { matched = io_match_linked(head); } @@ -333,7 +333,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->cq_wait); init_waitqueue_head(&ctx->poll_wq); spin_lock_init(&ctx->completion_lock); - spin_lock_init(&ctx->timeout_lock); + raw_spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); @@ -498,10 +498,10 @@ static void io_prep_async_link(struct io_kiocb *req) if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); io_for_each_link(cur, req) io_prep_async_work(cur); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } else { io_for_each_link(cur, req) io_prep_async_work(cur); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index f3d502717aeb..bbe58638eca7 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -74,10 +74,10 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) if (!io_timeout_finish(timeout, data)) { if (io_req_post_cqe(req, -ETIME, IORING_CQE_F_MORE)) { /* re-arm timer */ - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); return; } } @@ -109,7 +109,7 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) u32 seq; struct io_timeout *timeout, *tmp; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { @@ -134,7 +134,7 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) io_kill_timeout(req, 0); } ctx->cq_last_tm_flush = seq; - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) @@ -200,9 +200,9 @@ void io_disarm_next(struct io_kiocb *req) } else if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); link = io_disarm_linked_timeout(req); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); if (link) io_req_queue_tw_complete(link, -ECANCELED); } @@ -238,11 +238,11 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->timeout_lock, flags); + raw_spin_lock_irqsave(&ctx->timeout_lock, flags); list_del_init(&timeout->list); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - spin_unlock_irqrestore(&ctx->timeout_lock, flags); + raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) req_set_fail(req); @@ -285,9 +285,9 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { struct io_kiocb *req; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); req = io_timeout_extract(ctx, cd); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); if (IS_ERR(req)) return PTR_ERR(req); @@ -330,7 +330,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->timeout_lock, flags); + raw_spin_lock_irqsave(&ctx->timeout_lock, flags); prev = timeout->head; timeout->head = NULL; @@ -345,7 +345,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) } list_del(&timeout->list); timeout->prev = prev; - spin_unlock_irqrestore(&ctx->timeout_lock, flags); + raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); req->io_task_work.func = io_req_task_link_timeout; io_req_task_work_add(req); @@ -472,12 +472,12 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) } else { enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); if (tr->ltimeout) ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); else ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } if (ret < 0) @@ -572,7 +572,7 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags) struct list_head *entry; u32 tail, off = timeout->off; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); /* * sqe->off holds how many events that need to occur for this @@ -611,7 +611,7 @@ add: list_add(&timeout->list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); return IOU_ISSUE_SKIP_COMPLETE; } @@ -620,7 +620,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); /* * If the back reference is NULL, then our linked request finished * before we got a chance to setup the timer @@ -633,7 +633,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) data->mode); list_add_tail(&timeout->list, &ctx->ltimeout_list); } - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ io_put_req(req); } @@ -668,7 +668,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx * timeout_lockfirst to keep locking ordering. */ spin_lock(&ctx->completion_lock); - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); @@ -676,7 +676,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx io_kill_timeout(req, -ECANCELED)) canceled++; } - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); return canceled != 0; } -- cgit v1.2.3 From afd2627f727b89496d79a6b934a025fc916d4ded Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 16 Dec 2024 21:41:22 -0500 Subject: tracing: Check "%s" dereference via the field and not the TP_printk format The TP_printk() portion of a trace event is executed at the time a event is read from the trace. This can happen seconds, minutes, hours, days, months, years possibly later since the event was recorded. If the print format contains a dereference to a string via "%s", and that string was allocated, there's a chance that string could be freed before it is read by the trace file. To protect against such bugs, there are two functions that verify the event. The first one is test_event_printk(), which is called when the event is created. It reads the TP_printk() format as well as its arguments to make sure nothing may be dereferencing a pointer that was not copied into the ring buffer along with the event. If it is, it will trigger a WARN_ON(). For strings that use "%s", it is not so easy. The string may not reside in the ring buffer but may still be valid. Strings that are static and part of the kernel proper which will not be freed for the life of the running system, are safe to dereference. But to know if it is a pointer to a static string or to something on the heap can not be determined until the event is triggered. This brings us to the second function that tests for the bad dereferencing of strings, trace_check_vprintf(). It would walk through the printf format looking for "%s", and when it finds it, it would validate that the pointer is safe to read. If not, it would produces a WARN_ON() as well and write into the ring buffer "[UNSAFE-MEMORY]". The problem with this is how it used va_list to have vsnprintf() handle all the cases that it didn't need to check. Instead of re-implementing vsnprintf(), it would make a copy of the format up to the %s part, and call vsnprintf() with the current va_list ap variable, where the ap would then be ready to point at the string in question. For architectures that passed va_list by reference this was possible. For architectures that passed it by copy it was not. A test_can_verify() function was used to differentiate between the two, and if it wasn't possible, it would disable it. Even for architectures where this was feasible, it was a stretch to rely on such a method that is undocumented, and could cause issues later on with new optimizations of the compiler. Instead, the first function test_event_printk() was updated to look at "%s" as well. If the "%s" argument is a pointer outside the event in the ring buffer, it would find the field type of the event that is the problem and mark the structure with a new flag called "needs_test". The event itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that this event has a field that needs to be verified before the event can be printed using the printf format. When the event fields are created from the field type structure, the fields would copy the field type's "needs_test" value. Finally, before being printed, a new function ignore_event() is called which will check if the event has the TEST_STR flag set (if not, it returns false). If the flag is set, it then iterates through the events fields looking for the ones that have the "needs_test" flag set. Then it uses the offset field from the field structure to find the pointer in the ring buffer event. It runs the tests to make sure that pointer is safe to print and if not, it triggers the WARN_ON() and also adds to the trace output that the event in question has an unsafe memory access. The ignore_event() makes the trace_check_vprintf() obsolete so it is removed. Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Al Viro Cc: Linus Torvalds Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 6 +- kernel/trace/trace.c | 255 +++++++++---------------------------------- kernel/trace/trace.h | 6 +- kernel/trace/trace_events.c | 32 ++++-- kernel/trace/trace_output.c | 6 +- 5 files changed, 88 insertions(+), 217 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2a5df5b62cfc..91b8ffbdfa8c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -273,7 +273,8 @@ struct trace_event_fields { const char *name; const int size; const int align; - const int is_signed; + const unsigned int is_signed:1; + unsigned int needs_test:1; const int filter_type; const int len; }; @@ -324,6 +325,7 @@ enum { TRACE_EVENT_FL_EPROBE_BIT, TRACE_EVENT_FL_FPROBE_BIT, TRACE_EVENT_FL_CUSTOM_BIT, + TRACE_EVENT_FL_TEST_STR_BIT, }; /* @@ -340,6 +342,7 @@ enum { * CUSTOM - Event is a custom event (to be attached to an exsiting tracepoint) * This is set when the custom event has not been attached * to a tracepoint yet, then it is cleared when it is. + * TEST_STR - The event has a "%s" that points to a string outside the event */ enum { TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), @@ -352,6 +355,7 @@ enum { TRACE_EVENT_FL_EPROBE = (1 << TRACE_EVENT_FL_EPROBE_BIT), TRACE_EVENT_FL_FPROBE = (1 << TRACE_EVENT_FL_FPROBE_BIT), TRACE_EVENT_FL_CUSTOM = (1 << TRACE_EVENT_FL_CUSTOM_BIT), + TRACE_EVENT_FL_TEST_STR = (1 << TRACE_EVENT_FL_TEST_STR_BIT), }; #define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index be62f0ea1814..7cc18b9bce27 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3611,17 +3611,12 @@ char *trace_iter_expand_format(struct trace_iterator *iter) } /* Returns true if the string is safe to dereference from an event */ -static bool trace_safe_str(struct trace_iterator *iter, const char *str, - bool star, int len) +static bool trace_safe_str(struct trace_iterator *iter, const char *str) { unsigned long addr = (unsigned long)str; struct trace_event *trace_event; struct trace_event_call *event; - /* Ignore strings with no length */ - if (star && !len) - return true; - /* OK if part of the event data */ if ((addr >= (unsigned long)iter->ent) && (addr < (unsigned long)iter->ent + iter->ent_size)) @@ -3661,181 +3656,69 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, return false; } -static DEFINE_STATIC_KEY_FALSE(trace_no_verify); - -static int test_can_verify_check(const char *fmt, ...) -{ - char buf[16]; - va_list ap; - int ret; - - /* - * The verifier is dependent on vsnprintf() modifies the va_list - * passed to it, where it is sent as a reference. Some architectures - * (like x86_32) passes it by value, which means that vsnprintf() - * does not modify the va_list passed to it, and the verifier - * would then need to be able to understand all the values that - * vsnprintf can use. If it is passed by value, then the verifier - * is disabled. - */ - va_start(ap, fmt); - vsnprintf(buf, 16, "%d", ap); - ret = va_arg(ap, int); - va_end(ap); - - return ret; -} - -static void test_can_verify(void) -{ - if (!test_can_verify_check("%d %d", 0, 1)) { - pr_info("trace event string verifier disabled\n"); - static_branch_inc(&trace_no_verify); - } -} - /** - * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer + * ignore_event - Check dereferenced fields while writing to the seq buffer * @iter: The iterator that holds the seq buffer and the event being printed - * @fmt: The format used to print the event - * @ap: The va_list holding the data to print from @fmt. * - * This writes the data into the @iter->seq buffer using the data from - * @fmt and @ap. If the format has a %s, then the source of the string - * is examined to make sure it is safe to print, otherwise it will - * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string - * pointer. + * At boot up, test_event_printk() will flag any event that dereferences + * a string with "%s" that does exist in the ring buffer. It may still + * be valid, as the string may point to a static string in the kernel + * rodata that never gets freed. But if the string pointer is pointing + * to something that was allocated, there's a chance that it can be freed + * by the time the user reads the trace. This would cause a bad memory + * access by the kernel and possibly crash the system. + * + * This function will check if the event has any fields flagged as needing + * to be checked at runtime and perform those checks. + * + * If it is found that a field is unsafe, it will write into the @iter->seq + * a message stating what was found to be unsafe. + * + * @return: true if the event is unsafe and should be ignored, + * false otherwise. */ -void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, - va_list ap) +bool ignore_event(struct trace_iterator *iter) { - long text_delta = 0; - long data_delta = 0; - const char *p = fmt; - const char *str; - bool good; - int i, j; + struct ftrace_event_field *field; + struct trace_event *trace_event; + struct trace_event_call *event; + struct list_head *head; + struct trace_seq *seq; + const void *ptr; - if (WARN_ON_ONCE(!fmt)) - return; + trace_event = ftrace_find_event(iter->ent->type); - if (static_branch_unlikely(&trace_no_verify)) - goto print; + seq = &iter->seq; - /* - * When the kernel is booted with the tp_printk command line - * parameter, trace events go directly through to printk(). - * It also is checked by this function, but it does not - * have an associated trace_array (tr) for it. - */ - if (iter->tr) { - text_delta = iter->tr->text_delta; - data_delta = iter->tr->data_delta; + if (!trace_event) { + trace_seq_printf(seq, "EVENT ID %d NOT FOUND?\n", iter->ent->type); + return true; } - /* Don't bother checking when doing a ftrace_dump() */ - if (iter->fmt == static_fmt_buf) - goto print; - - while (*p) { - bool star = false; - int len = 0; - - j = 0; - - /* - * We only care about %s and variants - * as well as %p[sS] if delta is non-zero - */ - for (i = 0; p[i]; i++) { - if (i + 1 >= iter->fmt_size) { - /* - * If we can't expand the copy buffer, - * just print it. - */ - if (!trace_iter_expand_format(iter)) - goto print; - } - - if (p[i] == '\\' && p[i+1]) { - i++; - continue; - } - if (p[i] == '%') { - /* Need to test cases like %08.*s */ - for (j = 1; p[i+j]; j++) { - if (isdigit(p[i+j]) || - p[i+j] == '.') - continue; - if (p[i+j] == '*') { - star = true; - continue; - } - break; - } - if (p[i+j] == 's') - break; - - if (text_delta && p[i+1] == 'p' && - ((p[i+2] == 's' || p[i+2] == 'S'))) - break; - - star = false; - } - j = 0; - } - /* If no %s found then just print normally */ - if (!p[i]) - break; - - /* Copy up to the %s, and print that */ - strncpy(iter->fmt, p, i); - iter->fmt[i] = '\0'; - trace_seq_vprintf(&iter->seq, iter->fmt, ap); + event = container_of(trace_event, struct trace_event_call, event); + if (!(event->flags & TRACE_EVENT_FL_TEST_STR)) + return false; - /* Add delta to %pS pointers */ - if (p[i+1] == 'p') { - unsigned long addr; - char fmt[4]; + head = trace_get_fields(event); + if (!head) { + trace_seq_printf(seq, "FIELDS FOR EVENT '%s' NOT FOUND?\n", + trace_event_name(event)); + return true; + } - fmt[0] = '%'; - fmt[1] = 'p'; - fmt[2] = p[i+2]; /* Either %ps or %pS */ - fmt[3] = '\0'; + /* Offsets are from the iter->ent that points to the raw event */ + ptr = iter->ent; - addr = va_arg(ap, unsigned long); - addr += text_delta; - trace_seq_printf(&iter->seq, fmt, (void *)addr); + list_for_each_entry(field, head, link) { + const char *str; + bool good; - p += i + 3; + if (!field->needs_test) continue; - } - /* - * If iter->seq is full, the above call no longer guarantees - * that ap is in sync with fmt processing, and further calls - * to va_arg() can return wrong positional arguments. - * - * Ensure that ap is no longer used in this case. - */ - if (iter->seq.full) { - p = ""; - break; - } - - if (star) - len = va_arg(ap, int); - - /* The ap now points to the string data of the %s */ - str = va_arg(ap, const char *); + str = *(const char **)(ptr + field->offset); - good = trace_safe_str(iter, str, star, len); - - /* Could be from the last boot */ - if (data_delta && !good) { - str += data_delta; - good = trace_safe_str(iter, str, star, len); - } + good = trace_safe_str(iter, str); /* * If you hit this warning, it is likely that the @@ -3846,44 +3729,14 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, * instead. See samples/trace_events/trace-events-sample.h * for reference. */ - if (WARN_ONCE(!good, "fmt: '%s' current_buffer: '%s'", - fmt, seq_buf_str(&iter->seq.seq))) { - int ret; - - /* Try to safely read the string */ - if (star) { - if (len + 1 > iter->fmt_size) - len = iter->fmt_size - 1; - if (len < 0) - len = 0; - ret = copy_from_kernel_nofault(iter->fmt, str, len); - iter->fmt[len] = 0; - star = false; - } else { - ret = strncpy_from_kernel_nofault(iter->fmt, str, - iter->fmt_size); - } - if (ret < 0) - trace_seq_printf(&iter->seq, "(0x%px)", str); - else - trace_seq_printf(&iter->seq, "(0x%px:%s)", - str, iter->fmt); - str = "[UNSAFE-MEMORY]"; - strcpy(iter->fmt, "%s"); - } else { - strncpy(iter->fmt, p + i, j + 1); - iter->fmt[j+1] = '\0'; + if (WARN_ONCE(!good, "event '%s' has unsafe pointer field '%s'", + trace_event_name(event), field->name)) { + trace_seq_printf(seq, "EVENT %s: HAS UNSAFE POINTER FIELD '%s'\n", + trace_event_name(event), field->name); + return true; } - if (star) - trace_seq_printf(&iter->seq, iter->fmt, len, str); - else - trace_seq_printf(&iter->seq, iter->fmt, str); - - p += i + j + 1; } - print: - if (*p) - trace_seq_vprintf(&iter->seq, p, ap); + return false; } const char *trace_event_format(struct trace_iterator *iter, const char *fmt) @@ -10777,8 +10630,6 @@ __init static int tracer_alloc_buffers(void) register_snapshot_cmd(); - test_can_verify(); - return 0; out_free_pipe_cpumask: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 266740b4e121..9691b47b5f3d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -667,9 +667,8 @@ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, bool trace_is_tracepoint_string(const char *str); const char *trace_event_format(struct trace_iterator *iter, const char *fmt); -void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, - va_list ap) __printf(2, 0); char *trace_iter_expand_format(struct trace_iterator *iter); +bool ignore_event(struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); @@ -1413,7 +1412,8 @@ struct ftrace_event_field { int filter_type; int offset; int size; - int is_signed; + unsigned int is_signed:1; + unsigned int needs_test:1; int len; }; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 521ad2fd1fe7..1545cc8b49d0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -82,7 +82,7 @@ static int system_refcount_dec(struct event_subsystem *system) } static struct ftrace_event_field * -__find_event_field(struct list_head *head, char *name) +__find_event_field(struct list_head *head, const char *name) { struct ftrace_event_field *field; @@ -114,7 +114,8 @@ trace_find_event_field(struct trace_event_call *call, char *name) static int __trace_define_field(struct list_head *head, const char *type, const char *name, int offset, int size, - int is_signed, int filter_type, int len) + int is_signed, int filter_type, int len, + int need_test) { struct ftrace_event_field *field; @@ -133,6 +134,7 @@ static int __trace_define_field(struct list_head *head, const char *type, field->offset = offset; field->size = size; field->is_signed = is_signed; + field->needs_test = need_test; field->len = len; list_add(&field->link, head); @@ -151,13 +153,13 @@ int trace_define_field(struct trace_event_call *call, const char *type, head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, - is_signed, filter_type, 0); + is_signed, filter_type, 0, 0); } EXPORT_SYMBOL_GPL(trace_define_field); static int trace_define_field_ext(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, - int filter_type, int len) + int filter_type, int len, int need_test) { struct list_head *head; @@ -166,13 +168,13 @@ static int trace_define_field_ext(struct trace_event_call *call, const char *typ head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, - is_signed, filter_type, len); + is_signed, filter_type, len, need_test); } #define __generic_field(type, item, filter_type) \ ret = __trace_define_field(&ftrace_generic_fields, #type, \ #item, 0, 0, is_signed_type(type), \ - filter_type, 0); \ + filter_type, 0, 0); \ if (ret) \ return ret; @@ -181,7 +183,8 @@ static int trace_define_field_ext(struct trace_event_call *call, const char *typ "common_" #item, \ offsetof(typeof(ent), item), \ sizeof(ent.item), \ - is_signed_type(type), FILTER_OTHER, 0); \ + is_signed_type(type), FILTER_OTHER, \ + 0, 0); \ if (ret) \ return ret; @@ -332,6 +335,7 @@ static bool process_pointer(const char *fmt, int len, struct trace_event_call *c /* Return true if the string is safe */ static bool process_string(const char *fmt, int len, struct trace_event_call *call) { + struct trace_event_fields *field; const char *r, *e, *s; e = fmt + len; @@ -372,8 +376,16 @@ static bool process_string(const char *fmt, int len, struct trace_event_call *ca if (process_pointer(fmt, len, call)) return true; - /* Make sure the field is found, and consider it OK for now if it is */ - return find_event_field(fmt, call) != NULL; + /* Make sure the field is found */ + field = find_event_field(fmt, call); + if (!field) + return false; + + /* Test this field's string before printing the event */ + call->flags |= TRACE_EVENT_FL_TEST_STR; + field->needs_test = 1; + + return true; } /* @@ -2586,7 +2598,7 @@ event_define_fields(struct trace_event_call *call) ret = trace_define_field_ext(call, field->type, field->name, offset, field->size, field->is_signed, field->filter_type, - field->len); + field->len, field->needs_test); if (WARN_ON_ONCE(ret)) { pr_err("error code is %d\n", ret); break; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index da748b7cbc4d..03d56f711ad1 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -317,10 +317,14 @@ EXPORT_SYMBOL(trace_raw_output_prep); void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...) { + struct trace_seq *s = &iter->seq; va_list ap; + if (ignore_event(iter)) + return; + va_start(ap, fmt); - trace_check_vprintf(iter, trace_event_format(iter, fmt), ap); + trace_seq_vprintf(s, trace_event_format(iter, fmt), ap); va_end(ap); } EXPORT_SYMBOL(trace_event_printf); -- cgit v1.2.3 From 349f0086ba8b2a169877d21ff15a4d9da3a60054 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 18 Dec 2024 09:02:28 +0100 Subject: x86/static-call: fix 32-bit build In 32-bit x86 builds CONFIG_STATIC_CALL_INLINE isn't set, leading to static_call_initialized not being available. Define it as "0" in that case. Reported-by: Stephen Rothwell Fixes: 0ef8047b737d ("x86/static-call: provide a way to do very early static-call updates") Signed-off-by: Juergen Gross Acked-by: Peter Zijlstra (Intel) Signed-off-by: Linus Torvalds --- include/linux/static_call.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/static_call.h b/include/linux/static_call.h index 785980af8972..78a77a4ae0ea 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -138,7 +138,6 @@ #ifdef CONFIG_HAVE_STATIC_CALL #include -extern int static_call_initialized; /* * Either @site or @tramp can be NULL. */ @@ -161,6 +160,8 @@ extern void arch_static_call_transform(void *site, void *tramp, void *func, bool #ifdef CONFIG_HAVE_STATIC_CALL_INLINE +extern int static_call_initialized; + extern int __init static_call_init(void); extern void static_call_force_reinit(void); @@ -226,6 +227,8 @@ extern long __static_call_return0(void); #elif defined(CONFIG_HAVE_STATIC_CALL) +#define static_call_initialized 0 + static inline int static_call_init(void) { return 0; } #define DEFINE_STATIC_CALL(name, _func) \ @@ -282,6 +285,8 @@ extern long __static_call_return0(void); #else /* Generic implementation */ +#define static_call_initialized 0 + static inline int static_call_init(void) { return 0; } static inline long __static_call_return0(void) -- cgit v1.2.3 From 12d908116f7efd34f255a482b9afc729d7a5fb78 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 18 Dec 2024 17:56:25 +0100 Subject: io_uring: Fix registered ring file refcount leak Currently, io_uring_unreg_ringfd() (which cleans up registered rings) is only called on exit, but __io_uring_free (which frees the tctx in which the registered ring pointers are stored) is also called on execve (via begin_new_exec -> io_uring_task_cancel -> __io_uring_cancel -> io_uring_cancel_generic -> __io_uring_free). This means: A process going through execve while having registered rings will leak references to the rings' `struct file`. Fix it by zapping registered rings on execve(). This is implemented by moving the io_uring_unreg_ringfd() from io_uring_files_cancel() into its callee __io_uring_cancel(), which is called from io_uring_task_cancel() on execve. This could probably be exploited *on 32-bit kernels* by leaking 2^32 references to the same ring, because the file refcount is stored in a pointer-sized field and get_file() doesn't have protection against refcount overflow, just a WARN_ONCE(); but on 64-bit it should have no impact beyond a memory leak. Cc: stable@vger.kernel.org Fixes: e7a6c00dc77a ("io_uring: add support for registering ring file descriptors") Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20241218-uring-reg-ring-cleanup-v1-1-8f63e999045b@google.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 4 +--- io_uring/io_uring.c | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index e123d5e17b52..85fe4e6b275c 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -15,10 +15,8 @@ bool io_is_uring_fops(struct file *file); static inline void io_uring_files_cancel(void) { - if (current->io_uring) { - io_uring_unreg_ringfd(); + if (current->io_uring) __io_uring_cancel(false); - } } static inline void io_uring_task_cancel(void) { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 605625e932eb..432b95ca9c85 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3214,6 +3214,7 @@ end_wait: void __io_uring_cancel(bool cancel_all) { + io_uring_unreg_ringfd(); io_uring_cancel_generic(cancel_all, NULL); } -- cgit v1.2.3 From 31c5629920b82ddf66059f20f79be2bc00c4197b Mon Sep 17 00:00:00 2001 From: Petr Malat Date: Tue, 10 Dec 2024 01:06:04 +0100 Subject: mm: add RCU annotation to pte_offset_map(_lock) RCU lock is taken by ___pte_offset_map() unless it returns NULL. Add this information to its inline callers to avoid sparse warning about context imbalance in pte_unmap(). Link: https://lkml.kernel.org/r/20241210000604.700710-1-oss@malat.biz Signed-off-by: Petr Malat Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++++-- mm/pgtable-generic.c | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index c39c4945946c..3a6ee6a05aa0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3010,7 +3010,15 @@ static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) lruvec_stat_sub_folio(folio, NR_PAGETABLE); } -pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); +pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); +static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, + pmd_t *pmdvalp) +{ + pte_t *pte; + + __cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp)); + return pte; +} static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) { return __pte_offset_map(pmd, addr, NULL); @@ -3023,7 +3031,8 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, { pte_t *pte; - __cond_lock(*ptlp, pte = __pte_offset_map_lock(mm, pmd, addr, ptlp)); + __cond_lock(RCU, __cond_lock(*ptlp, + pte = __pte_offset_map_lock(mm, pmd, addr, ptlp))); return pte; } diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 5297dcc38c37..5a882f2b10f9 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -279,7 +279,7 @@ static unsigned long pmdp_get_lockless_start(void) { return 0; } static void pmdp_get_lockless_end(unsigned long irqflags) { } #endif -pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) +pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { unsigned long irqflags; pmd_t pmdval; -- cgit v1.2.3 From 5c0541e11c16bd2f162e23a22d07c09d58017e5a Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 9 Dec 2024 13:23:25 -0500 Subject: mm: introduce cpu_icache_is_aliasing() across all architectures In commit eacd0e950dc2 ("ARC: [mm] Lazy D-cache flush (non aliasing VIPT)"), arc adds the need to flush dcache to make icache see the code page change. This also requires special handling for clear_user_(high)page(). Introduce cpu_icache_is_aliasing() to make MM code query special clear_user_(high)page() easier. This will be used by the following commit. Link: https://lkml.kernel.org/r/20241209182326.2955963-1-ziy@nvidia.com Fixes: 5708d96da20b ("mm: avoid zeroing user movable page twice with init_on_alloc=1") Signed-off-by: Zi Yan Suggested-by: Mathieu Desnoyers Reviewed-by: Mathieu Desnoyers Acked-by: Vlastimil Babka Cc: Alexander Potapenko Cc: David Hildenbrand Cc: Geert Uytterhoeven Cc: John Hubbard Cc: Kees Cook Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ryan Roberts Cc: Vineet Gupta Signed-off-by: Andrew Morton --- arch/arc/Kconfig | 1 + arch/arc/include/asm/cachetype.h | 8 ++++++++ include/linux/cacheinfo.h | 6 ++++++ 3 files changed, 15 insertions(+) create mode 100644 arch/arc/include/asm/cachetype.h (limited to 'include/linux') diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index ea5a1dcb133b..4f2eeda907ec 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -6,6 +6,7 @@ config ARC def_bool y select ARC_TIMERS + select ARCH_HAS_CPU_CACHE_ALIASING select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DMA_PREP_COHERENT diff --git a/arch/arc/include/asm/cachetype.h b/arch/arc/include/asm/cachetype.h new file mode 100644 index 000000000000..acd3b6cb4bf5 --- /dev/null +++ b/arch/arc/include/asm/cachetype.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_ARC_CACHETYPE_H +#define __ASM_ARC_CACHETYPE_H + +#define cpu_dcache_is_aliasing() false +#define cpu_icache_is_aliasing() true + +#endif diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index 108060612bb8..7ad736538649 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -155,8 +155,14 @@ static inline int get_cpu_cacheinfo_id(int cpu, int level) #ifndef CONFIG_ARCH_HAS_CPU_CACHE_ALIASING #define cpu_dcache_is_aliasing() false +#define cpu_icache_is_aliasing() cpu_dcache_is_aliasing() #else #include + +#ifndef cpu_icache_is_aliasing +#define cpu_icache_is_aliasing() cpu_dcache_is_aliasing() +#endif + #endif #endif /* _LINUX_CACHEINFO_H */ -- cgit v1.2.3 From c51a4f11e6d8246590b5e64908c1ed84b33e8ba2 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 9 Dec 2024 13:23:26 -0500 Subject: mm: use clear_user_(high)page() for arch with special user folio handling Some architectures have special handling after clearing user folios: architectures, which set cpu_dcache_is_aliasing() to true, require flushing dcache; arc, which sets cpu_icache_is_aliasing() to true, changes folio->flags to make icache coherent to dcache. So __GFP_ZERO using only clear_page() is not enough to zero user folios and clear_user_(high)page() must be used. Otherwise, user data will be corrupted. Fix it by always clearing user folios with clear_user_(high)page() when cpu_dcache_is_aliasing() is true or cpu_icache_is_aliasing() is true. Rename alloc_zeroed() to user_alloc_needs_zeroing() and invert the logic to clarify its intend. Link: https://lkml.kernel.org/r/20241209182326.2955963-2-ziy@nvidia.com Fixes: 5708d96da20b ("mm: avoid zeroing user movable page twice with init_on_alloc=1") Signed-off-by: Zi Yan Reported-by: Geert Uytterhoeven Closes: https://lore.kernel.org/linux-mm/CAMuHMdV1hRp_NtR5YnJo=HsfgKQeH91J537Gh4gKk3PFZhSkbA@mail.gmail.com/ Tested-by: Geert Uytterhoeven Acked-by: Vlastimil Babka Cc: Alexander Potapenko Cc: David Hildenbrand Cc: John Hubbard Cc: Kees Cook Cc: Kefeng Wang Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ryan Roberts Cc: Vineet Gupta Signed-off-by: Andrew Morton --- include/linux/highmem.h | 8 +++++++- include/linux/mm.h | 18 ++++++++++++++++++ mm/huge_memory.c | 9 +++++---- mm/internal.h | 6 ------ mm/memory.c | 10 +++++----- 5 files changed, 35 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 6e452bd8e7e3..5c6bea81a90e 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -224,7 +224,13 @@ static inline struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { - return vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr); + struct folio *folio; + + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); + if (folio && user_alloc_needs_zeroing()) + clear_user_highpage(&folio->page, vaddr); + + return folio; } #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 3a6ee6a05aa0..338a76ce9083 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -31,6 +31,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -4184,6 +4185,23 @@ static inline int do_mseal(unsigned long start, size_t len_in, unsigned long fla } #endif +/* + * user_alloc_needs_zeroing checks if a user folio from page allocator needs to + * be zeroed or not. + */ +static inline bool user_alloc_needs_zeroing(void) +{ + /* + * for user folios, arch with cache aliasing requires cache flush and + * arc changes folio->flags to make icache coherent with dcache, so + * always return false to make caller use + * clear_user_page()/clear_user_highpage(). + */ + return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() || + !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, + &init_on_alloc); +} + int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ee335d96fc39..9bb351caa619 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1176,11 +1176,12 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, folio_throttle_swaprate(folio, gfp); /* - * When a folio is not zeroed during allocation (__GFP_ZERO not used), - * folio_zero_user() is used to make sure that the page corresponding - * to the faulting address will be hot in the cache after zeroing. + * When a folio is not zeroed during allocation (__GFP_ZERO not used) + * or user folios require special handling, folio_zero_user() is used to + * make sure that the page corresponding to the faulting address will be + * hot in the cache after zeroing. */ - if (!alloc_zeroed()) + if (user_alloc_needs_zeroing()) folio_zero_user(folio, addr); /* * The memory barrier inside __folio_mark_uptodate makes sure that diff --git a/mm/internal.h b/mm/internal.h index cb8d8e8e3ffa..3bd08bafad04 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1285,12 +1285,6 @@ void touch_pud(struct vm_area_struct *vma, unsigned long addr, void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write); -static inline bool alloc_zeroed(void) -{ - return static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, - &init_on_alloc); -} - /* * Parses a string with mem suffixes into its order. Useful to parse kernel * parameters. diff --git a/mm/memory.c b/mm/memory.c index 209885a4134f..398c031be9ba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4733,12 +4733,12 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) folio_throttle_swaprate(folio, gfp); /* * When a folio is not zeroed during allocation - * (__GFP_ZERO not used), folio_zero_user() is used - * to make sure that the page corresponding to the - * faulting address will be hot in the cache after - * zeroing. + * (__GFP_ZERO not used) or user folios require special + * handling, folio_zero_user() is used to make sure + * that the page corresponding to the faulting address + * will be hot in the cache after zeroing. */ - if (!alloc_zeroed()) + if (user_alloc_needs_zeroing()) folio_zero_user(folio, vmf->address); return folio; } -- cgit v1.2.3 From 42b2eb69835b0fda797f70eb5b4fc213dbe3a7ea Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Thu, 12 Dec 2024 18:33:51 +0000 Subject: mm: convert partially_mapped set/clear operations to be atomic Other page flags in the 2nd page, like PG_hwpoison and PG_anon_exclusive can get modified concurrently. Changes to other page flags might be lost if they are happening at the same time as non-atomic partially_mapped operations. Hence, make partially_mapped operations atomic. Link: https://lkml.kernel.org/r/20241212183351.1345389-1-usamaarif642@gmail.com Fixes: 8422acdc97ed ("mm: introduce a pageflag for partially mapped folios") Reported-by: David Hildenbrand Link: https://lore.kernel.org/all/e53b04ad-1827-43a2-a1ab-864c7efecf6e@redhat.com/ Signed-off-by: Usama Arif Acked-by: David Hildenbrand Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Barry Song Cc: Domenico Cerasuolo Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Nico Pache Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 12 ++---------- mm/huge_memory.c | 8 ++++---- 2 files changed, 6 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index cf46ac720802..691506bdf2c5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -862,18 +862,10 @@ static inline void ClearPageCompound(struct page *page) ClearPageHead(page); } FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE) -FOLIO_TEST_FLAG(partially_mapped, FOLIO_SECOND_PAGE) -/* - * PG_partially_mapped is protected by deferred_split split_queue_lock, - * so its safe to use non-atomic set/clear. - */ -__FOLIO_SET_FLAG(partially_mapped, FOLIO_SECOND_PAGE) -__FOLIO_CLEAR_FLAG(partially_mapped, FOLIO_SECOND_PAGE) +FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(large_rmappable) -FOLIO_TEST_FLAG_FALSE(partially_mapped) -__FOLIO_SET_FLAG_NOOP(partially_mapped) -__FOLIO_CLEAR_FLAG_NOOP(partially_mapped) +FOLIO_FLAG_FALSE(partially_mapped) #endif #define PG_head_mask ((1UL << PG_head)) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9bb351caa619..df0c4988dd88 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3577,7 +3577,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, !list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) { - __folio_clear_partially_mapped(folio); + folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } @@ -3689,7 +3689,7 @@ bool __folio_unqueue_deferred_split(struct folio *folio) if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; if (folio_test_partially_mapped(folio)) { - __folio_clear_partially_mapped(folio); + folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } @@ -3733,7 +3733,7 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (partially_mapped) { if (!folio_test_partially_mapped(folio)) { - __folio_set_partially_mapped(folio); + folio_set_partially_mapped(folio); if (folio_test_pmd_mappable(folio)) count_vm_event(THP_DEFERRED_SPLIT_PAGE); count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED); @@ -3826,7 +3826,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, } else { /* We lost race with folio_put() */ if (folio_test_partially_mapped(folio)) { - __folio_clear_partially_mapped(folio); + folio_clear_partially_mapped(folio); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } -- cgit v1.2.3 From 30c2de0a267c04046d89e678cc0067a9cfb455df Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 12 Dec 2024 13:31:26 -0800 Subject: mm/vmstat: fix a W=1 clang compiler warning Fix the following clang compiler warning that is reported if the kernel is built with W=1: ./include/linux/vmstat.h:518:36: error: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Werror,-Wenum-enum-conversion] 518 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" | ~~~~~~~~~~~ ^ ~~~ Link: https://lkml.kernel.org/r/20241212213126.1269116-1-bvanassche@acm.org Fixes: 9d7ea9a297e6 ("mm/vmstat: add helpers to get vmstat item names for each enum type") Signed-off-by: Bart Van Assche Cc: Konstantin Khlebnikov Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index d2761bf8ff32..9f3a04345b86 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -515,7 +515,7 @@ static inline const char *node_stat_name(enum node_stat_item item) static inline const char *lru_list_name(enum lru_list lru) { - return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" + return node_stat_name(NR_LRU_BASE + (enum node_stat_item)lru) + 3; // skip "nr_" } #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) -- cgit v1.2.3 From 640a603943a7659340c10044c0a1c98ae4e13189 Mon Sep 17 00:00:00 2001 From: David Wang <00107082@163.com> Date: Fri, 13 Dec 2024 09:33:32 +0800 Subject: mm/codetag: clear tags before swap When CONFIG_MEM_ALLOC_PROFILING_DEBUG is set, kernel WARN would be triggered when calling __alloc_tag_ref_set() during swap: alloc_tag was not cleared (got tag for mm/filemap.c:1951) WARNING: CPU: 0 PID: 816 at ./include/linux/alloc_tag.h... Clear code tags before swap can fix the warning. And this patch also fix a potential invalid address dereference in alloc_tag_add_check() when CONFIG_MEM_ALLOC_PROFILING_DEBUG is set and ref->ct is CODETAG_EMPTY, which is defined as ((void *)1). Link: https://lkml.kernel.org/r/20241213013332.89910-1-00107082@163.com Fixes: 51f43d5d82ed ("mm/codetag: swap tags when migrate pages") Signed-off-by: David Wang <00107082@163.com> Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202412112227.df61ebb-lkp@intel.com Acked-by: Suren Baghdasaryan Cc: Kent Overstreet Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 2 +- lib/alloc_tag.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 7c0786bdf9af..cba024bf2db3 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -135,7 +135,7 @@ static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag) #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) { - WARN_ONCE(ref && ref->ct, + WARN_ONCE(ref && ref->ct && !is_codetag_empty(ref), "alloc_tag was not cleared (got tag for %s:%u)\n", ref->ct->filename, ref->ct->lineno); diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 35f7560a309a..3a0413462e9f 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -209,6 +209,13 @@ void pgalloc_tag_swap(struct folio *new, struct folio *old) return; } + /* + * Clear tag references to avoid debug warning when using + * __alloc_tag_ref_set() with non-empty reference. + */ + set_codetag_empty(&ref_old); + set_codetag_empty(&ref_new); + /* swap tags */ __alloc_tag_ref_set(&ref_old, tag_new); update_page_tag_ref(handle_old, &ref_old); -- cgit v1.2.3 From 60da7445a142bd15e67f3cda915497781c3f781f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 29 Nov 2024 16:14:23 -0800 Subject: alloc_tag: fix set_codetag_empty() when !CONFIG_MEM_ALLOC_PROFILING_DEBUG It was recently noticed that set_codetag_empty() might be used not only to mark NULL alloctag references as empty to avoid warnings but also to reset valid tags (in clear_page_tag_ref()). Since set_codetag_empty() is defined as NOOP for CONFIG_MEM_ALLOC_PROFILING_DEBUG=n, such use of set_codetag_empty() leads to subtle bugs. Fix set_codetag_empty() for CONFIG_MEM_ALLOC_PROFILING_DEBUG=n to reset the tag reference. Link: https://lkml.kernel.org/r/20241130001423.1114965-2-surenb@google.com Fixes: a8fc28dad6d5 ("alloc_tag: introduce clear_page_tag_ref() helper function") Signed-off-by: Suren Baghdasaryan Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/lkml/20241124074318.399027-1-00107082@163.com/ Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Mike Rapoport (Microsoft) Cc: Pasha Tatashin Cc: Sourav Panda Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index cba024bf2db3..0bbbe537c5f9 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -63,7 +63,12 @@ static inline void set_codetag_empty(union codetag_ref *ref) #else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ static inline bool is_codetag_empty(union codetag_ref *ref) { return false; } -static inline void set_codetag_empty(union codetag_ref *ref) {} + +static inline void set_codetag_empty(union codetag_ref *ref) +{ + if (ref) + ref->ct = NULL; +} #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ -- cgit v1.2.3 From d888b7af7c149c115dd6ac772cc11c375da3e17c Mon Sep 17 00:00:00 2001 From: Zijian Zhang Date: Tue, 10 Dec 2024 01:20:39 +0000 Subject: tcp_bpf: Add sk_rmem_alloc related logic for tcp_bpf ingress redirection When we do sk_psock_verdict_apply->sk_psock_skb_ingress, an sk_msg will be created out of the skb, and the rmem accounting of the sk_msg will be handled by the skb. For skmsgs in __SK_REDIRECT case of tcp_bpf_send_verdict, when redirecting to the ingress of a socket, although we sk_rmem_schedule and add sk_msg to the ingress_msg of sk_redir, we do not update sk_rmem_alloc. As a result, except for the global memory limit, the rmem of sk_redir is nearly unlimited. Thus, add sk_rmem_alloc related logic to limit the recv buffer. Since the function sk_msg_recvmsg and __sk_psock_purge_ingress_msg are used in these two paths. We use "msg->skb" to test whether the sk_msg is skb backed up. If it's not, we shall do the memory accounting explicitly. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Zijian Zhang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241210012039.1669389-3-zijianzhang@bytedance.com --- include/linux/skmsg.h | 11 ++++++++--- net/core/skmsg.c | 6 +++++- net/ipv4/tcp_bpf.c | 4 +++- 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index d9b03e0746e7..2cbe0c22a32f 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -317,17 +317,22 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); } -static inline void sk_psock_queue_msg(struct sk_psock *psock, +static inline bool sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { + bool ret; + spin_lock_bh(&psock->ingress_lock); - if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { list_add_tail(&msg->list, &psock->ingress_msg); - else { + ret = true; + } else { sk_msg_free(psock->sk, msg); kfree(msg); + ret = false; } spin_unlock_bh(&psock->ingress_lock); + return ret; } static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e90fbab703b2..8ad7e6755fd6 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -445,8 +445,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, if (likely(!peek)) { sge->offset += copy; sge->length -= copy; - if (!msg_rx->skb) + if (!msg_rx->skb) { sk_mem_uncharge(sk, copy); + atomic_sub(copy, &sk->sk_rmem_alloc); + } msg_rx->sg.size -= copy; if (!sge->length) { @@ -772,6 +774,8 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { list_del(&msg->list); + if (!msg->skb) + atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); sk_msg_free(psock->sk, msg); kfree(msg); } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index b21ea634909c..392678ae80f4 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -56,6 +56,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, } sk_mem_charge(sk, size); + atomic_add(size, &sk->sk_rmem_alloc); sk_msg_xfer(tmp, msg, i, size); copied += size; if (sge->length) @@ -74,7 +75,8 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, if (!ret) { msg->sg.start = i; - sk_psock_queue_msg(psock, tmp); + if (!sk_psock_queue_msg(psock, tmp)) + atomic_sub(copied, &sk->sk_rmem_alloc); sk_psock_data_ready(sk, psock); } else { sk_msg_free(sk, tmp); -- cgit v1.2.3 From 4acb665cf4f3e5436844f17ece0a8a55ce688c7b Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 13 Dec 2024 13:50:08 +0000 Subject: netfs: Work around recursion by abandoning retry if nothing read syzkaller reported recursion with a loop of three calls (netfs_rreq_assess, netfs_retry_reads and netfs_rreq_terminated) hitting the limit of the stack during an unbuffered or direct I/O read. There are a number of issues: (1) There is no limit on the number of retries. (2) A subrequest is supposed to be abandoned if it does not transfer anything (NETFS_SREQ_NO_PROGRESS), but that isn't checked under all circumstances. (3) The actual root cause, which is this: if (atomic_dec_and_test(&rreq->nr_outstanding)) netfs_rreq_terminated(rreq, ...); When we do a retry, we bump the rreq->nr_outstanding counter to prevent the final cleanup phase running before we've finished dispatching the retries. The problem is if we hit 0, we have to do the cleanup phase - but we're in the cleanup phase and end up repeating the retry cycle, hence the recursion. Work around the problem by limiting the number of retries. This is based on Lizhi Xu's patch[1], and makes the following changes: (1) Replace NETFS_SREQ_NO_PROGRESS with NETFS_SREQ_MADE_PROGRESS and make the filesystem set it if it managed to read or write at least one byte of data. Clear this bit before issuing a subrequest. (2) Add a ->retry_count member to the subrequest and increment it any time we do a retry. (3) Remove the NETFS_SREQ_RETRYING flag as it is superfluous with ->retry_count. If the latter is non-zero, we're doing a retry. (4) Abandon a subrequest if retry_count is non-zero and we made no progress. (5) Use ->retry_count in both the write-side and the read-size. [?] Question: Should I set a hard limit on retry_count in both read and write? Say it hits 50, we always abandon it. The problem is that these changes only mitigate the issue. As long as it made at least one byte of progress, the recursion is still an issue. This patch mitigates the problem, but does not fix the underlying cause. I have patches that will do that, but it's an intrusive fix that's currently pending for the next merge window. The oops generated by KASAN looks something like: BUG: TASK stack guard page was hit at ffffc9000482ff48 (stack is ffffc90004830000..ffffc90004838000) Oops: stack guard page: 0000 [#1] PREEMPT SMP KASAN NOPTI ... RIP: 0010:mark_lock+0x25/0xc60 kernel/locking/lockdep.c:4686 ... mark_usage kernel/locking/lockdep.c:4646 [inline] __lock_acquire+0x906/0x3ce0 kernel/locking/lockdep.c:5156 lock_acquire.part.0+0x11b/0x380 kernel/locking/lockdep.c:5825 local_lock_acquire include/linux/local_lock_internal.h:29 [inline] ___slab_alloc+0x123/0x1880 mm/slub.c:3695 __slab_alloc.constprop.0+0x56/0xb0 mm/slub.c:3908 __slab_alloc_node mm/slub.c:3961 [inline] slab_alloc_node mm/slub.c:4122 [inline] kmem_cache_alloc_noprof+0x2a7/0x2f0 mm/slub.c:4141 radix_tree_node_alloc.constprop.0+0x1e8/0x350 lib/radix-tree.c:253 idr_get_free+0x528/0xa40 lib/radix-tree.c:1506 idr_alloc_u32+0x191/0x2f0 lib/idr.c:46 idr_alloc+0xc1/0x130 lib/idr.c:87 p9_tag_alloc+0x394/0x870 net/9p/client.c:321 p9_client_prepare_req+0x19f/0x4d0 net/9p/client.c:644 p9_client_zc_rpc.constprop.0+0x105/0x880 net/9p/client.c:793 p9_client_read_once+0x443/0x820 net/9p/client.c:1570 p9_client_read+0x13f/0x1b0 net/9p/client.c:1534 v9fs_issue_read+0x115/0x310 fs/9p/vfs_addr.c:74 netfs_retry_read_subrequests fs/netfs/read_retry.c:60 [inline] netfs_retry_reads+0x153a/0x1d00 fs/netfs/read_retry.c:232 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 ... netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_retry_reads+0x155e/0x1d00 fs/netfs/read_retry.c:235 netfs_rreq_assess+0x5d3/0x870 fs/netfs/read_collect.c:371 netfs_rreq_terminated+0xe5/0x110 fs/netfs/read_collect.c:407 netfs_dispatch_unbuffered_reads fs/netfs/direct_read.c:103 [inline] netfs_unbuffered_read fs/netfs/direct_read.c:127 [inline] netfs_unbuffered_read_iter_locked+0x12f6/0x19b0 fs/netfs/direct_read.c:221 netfs_unbuffered_read_iter+0xc5/0x100 fs/netfs/direct_read.c:256 v9fs_file_read_iter+0xbf/0x100 fs/9p/vfs_file.c:361 do_iter_readv_writev+0x614/0x7f0 fs/read_write.c:832 vfs_readv+0x4cf/0x890 fs/read_write.c:1025 do_preadv fs/read_write.c:1142 [inline] __do_sys_preadv fs/read_write.c:1192 [inline] __se_sys_preadv fs/read_write.c:1187 [inline] __x64_sys_preadv+0x22d/0x310 fs/read_write.c:1187 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Closes: https://syzkaller.appspot.com/bug?extid=1fc6f64c40a9d143cfb6 Signed-off-by: David Howells Link: https://lore.kernel.org/r/20241108034020.3695718-1-lizhi.xu@windriver.com/ [1] Link: https://lore.kernel.org/r/20241213135013.2964079-9-dhowells@redhat.com Tested-by: syzbot+885c03ad650731743489@syzkaller.appspotmail.com Suggested-by: Lizhi Xu cc: Dominique Martinet cc: Jeff Layton cc: v9fs@lists.linux.dev cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Reported-by: syzbot+885c03ad650731743489@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/9p/vfs_addr.c | 6 +++++- fs/afs/write.c | 5 ++++- fs/netfs/read_collect.c | 15 +++++++++------ fs/netfs/read_retry.c | 6 ++++-- fs/netfs/write_collect.c | 5 ++--- fs/netfs/write_issue.c | 2 ++ fs/smb/client/cifssmb.c | 13 +++++++++---- fs/smb/client/smb2pdu.c | 9 ++++++--- include/linux/netfs.h | 6 +++--- 9 files changed, 44 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 819c75233235..3bc9ce6c575e 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -57,6 +57,8 @@ static void v9fs_issue_write(struct netfs_io_subrequest *subreq) int err, len; len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err); + if (len > 0) + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); netfs_write_subrequest_terminated(subreq, len ?: err, false); } @@ -80,8 +82,10 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) if (pos + total >= i_size_read(rreq->inode)) __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); - if (!err) + if (!err) { subreq->transferred += total; + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + } netfs_read_subreq_terminated(subreq, err, false); } diff --git a/fs/afs/write.c b/fs/afs/write.c index 34107b55f834..ccb6aa8027c5 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -122,7 +122,7 @@ static void afs_issue_write_worker(struct work_struct *work) if (subreq->debug_index == 3) return netfs_write_subrequest_terminated(subreq, -ENOANO, false); - if (!test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) { + if (!subreq->retry_count) { set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); return netfs_write_subrequest_terminated(subreq, -EAGAIN, false); } @@ -149,6 +149,9 @@ static void afs_issue_write_worker(struct work_struct *work) afs_wait_for_operation(op); ret = afs_put_operation(op); switch (ret) { + case 0: + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + break; case -EACCES: case -EPERM: case -ENOKEY: diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 46ce3b7adf07..47ed3a5044e2 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -438,7 +438,7 @@ void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, rreq->origin == NETFS_READPAGE || rreq->origin == NETFS_READ_FOR_WRITE)) { netfs_consume_read_data(subreq, was_async); - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); } } EXPORT_SYMBOL(netfs_read_subreq_progress); @@ -497,7 +497,7 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, rreq->origin == NETFS_READPAGE || rreq->origin == NETFS_READ_FOR_WRITE)) { netfs_consume_read_data(subreq, was_async); - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); + __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); } rreq->transferred += subreq->transferred; } @@ -511,10 +511,13 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, } else { trace_netfs_sreq(subreq, netfs_sreq_trace_short); if (subreq->transferred > subreq->consumed) { - __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); - set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); - } else if (!__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { + /* If we didn't read new data, abandon retry. */ + if (subreq->retry_count && + test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) { + __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); + set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); + } + } else if (test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) { __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags); } else { diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 0350592ea804..0e72e9226fc8 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -56,6 +56,8 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) break; if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + subreq->retry_count++; netfs_reset_iter(subreq); netfs_reissue_read(rreq, subreq); } @@ -137,7 +139,8 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) stream0->sreq_max_len = subreq->len; __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); + subreq->retry_count++; spin_lock_bh(&rreq->lock); list_add_tail(&subreq->rreq_link, &rreq->subrequests); @@ -213,7 +216,6 @@ abandon: subreq->error = -ENOMEM; __clear_bit(NETFS_SREQ_FAILED, &subreq->flags); __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __clear_bit(NETFS_SREQ_RETRYING, &subreq->flags); } spin_lock_bh(&rreq->lock); list_splice_tail_init(&queue, &rreq->subrequests); diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 82290c92ba7a..ca3a11ed9b54 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -179,7 +179,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, struct iov_iter source = subreq->io_iter; iov_iter_revert(&source, subreq->len - source.count); - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_reissue_write(stream, subreq, &source); } @@ -234,7 +233,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, /* Renegotiate max_len (wsize) */ trace_netfs_sreq(subreq, netfs_sreq_trace_retry); __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + subreq->retry_count++; stream->prepare_write(subreq); part = min(len, stream->sreq_max_len); @@ -279,7 +278,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, subreq->start = start; subreq->debug_index = atomic_inc_return(&wreq->subreq_counter); subreq->stream_nr = to->stream_nr; - __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); + subreq->retry_count = 1; trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, refcount_read(&subreq->ref), diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index bf6d507578e5..ff0e82505a0b 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -244,6 +244,8 @@ void netfs_reissue_write(struct netfs_io_stream *stream, iov_iter_advance(source, size); iov_iter_truncate(&subreq->io_iter, size); + subreq->retry_count++; + __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); netfs_do_issue_write(stream, subreq); } diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index bd42a419458e..6cb1e81993f8 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1319,14 +1319,16 @@ cifs_readv_callback(struct mid_q_entry *mid) } if (rdata->result == -ENODATA) { - __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); } else { size_t trans = rdata->subreq.transferred + rdata->got_bytes; if (trans < rdata->subreq.len && rdata->subreq.start + trans == ictx->remote_i_size) { - __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + } else if (rdata->got_bytes > 0) { + __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); } } @@ -1670,10 +1672,13 @@ cifs_writev_callback(struct mid_q_entry *mid) if (written > wdata->subreq.len) written &= 0xFFFF; - if (written < wdata->subreq.len) + if (written < wdata->subreq.len) { result = -ENOSPC; - else + } else { result = written; + if (written > 0) + __set_bit(NETFS_SREQ_MADE_PROGRESS, &wdata->subreq.flags); + } break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 010eae9d6c47..458b53d1f9cb 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4615,6 +4615,7 @@ smb2_readv_callback(struct mid_q_entry *mid) __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; } + __set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags); } trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value, server->credits, server->in_flight, @@ -4840,10 +4841,12 @@ smb2_writev_callback(struct mid_q_entry *mid) if (written > wdata->subreq.len) written &= 0xFFFF; - if (written < wdata->subreq.len) + if (written < wdata->subreq.len) { wdata->result = -ENOSPC; - else + } else if (written > 0) { wdata->subreq.len = written; + __set_bit(NETFS_SREQ_MADE_PROGRESS, &wdata->subreq.flags); + } break; case MID_REQUEST_SUBMITTED: case MID_RETRY_NEEDED: @@ -5012,7 +5015,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) } #endif - if (test_bit(NETFS_SREQ_RETRYING, &wdata->subreq.flags)) + if (wdata->subreq.retry_count > 0) smb2_set_replay(server, &rqst); cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n", diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 5eaceef41e6c..4083d77e3f39 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -185,6 +185,7 @@ struct netfs_io_subrequest { short error; /* 0 or error that occurred */ unsigned short debug_index; /* Index in list (for debugging output) */ unsigned int nr_segs; /* Number of segs in io_iter */ + u8 retry_count; /* The number of retries (0 on initial pass) */ enum netfs_io_source source; /* Where to read from/write to */ unsigned char stream_nr; /* I/O stream this belongs to */ unsigned char curr_folioq_slot; /* Folio currently being read */ @@ -194,14 +195,13 @@ struct netfs_io_subrequest { #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ #define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */ #define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */ -#define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */ +#define NETFS_SREQ_MADE_PROGRESS 4 /* Set if we transferred at least some data */ #define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */ #define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. ceph object) */ #define NETFS_SREQ_HIT_EOF 7 /* Set if short due to EOF */ #define NETFS_SREQ_IN_PROGRESS 8 /* Unlocked when the subrequest completes */ #define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */ -#define NETFS_SREQ_RETRYING 10 /* Set if we're retrying */ -#define NETFS_SREQ_FAILED 11 /* Set if the subreq failed unretryably */ +#define NETFS_SREQ_FAILED 10 /* Set if the subreq failed unretryably */ }; enum netfs_io_origin { -- cgit v1.2.3 From d4e338de17cb6532bf805fae00db8b41e914009b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 16 Dec 2024 20:34:45 +0000 Subject: netfs: Fix is-caching check in read-retry netfs: Fix is-caching check in read-retry The read-retry code checks the NETFS_RREQ_COPY_TO_CACHE flag to determine if there might be failed reads from the cache that need turning into reads from the server, with the intention of skipping the complicated part if it can. The code that set the flag, however, got lost during the read-side rewrite. Fix the check to see if the cache_resources are valid instead. The flag can then be removed. Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading") Signed-off-by: David Howells Link: https://lore.kernel.org/r/3752048.1734381285@warthog.procyon.org.uk cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_retry.c | 2 +- include/linux/netfs.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 0e72e9226fc8..21b4a54e545e 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -49,7 +49,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) * up to the first permanently failed one. */ if (!rreq->netfs_ops->prepare_read && - !test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) { + !rreq->cache_resources.ops) { struct netfs_io_subrequest *subreq; list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 4083d77e3f39..ecdd5ced16a8 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -269,7 +269,6 @@ struct netfs_io_request { size_t prev_donated; /* Fallback for subreq->prev_donated */ refcount_t ref; unsigned long flags; -#define NETFS_RREQ_COPY_TO_CACHE 1 /* Need to write to the cache */ #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ -- cgit v1.2.3 From 0b7a66a2c864859fbf9bb16229c03172eef02c05 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 5 Dec 2024 17:06:02 +0100 Subject: preempt: Move PREEMPT_RT before PREEMPT in vermagic. Since the dynamic preemption has been enabled for PREEMPT_RT we have now CONFIG_PREEMPT and CONFIG_PREEMPT_RT set simultaneously. This affects the vermagic strings which comes now PREEMPT with PREEMPT_RT enabled. The PREEMPT_RT module usually can not be loaded on a PREEMPT kernel because some symbols are missing. However if the symbols are fine then it continues and it crashes later. The problem is that the struct module has a different layout and the num_exentries or init members are at a different position leading to a crash later on. This is not necessary caught by the size check in elf_validity_cache_index_mod() because the mem member has an alignment requirement of __module_memory_align which is big enough keep the total size unchanged. Therefore we should keep the string accurate instead of removing it. Move the PREEMPT_RT check before the PREEMPT so that it takes precedence if both symbols are enabled. Fixes: 35772d627b55c ("sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT") Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Petr Pavlu Link: https://lore.kernel.org/r/20241205160602.3lIAsJRT@linutronix.de Signed-off-by: Petr Pavlu --- include/linux/vermagic.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index a54046bf37e5..939ceabcaf06 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -15,10 +15,10 @@ #else #define MODULE_VERMAGIC_SMP "" #endif -#ifdef CONFIG_PREEMPT_BUILD -#define MODULE_VERMAGIC_PREEMPT "preempt " -#elif defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_PREEMPT_RT #define MODULE_VERMAGIC_PREEMPT "preempt_rt " +#elif defined(CONFIG_PREEMPT_BUILD) +#define MODULE_VERMAGIC_PREEMPT "preempt " #else #define MODULE_VERMAGIC_PREEMPT "" #endif -- cgit v1.2.3 From 2a4f56fbcc473d8faeb29b73082df39efbe5893c Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Fri, 20 Dec 2024 10:15:05 +0200 Subject: net/mlx5e: Keep netdev when leave switchdev for devlink set legacy only In the cited commit, when changing from switchdev to legacy mode, uplink representor's netdev is kept, and its profile is replaced with nic profile, so netdev is detached from old profile, then attach to new profile. During profile change, the hardware resources allocated by the old profile will be cleaned up. However, the cleanup is relying on the related kernel modules. And they may need to flush themselves first, which is triggered by netdev events, for example, NETDEV_UNREGISTER. However, netdev is kept, or netdev_register is called after the cleanup, which may cause troubles because the resources are still referred by kernel modules. The same process applies to all the caes when uplink is leaving switchdev mode, including devlink eswitch mode set legacy, driver unload and devlink reload. For the first one, it can be blocked and returns failure to users, whenever possible. But it's hard for the others. Besides, the attachment to nic profile is unnecessary as the netdev will be unregistered anyway for such cases. So in this patch, the original behavior is kept only for devlink eswitch set mode legacy. For the others, moves netdev unregistration before the profile change. Fixes: 7a9fb35e8c3a ("net/mlx5e: Do not reload ethernet ports when changing eswitch mode") Signed-off-by: Jianbo Liu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241220081505.1286093-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 19 +++++++++++++++++-- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 15 +++++++++++++++ .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 2 ++ include/linux/mlx5/driver.h | 1 + 4 files changed, 35 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index dd16d73000c3..0ec17c276bdd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -6542,8 +6542,23 @@ static void _mlx5e_remove(struct auxiliary_device *adev) mlx5_core_uplink_netdev_set(mdev, NULL); mlx5e_dcbnl_delete_app(priv); - unregister_netdev(priv->netdev); - _mlx5e_suspend(adev, false); + /* When unload driver, the netdev is in registered state + * if it's from legacy mode. If from switchdev mode, it + * is already unregistered before changing to NIC profile. + */ + if (priv->netdev->reg_state == NETREG_REGISTERED) { + unregister_netdev(priv->netdev); + _mlx5e_suspend(adev, false); + } else { + struct mlx5_core_dev *pos; + int i; + + if (test_bit(MLX5E_STATE_DESTROYING, &priv->state)) + mlx5_sd_for_each_dev(i, mdev, pos) + mlx5e_destroy_mdev_resources(pos); + else + _mlx5e_suspend(adev, true); + } /* Avoid cleanup if profile rollback failed. */ if (priv->profile) priv->profile->cleanup(priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 554f9cb5b53f..fdff9fd8a89e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1509,6 +1509,21 @@ mlx5e_vport_uplink_rep_unload(struct mlx5e_rep_priv *rpriv) priv = netdev_priv(netdev); + /* This bit is set when using devlink to change eswitch mode from + * switchdev to legacy. As need to keep uplink netdev ifindex, we + * detach uplink representor profile and attach NIC profile only. + * The netdev will be unregistered later when unload NIC auxiliary + * driver for this case. + * We explicitly block devlink eswitch mode change if any IPSec rules + * offloaded, but can't block other cases, such as driver unload + * and devlink reload. We have to unregister netdev before profile + * change for those cases. This is to avoid resource leak because + * the offloaded rules don't have the chance to be unoffloaded before + * cleanup which is triggered by detach uplink representor profile. + */ + if (!(priv->mdev->priv.flags & MLX5_PRIV_FLAGS_SWITCH_LEGACY)) + unregister_netdev(netdev); + mlx5e_netdev_attach_nic_profile(priv); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 40359f320724..06076dd9ec64 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -3777,6 +3777,8 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, esw->eswitch_operation_in_progress = true; up_write(&esw->mode_lock); + if (mode == DEVLINK_ESWITCH_MODE_LEGACY) + esw->dev->priv.flags |= MLX5_PRIV_FLAGS_SWITCH_LEGACY; mlx5_eswitch_disable_locked(esw); if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV) { if (mlx5_devlink_trap_get_num_active(esw->dev)) { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index fc7e6153b73d..8f5991168ccd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -524,6 +524,7 @@ enum { * creation/deletion on drivers rescan. Unset during device attach. */ MLX5_PRIV_FLAGS_DETACH = 1 << 2, + MLX5_PRIV_FLAGS_SWITCH_LEGACY = 1 << 3, }; struct mlx5_adev { -- cgit v1.2.3 From 452f4b31e3f70a52b97890888eeb9eaa9a87139a Mon Sep 17 00:00:00 2001 From: Christian Göttsche Date: Mon, 25 Nov 2024 11:50:25 +0100 Subject: tracing: Constify string literal data member in struct trace_event_call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The name member of the struct trace_event_call is assigned with generated string literals; declare them pointer to read-only. Reported by clang: security/landlock/syscalls.c:179:1: warning: initializing 'char *' with an expression of type 'const char[34]' discards qualifiers [-Wincompatible-pointer-types-discards-qualifiers] 179 | SYSCALL_DEFINE3(landlock_create_ruleset, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 180 | const struct landlock_ruleset_attr __user *const, attr, | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 181 | const size_t, size, const __u32, flags) | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:226:36: note: expanded from macro 'SYSCALL_DEFINE3' 226 | #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:234:2: note: expanded from macro 'SYSCALL_DEFINEx' 234 | SYSCALL_METADATA(sname, x, __VA_ARGS__) \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:184:2: note: expanded from macro 'SYSCALL_METADATA' 184 | SYSCALL_TRACE_ENTER_EVENT(sname); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/linux/syscalls.h:151:30: note: expanded from macro 'SYSCALL_TRACE_ENTER_EVENT' 151 | .name = "sys_enter"#sname, \ | ^~~~~~~~~~~~~~~~~ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mickaël Salaün Cc: Günther Noack Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Bill Wendling Cc: Justin Stitt Link: https://lore.kernel.org/20241125105028.42807-1-cgoettsche@seltendoof.de Fixes: b77e38aa240c3 ("tracing: add event trace infrastructure") Signed-off-by: Christian Göttsche Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 91b8ffbdfa8c..58ad4ead33fc 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -364,7 +364,7 @@ struct trace_event_call { struct list_head list; struct trace_event_class *class; union { - char *name; + const char *name; /* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */ struct tracepoint *tp; }; -- cgit v1.2.3 From f718faf3940e95d5d34af9041f279f598396ab7d Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 17 Dec 2024 00:48:18 +0000 Subject: freezer, sched: Report frozen tasks as 'D' instead of 'R' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before commit: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") the frozen task stat was reported as 'D' in cgroup v1. However, after rewriting the core freezer logic, the frozen task stat is reported as 'R'. This is confusing, especially when a task with stat of 'S' is frozen. This bug can be reproduced with these steps: $ cd /sys/fs/cgroup/freezer/ $ mkdir test $ sleep 1000 & [1] 739 // task whose stat is 'S' $ echo 739 > test/cgroup.procs $ echo FROZEN > test/freezer.state $ ps -aux | grep 739 root 739 0.1 0.0 8376 1812 pts/0 R 10:56 0:00 sleep 1000 As shown above, a task whose stat is 'S' was changed to 'R' when it was frozen. To solve this regression, simply maintain the same reported state as before the rewrite. [ mingo: Enhanced the changelog and comments ] Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") Signed-off-by: Chen Ridong Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Tejun Heo Acked-by: Michal Koutný Link: https://lore.kernel.org/r/20241217004818.3200515-1-chenridong@huaweicloud.com --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 66b311fbd5d6..64934e0830af 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1637,8 +1637,9 @@ static inline unsigned int __task_state_index(unsigned int tsk_state, * We're lying here, but rather than expose a completely new task state * to userspace, we can make this appear as if the task has gone through * a regular rt_mutex_lock() call. + * Report frozen tasks as uninterruptible. */ - if (tsk_state & TASK_RTLOCK_WAIT) + if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN)) state = TASK_UNINTERRUPTIBLE; return fls(state); -- cgit v1.2.3 From 8ec396d05d1b737c87311fb7311f753b02c2a6b1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 28 Nov 2024 15:06:17 +0000 Subject: mm: reinstate ability to map write-sealed memfd mappings read-only Patch series "mm: reinstate ability to map write-sealed memfd mappings read-only". In commit 158978945f31 ("mm: perform the mapping_map_writable() check after call_mmap()") (and preceding changes in the same series) it became possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only. Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") unintentionally undid this logic by moving the mapping_map_writable() check before the shmem_mmap() hook is invoked, thereby regressing this change. This series reworks how we both permit write-sealed mappings being mapped read-only and disallow mprotect() from undoing the write-seal, fixing this regression. We also add a regression test to ensure that we do not accidentally regress this in future. Thanks to Julian Orth for reporting this regression. This patch (of 2): In commit 158978945f31 ("mm: perform the mapping_map_writable() check after call_mmap()") (and preceding changes in the same series) it became possible to mmap() F_SEAL_WRITE sealed memfd mappings read-only. This was previously unnecessarily disallowed, despite the man page documentation indicating that it would be, thereby limiting the usefulness of F_SEAL_WRITE logic. We fixed this by adapting logic that existed for the F_SEAL_FUTURE_WRITE seal (one which disallows future writes to the memfd) to also be used for F_SEAL_WRITE. For background - the F_SEAL_FUTURE_WRITE seal clears VM_MAYWRITE for a read-only mapping to disallow mprotect() from overriding the seal - an operation performed by seal_check_write(), invoked from shmem_mmap(), the f_op->mmap() hook used by shmem mappings. By extending this to F_SEAL_WRITE and critically - checking mapping_map_writable() to determine if we may map the memfd AFTER we invoke shmem_mmap() - the desired logic becomes possible. This is because mapping_map_writable() explicitly checks for VM_MAYWRITE, which we will have cleared. Commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") unintentionally undid this logic by moving the mapping_map_writable() check before the shmem_mmap() hook is invoked, thereby regressing this change. We reinstate this functionality by moving the check out of shmem_mmap() and instead performing it in do_mmap() at the point at which VMA flags are being determined, which seems in any case to be a more appropriate place in which to make this determination. In order to achieve this we rework memfd seal logic to allow us access to this information using existing logic and eliminate the clearing of VM_MAYWRITE from seal_check_write() which we are performing in do_mmap() instead. Link: https://lkml.kernel.org/r/99fc35d2c62bd2e05571cf60d9f8b843c56069e0.1732804776.git.lorenzo.stoakes@oracle.com Fixes: 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour") Signed-off-by: Lorenzo Stoakes Reported-by: Julian Orth Closes: https://lore.kernel.org/all/CAHijbEUMhvJTN9Xw1GmbM266FXXv=U7s4L_Jem5x3AaPZxrYpQ@mail.gmail.com/ Cc: Jann Horn Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Shuah Khan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/memfd.h | 14 +++++++++++++ include/linux/mm.h | 58 +++++++++++++++++++++++++++++++++++---------------- mm/memfd.c | 2 +- mm/mmap.c | 4 ++++ 4 files changed, 59 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memfd.h b/include/linux/memfd.h index 3f2cf339ceaf..d437e3070850 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -7,6 +7,7 @@ #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); +unsigned int *memfd_file_seals_ptr(struct file *file); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { @@ -16,6 +17,19 @@ static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { return ERR_PTR(-EINVAL); } + +static inline unsigned int *memfd_file_seals_ptr(struct file *file) +{ + return NULL; +} #endif +/* Retrieve memfd seals associated with the file, if any. */ +static inline unsigned int memfd_file_seals(struct file *file) +{ + unsigned int *sealsp = memfd_file_seals_ptr(file); + + return sealsp ? *sealsp : 0; +} + #endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 338a76ce9083..fb397918c43d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4101,6 +4101,37 @@ void mem_dump_obj(void *object); static inline void mem_dump_obj(void *object) {} #endif +static inline bool is_write_sealed(int seals) +{ + return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); +} + +/** + * is_readonly_sealed - Checks whether write-sealed but mapped read-only, + * in which case writes should be disallowing moving + * forwards. + * @seals: the seals to check + * @vm_flags: the VMA flags to check + * + * Returns whether readonly sealed, in which case writess should be disallowed + * going forward. + */ +static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags) +{ + /* + * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as + * MAP_SHARED and read-only, take care to not allow mprotect to + * revert protections on such mappings. Do this only for shared + * mappings. For private mappings, don't need to mask + * VM_MAYWRITE as we still want them to be COW-writable. + */ + if (is_write_sealed(seals) && + ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED)) + return true; + + return false; +} + /** * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and * handle them. @@ -4112,24 +4143,15 @@ static inline void mem_dump_obj(void *object) {} */ static inline int seal_check_write(int seals, struct vm_area_struct *vma) { - if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * write seals are active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; - - /* - * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings. For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. - */ - if (vma->vm_flags & VM_SHARED) - vm_flags_clear(vma, VM_MAYWRITE); - } + if (!is_write_sealed(seals)) + return 0; + + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * write seals are active. + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EPERM; return 0; } diff --git a/mm/memfd.c b/mm/memfd.c index c17c3ea701a1..35a370d75c9a 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -170,7 +170,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) return error; } -static unsigned int *memfd_file_seals_ptr(struct file *file) +unsigned int *memfd_file_seals_ptr(struct file *file) { if (shmem_file(file)) return &SHMEM_I(file_inode(file))->seals; diff --git a/mm/mmap.c b/mm/mmap.c index d32b7e701058..16f8e8be01f8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -368,6 +369,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (file) { struct inode *inode = file_inode(file); + unsigned int seals = memfd_file_seals(file); unsigned long flags_mask; if (!file_mmap_ok(file, inode, pgoff, len)) @@ -408,6 +410,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); + else if (is_readonly_sealed(seals, vm_flags)) + vm_flags &= ~VM_MAYWRITE; fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) -- cgit v1.2.3 From 59d9094df3d79443937add8700b2ef1a866b1081 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 16 Dec 2024 15:11:47 +0800 Subject: mm: hugetlb: independent PMD page table shared count The folio refcount may be increased unexpectly through try_get_folio() by caller such as split_huge_pages. In huge_pmd_unshare(), we use refcount to check whether a pmd page table is shared. The check is incorrect if the refcount is increased by the above caller, and this can cause the page table leaked: BUG: Bad page state in process sh pfn:109324 page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x66 pfn:0x109324 flags: 0x17ffff800000000(node=0|zone=2|lastcpupid=0xfffff) page_type: f2(table) raw: 017ffff800000000 0000000000000000 0000000000000000 0000000000000000 raw: 0000000000000066 0000000000000000 00000000f2000000 0000000000000000 page dumped because: nonzero mapcount ... CPU: 31 UID: 0 PID: 7515 Comm: sh Kdump: loaded Tainted: G B 6.13.0-rc2master+ #7 Tainted: [B]=BAD_PAGE Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 Call trace: show_stack+0x20/0x38 (C) dump_stack_lvl+0x80/0xf8 dump_stack+0x18/0x28 bad_page+0x8c/0x130 free_page_is_bad_report+0xa4/0xb0 free_unref_page+0x3cc/0x620 __folio_put+0xf4/0x158 split_huge_pages_all+0x1e0/0x3e8 split_huge_pages_write+0x25c/0x2d8 full_proxy_write+0x64/0xd8 vfs_write+0xcc/0x280 ksys_write+0x70/0x110 __arm64_sys_write+0x24/0x38 invoke_syscall+0x50/0x120 el0_svc_common.constprop.0+0xc8/0xf0 do_el0_svc+0x24/0x38 el0_svc+0x34/0x128 el0t_64_sync_handler+0xc8/0xd0 el0t_64_sync+0x190/0x198 The issue may be triggered by damon, offline_page, page_idle, etc, which will increase the refcount of page table. 1. The page table itself will be discarded after reporting the "nonzero mapcount". 2. The HugeTLB page mapped by the page table miss freeing since we treat the page table as shared and a shared page table will not be unmapped. Fix it by introducing independent PMD page table shared count. As described by comment, pt_index/pt_mm/pt_frag_refcount are used for s390 gmap, x86 pgds and powerpc, pt_share_count is used for x86/arm64/riscv pmds, so we can reuse the field as pt_share_count. Link: https://lkml.kernel.org/r/20241216071147.3984217-1-liushixin2@huawei.com Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page") Signed-off-by: Liu Shixin Cc: Kefeng Wang Cc: Ken Chen Cc: Muchun Song Cc: Nanyong Sun Cc: Jane Chu Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + include/linux/mm_types.h | 30 ++++++++++++++++++++++++++++++ mm/hugetlb.c | 16 +++++++--------- 3 files changed, 38 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index fb397918c43d..b1c3db9cf355 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3125,6 +3125,7 @@ static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) if (!pmd_ptlock_init(ptdesc)) return false; __folio_set_pgtable(folio); + ptdesc_pmd_pts_init(ptdesc); lruvec_stat_add_folio(folio, NR_PAGETABLE); return true; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7361a8f3ab68..332cee285662 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -445,6 +445,7 @@ FOLIO_MATCH(compound_head, _head_2a); * @pt_index: Used for s390 gmap. * @pt_mm: Used for x86 pgds. * @pt_frag_refcount: For fragmented page table tracking. Powerpc only. + * @pt_share_count: Used for HugeTLB PMD page table share count. * @_pt_pad_2: Padding to ensure proper alignment. * @ptl: Lock for the page table. * @__page_type: Same as page->page_type. Unused for page tables. @@ -471,6 +472,9 @@ struct ptdesc { pgoff_t pt_index; struct mm_struct *pt_mm; atomic_t pt_frag_refcount; +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING + atomic_t pt_share_count; +#endif }; union { @@ -516,6 +520,32 @@ static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); const struct page *: (const struct ptdesc *)(p), \ struct page *: (struct ptdesc *)(p))) +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING +static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) +{ + atomic_set(&ptdesc->pt_share_count, 0); +} + +static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc) +{ + atomic_inc(&ptdesc->pt_share_count); +} + +static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc) +{ + atomic_dec(&ptdesc->pt_share_count); +} + +static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc) +{ + return atomic_read(&ptdesc->pt_share_count); +} +#else +static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc) +{ +} +#endif + /* * Used for sizing the vmemmap region on some architectures */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cec4b121193f..c498874a7170 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7211,7 +7211,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, spte = hugetlb_walk(svma, saddr, vma_mmu_pagesize(svma)); if (spte) { - get_page(virt_to_page(spte)); + ptdesc_pmd_pts_inc(virt_to_ptdesc(spte)); break; } } @@ -7226,7 +7226,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, (pmd_t *)((unsigned long)spte & PAGE_MASK)); mm_inc_nr_pmds(mm); } else { - put_page(virt_to_page(spte)); + ptdesc_pmd_pts_dec(virt_to_ptdesc(spte)); } spin_unlock(&mm->page_table_lock); out: @@ -7238,10 +7238,6 @@ out: /* * unmap huge page backed by shared pte. * - * Hugetlb pte page is ref counted at the time of mapping. If pte is shared - * indicated by page_count > 1, unmap is achieved by clearing pud and - * decrementing the ref count. If count == 1, the pte page is not shared. - * * Called with page table lock held. * * returns: 1 successfully unmapped a shared pte page @@ -7250,18 +7246,20 @@ out: int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + unsigned long sz = huge_page_size(hstate_vma(vma)); pgd_t *pgd = pgd_offset(mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); hugetlb_vma_assert_locked(vma); - BUG_ON(page_count(virt_to_page(ptep)) == 0); - if (page_count(virt_to_page(ptep)) == 1) + if (sz != PMD_SIZE) + return 0; + if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep))) return 0; pud_clear(pud); - put_page(virt_to_page(ptep)); + ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep)); mm_dec_nr_pmds(mm); return 1; } -- cgit v1.2.3 From 11673247700e2af3a6a95f7b3f1bb80b691c950e Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 19 Dec 2024 14:18:28 +0200 Subject: percpu: remove intermediate variable in PERCPU_PTR() The intermediate variable in the PERCPU_PTR() macro results in a kernel panic on boot [1] due to a compiler bug seen when compiling the kernel (+ KASAN) with gcc 11.3.1, but not when compiling with latest gcc (v14.2)/clang(v18.1). To solve it, remove the intermediate variable (which is not needed) and keep the casting that resolves the address space checks. [1] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000003: 0000 [#1] SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000018-0x000000000000001f] CPU: 0 UID: 0 PID: 547 Comm: iptables Not tainted 6.13.0-rc1_external_tested-master #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:nf_ct_netns_do_get+0x139/0x540 Code: 03 00 00 48 81 c4 88 00 00 00 5b 5d 41 5c 41 5d 41 5e 41 5f c3 4d 8d 75 08 48 b8 00 00 00 00 00 fc ff df 4c 89 f2 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e 27 03 00 00 41 8b 45 08 83 c0 RSP: 0018:ffff888116df75e8 EFLAGS: 00010207 RAX: dffffc0000000000 RBX: 1ffff11022dbeebe RCX: ffffffff839a2382 RDX: 0000000000000003 RSI: 0000000000000008 RDI: ffff88842ec46d10 RBP: 0000000000000002 R08: 0000000000000000 R09: fffffbfff0b0860c R10: ffff888116df75e8 R11: 0000000000000001 R12: ffffffff879d6a80 R13: 0000000000000016 R14: 000000000000001e R15: ffff888116df7908 FS: 00007fba01646740(0000) GS:ffff88842ec00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055bd901800d8 CR3: 00000001205f0003 CR4: 0000000000172eb0 Call Trace: ? die_addr+0x3d/0xa0 ? exc_general_protection+0x144/0x220 ? asm_exc_general_protection+0x22/0x30 ? __mutex_lock+0x2c2/0x1d70 ? nf_ct_netns_do_get+0x139/0x540 ? nf_ct_netns_do_get+0xb5/0x540 ? net_generic+0x1f0/0x1f0 ? __create_object+0x5e/0x80 xt_check_target+0x1f0/0x930 ? textify_hooks.constprop.0+0x110/0x110 ? pcpu_alloc_noprof+0x7cd/0xcf0 ? xt_find_target+0x148/0x1e0 find_check_entry.constprop.0+0x6c0/0x920 ? get_info+0x380/0x380 ? __virt_addr_valid+0x1df/0x3b0 ? kasan_quarantine_put+0xe3/0x200 ? kfree+0x13e/0x3d0 ? translate_table+0xaf5/0x1750 translate_table+0xbd8/0x1750 ? ipt_unregister_table_exit+0x30/0x30 ? __might_fault+0xbb/0x170 do_ipt_set_ctl+0x408/0x1340 ? nf_sockopt_find.constprop.0+0x17b/0x1f0 ? lock_downgrade+0x680/0x680 ? lockdep_hardirqs_on_prepare+0x284/0x400 ? ipt_register_table+0x440/0x440 ? bit_wait_timeout+0x160/0x160 nf_setsockopt+0x6f/0xd0 raw_setsockopt+0x7e/0x200 ? raw_bind+0x590/0x590 ? do_user_addr_fault+0x812/0xd20 do_sock_setsockopt+0x1e2/0x3f0 ? move_addr_to_user+0x90/0x90 ? lock_downgrade+0x680/0x680 __sys_setsockopt+0x9e/0x100 __x64_sys_setsockopt+0xb9/0x150 ? do_syscall_64+0x33/0x140 do_syscall_64+0x6d/0x140 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7fba015134ce Code: 0f 1f 40 00 48 8b 15 59 69 0e 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b1 0f 1f 00 f3 0f 1e fa 49 89 ca b8 36 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 0a c3 66 0f 1f 84 00 00 00 00 00 48 8b 15 21 RSP: 002b:00007ffd9de6f388 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 000055bd9017f490 RCX: 00007fba015134ce RDX: 0000000000000040 RSI: 0000000000000000 RDI: 0000000000000004 RBP: 0000000000000500 R08: 0000000000000560 R09: 0000000000000052 R10: 000055bd901800e0 R11: 0000000000000246 R12: 000055bd90180140 R13: 000055bd901800e0 R14: 000055bd9017f498 R15: 000055bd9017ff10 Modules linked in: xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay zram zsmalloc mlx4_ib mlx4_en mlx4_core rpcrdma rdma_ucm ib_uverbs ib_iser libiscsi scsi_transport_iscsi fuse ib_umad rdma_cm ib_ipoib iw_cm ib_cm ib_core ---[ end trace 0000000000000000 ]--- [akpm@linux-foundation.org: simplification, per Uros] Link: https://lkml.kernel.org/r/20241219121828.2120780-1-gal@nvidia.com Fixes: dabddd687c9e ("percpu: cast percpu pointer in PERCPU_PTR() via unsigned long") Signed-off-by: Gal Pressman Closes: https://lore.kernel.org/all/7590f546-4021-4602-9252-0d525de35b52@nvidia.com Cc: Uros Bizjak Cc: Bill Wendling Cc: Christoph Lameter Cc: Dennis Zhou Cc: Justin Stitt Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Tejun Heo Signed-off-by: Andrew Morton --- include/linux/percpu-defs.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 35842d1e3879..5b520fe86b60 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -221,10 +221,7 @@ do { \ } while (0) #define PERCPU_PTR(__p) \ -({ \ - unsigned long __pcpu_ptr = (__force unsigned long)(__p); \ - (typeof(*(__p)) __force __kernel *)(__pcpu_ptr); \ -}) + (typeof(*(__p)) __force __kernel *)((__force unsigned long)(__p)) #ifdef CONFIG_SMP -- cgit v1.2.3 From f91a5b8089389eb408501af2762f168c3aaa7b79 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2024 16:10:04 +0000 Subject: af_packet: fix vlan_get_protocol_dgram() vs MSG_PEEK Blamed commit forgot MSG_PEEK case, allowing a crash [1] as found by syzbot. Rework vlan_get_protocol_dgram() to not touch skb at all, so that it can be used from many cpus on the same skb. Add a const qualifier to skb argument. [1] skbuff: skb_under_panic: text:ffffffff8a8ccd05 len:29 put:14 head:ffff88807fc8e400 data:ffff88807fc8e3f4 tail:0x11 end:0x140 dev: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:206 ! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 UID: 0 PID: 5892 Comm: syz-executor883 Not tainted 6.13.0-rc4-syzkaller-00054-gd6ef8b40d075 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:skb_panic net/core/skbuff.c:206 [inline] RIP: 0010:skb_under_panic+0x14b/0x150 net/core/skbuff.c:216 Code: 0b 8d 48 c7 c6 86 d5 25 8e 48 8b 54 24 08 8b 0c 24 44 8b 44 24 04 4d 89 e9 50 41 54 41 57 41 56 e8 5a 69 79 f7 48 83 c4 20 90 <0f> 0b 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 RSP: 0018:ffffc900038d7638 EFLAGS: 00010282 RAX: 0000000000000087 RBX: dffffc0000000000 RCX: 609ffd18ea660600 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff88802483c8d0 R08: ffffffff817f0a8c R09: 1ffff9200071ae60 R10: dffffc0000000000 R11: fffff5200071ae61 R12: 0000000000000140 R13: ffff88807fc8e400 R14: ffff88807fc8e3f4 R15: 0000000000000011 FS: 00007fbac5e006c0(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fbac5e00d58 CR3: 000000001238e000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_push+0xe5/0x100 net/core/skbuff.c:2636 vlan_get_protocol_dgram+0x165/0x290 net/packet/af_packet.c:585 packet_recvmsg+0x948/0x1ef0 net/packet/af_packet.c:3552 sock_recvmsg_nosec net/socket.c:1033 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1055 ____sys_recvmsg+0x1c6/0x480 net/socket.c:2803 ___sys_recvmsg net/socket.c:2845 [inline] do_recvmmsg+0x426/0xab0 net/socket.c:2940 __sys_recvmmsg net/socket.c:3014 [inline] __do_sys_recvmmsg net/socket.c:3037 [inline] __se_sys_recvmmsg net/socket.c:3030 [inline] __x64_sys_recvmmsg+0x199/0x250 net/socket.c:3030 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 79eecf631c14 ("af_packet: Handle outgoing VLAN packets without hardware offloading") Reported-by: syzbot+74f70bb1cb968bf09e4f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772c485.050a0220.2f3838.04c5.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Chengen Du Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241230161004.2681892-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/if_vlan.h | 16 +++++++++++++--- net/packet/af_packet.c | 16 ++++------------ 2 files changed, 17 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index c1645c86eed9..d65b5d71b93b 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -585,13 +585,16 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query * @type: first vlan protocol + * @mac_offset: MAC offset * @depth: buffer to store length of eth and vlan tags in bytes * * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ -static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, - int *depth) +static inline __be16 __vlan_get_protocol_offset(const struct sk_buff *skb, + __be16 type, + int mac_offset, + int *depth) { unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; @@ -610,7 +613,8 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, do { struct vlan_hdr vhdr, *vh; - vh = skb_header_pointer(skb, vlan_depth, sizeof(vhdr), &vhdr); + vh = skb_header_pointer(skb, mac_offset + vlan_depth, + sizeof(vhdr), &vhdr); if (unlikely(!vh || !--parse_depth)) return 0; @@ -625,6 +629,12 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, return type; } +static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, + int *depth) +{ + return __vlan_get_protocol_offset(skb, type, 0, depth); +} + /** * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e2e34a49e98d..2d73769d67f4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -568,21 +568,13 @@ static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev) return ntohs(vh->h_vlan_TCI); } -static __be16 vlan_get_protocol_dgram(struct sk_buff *skb) +static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb) { __be16 proto = skb->protocol; - if (unlikely(eth_type_vlan(proto))) { - u8 *skb_orig_data = skb->data; - int skb_orig_len = skb->len; - - skb_push(skb, skb->data - skb_mac_header(skb)); - proto = __vlan_get_protocol(skb, proto, NULL); - if (skb_orig_data != skb->data) { - skb->data = skb_orig_data; - skb->len = skb_orig_len; - } - } + if (unlikely(eth_type_vlan(proto))) + proto = __vlan_get_protocol_offset(skb, proto, + skb_mac_offset(skb), NULL); return proto; } -- cgit v1.2.3 From 45d339fefaa3dcd237038769e0d34584fb867390 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Thu, 19 Dec 2024 14:23:36 +0200 Subject: RDMA/mlx5: Enable multiplane mode only when it is supported Driver queries vport_cxt.num_plane and enables multiplane when it is greater then 0, but some old FWs (versions from x.40.1000 till x.42.1000), report vport_cxt.num_plane = 1 unexpectedly. Fix it by querying num_plane only when HCA_CAP2.multiplane bit is set. Fixes: 2a5db20fa532 ("RDMA/mlx5: Add support to multi-plane device and port") Link: https://patch.msgid.link/r/1ef901acdf564716fcf550453cf5e94f343777ec.1734610916.git.leon@kernel.org Cc: stable@vger.kernel.org Reported-by: Francesco Poli Closes: https://lore.kernel.org/all/nvs4i2v7o6vn6zhmtq4sgazy2hu5kiulukxcntdelggmznnl7h@so3oul6uwgbl/ Signed-off-by: Mark Zhang Signed-off-by: Leon Romanovsky Reviewed-by: Michal Swiatkowski Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 2 +- include/linux/mlx5/mlx5_ifc.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c2314797afc9..f5b59d02f4d3 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2839,7 +2839,7 @@ static int mlx5_ib_get_plane_num(struct mlx5_core_dev *mdev, u8 *num_plane) int err; *num_plane = 0; - if (!MLX5_CAP_GEN(mdev, ib_virt)) + if (!MLX5_CAP_GEN(mdev, ib_virt) || !MLX5_CAP_GEN_2(mdev, multiplane)) return 0; err = mlx5_query_hca_vport_context(mdev, 0, 1, 0, &vport_ctx); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4fbbcf35498b..48d47181c7cd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2119,7 +2119,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 migration_in_chunks[0x1]; u8 reserved_at_d1[0x1]; u8 sf_eq_usage[0x1]; - u8 reserved_at_d3[0xd]; + u8 reserved_at_d3[0x5]; + u8 multiplane[0x1]; + u8 reserved_at_d9[0x7]; u8 cross_vhca_object_to_object_supported[0x20]; -- cgit v1.2.3 From dadf03cfd4eaa09f1d0e8b2521de1e11d3e3bec1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Jan 2025 15:02:23 +0000 Subject: io_uring/cmd: rename struct uring_cache to io_uring_cmd_data In preparation for making this more generically available for ->uring_cmd() usage that needs stable command data, rename it and move it to io_uring/cmd.h instead. Signed-off-by: Jens Axboe Signed-off-by: David Sterba --- include/linux/io_uring/cmd.h | 4 ++++ io_uring/io_uring.c | 2 +- io_uring/opdef.c | 3 ++- io_uring/uring_cmd.c | 10 +++++----- io_uring/uring_cmd.h | 4 ---- 5 files changed, 12 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index c189d36ad55e..24cff2b9b9d4 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -18,6 +18,10 @@ struct io_uring_cmd { u8 pdu[32]; /* available inline for free use */ }; +struct io_uring_cmd_data { + struct io_uring_sqe sqes[2]; +}; + static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) { return sqe->cmd; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b2736e3491b8..8ae6bf746fcc 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -315,7 +315,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_rw)); ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct uring_cache)); + sizeof(struct io_uring_cmd_data)); spin_lock_init(&ctx->msg_lock); ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_kiocb)); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index a2be3bbca5ff..c7746f67cc65 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "io_uring.h" #include "opdef.h" @@ -414,7 +415,7 @@ const struct io_issue_def io_issue_defs[] = { .plug = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = 2 * sizeof(struct io_uring_sqe), + .async_size = sizeof(struct io_uring_cmd_data), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, }, diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index e2e8485932d6..eefc203a1214 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -16,10 +16,10 @@ #include "rsrc.h" #include "uring_cmd.h" -static struct uring_cache *io_uring_async_get(struct io_kiocb *req) +static struct io_uring_cmd_data *io_uring_async_get(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct uring_cache *cache; + struct io_uring_cmd_data *cache; cache = io_alloc_cache_get(&ctx->uring_cache); if (cache) { @@ -35,7 +35,7 @@ static struct uring_cache *io_uring_async_get(struct io_kiocb *req) static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct uring_cache *cache = req->async_data; + struct io_uring_cmd_data *cache = req->async_data; if (issue_flags & IO_URING_F_UNLOCKED) return; @@ -183,7 +183,7 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct uring_cache *cache; + struct io_uring_cmd_data *cache; cache = io_uring_async_get(req); if (unlikely(!cache)) @@ -256,7 +256,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) ret = file->f_op->uring_cmd(ioucmd, issue_flags); if (ret == -EAGAIN) { - struct uring_cache *cache = req->async_data; + struct io_uring_cmd_data *cache = req->async_data; if (ioucmd->sqe != (void *) cache) memcpy(cache, ioucmd->sqe, uring_sqe_size(req->ctx)); diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index a361f98664d2..515823ca68b8 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -1,9 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -struct uring_cache { - struct io_uring_sqe sqes[2]; -}; - int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -- cgit v1.2.3 From 3347fa658a1baecd61b007787d031b729cd86537 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Jan 2025 15:02:24 +0000 Subject: io_uring/cmd: add per-op data to struct io_uring_cmd_data In case an op handler for ->uring_cmd() needs stable storage for user data, it can allocate io_uring_cmd_data->op_data and use it for the duration of the request. When the request gets cleaned up, uring_cmd will free it automatically. Signed-off-by: Jens Axboe Signed-off-by: David Sterba --- include/linux/io_uring/cmd.h | 1 + io_uring/uring_cmd.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 24cff2b9b9d4..3df6636ec3a3 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -20,6 +20,7 @@ struct io_uring_cmd { struct io_uring_cmd_data { struct io_uring_sqe sqes[2]; + void *op_data; }; static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index eefc203a1214..019d6f49ff20 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -23,12 +23,16 @@ static struct io_uring_cmd_data *io_uring_async_get(struct io_kiocb *req) cache = io_alloc_cache_get(&ctx->uring_cache); if (cache) { + cache->op_data = NULL; req->flags |= REQ_F_ASYNC_DATA; req->async_data = cache; return cache; } - if (!io_alloc_async_data(req)) - return req->async_data; + if (!io_alloc_async_data(req)) { + cache = req->async_data; + cache->op_data = NULL; + return cache; + } return NULL; } @@ -37,6 +41,11 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct io_uring_cmd_data *cache = req->async_data; + if (cache->op_data) { + kfree(cache->op_data); + cache->op_data = NULL; + } + if (issue_flags & IO_URING_F_UNLOCKED) return; if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) { -- cgit v1.2.3 From b0af20d33f63c74985a6dd98344326e5111b2fea Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Fri, 3 Jan 2025 15:02:25 +0000 Subject: io_uring: add io_uring_cmd_get_async_data helper Add a helper function in include/linux/io_uring/cmd.h to read the async_data pointer from a struct io_uring_cmd. Signed-off-by: Mark Harmstone Signed-off-by: David Sterba --- include/linux/io_uring/cmd.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 3df6636ec3a3..b0aeec834c1d 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -118,4 +118,9 @@ static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd return cmd_to_io_kiocb(cmd)->task; } +static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_uring_cmd *cmd) +{ + return cmd_to_io_kiocb(cmd)->async_data; +} + #endif /* _LINUX_IO_URING_CMD_H */ -- cgit v1.2.3 From 1156b5e8be98c97087f8971609c852e418daf03b Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Sat, 4 Jan 2025 17:20:57 +0530 Subject: regulator: Guard of_regulator_bulk_get_all() with CONFIG_OF Since the definition is in drivers/regulator/of_regulator.c and compiled only if CONFIG_OF is enabled, building the consumer driver without CONFIG_OF and with CONFIG_REGULATOR will result in below build error: ERROR: modpost: "of_regulator_bulk_get_all" [drivers/pci/pwrctrl/pci-pwrctl-slot.ko] undefined! Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412181640.12Iufkvd-lkp@intel.com/ Fixes: 27b9ecc7a9ba ("regulator: Add of_regulator_bulk_get_all") Signed-off-by: Manivannan Sadhasivam Reviewed-by: Bartosz Golaszewski Link: https://patch.msgid.link/20250104115058.19216-2-manivannan.sadhasivam@linaro.org Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 8c3c372ad735..85be83c8fa17 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -175,6 +175,8 @@ struct regulator *__must_check of_regulator_get_optional(struct device *dev, struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, struct device_node *node, const char *id); +int __must_check of_regulator_bulk_get_all(struct device *dev, struct device_node *np, + struct regulator_bulk_data **consumers); #else static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, struct device_node *node, @@ -189,6 +191,13 @@ static inline struct regulator *__must_check devm_of_regulator_get_optional(stru { return ERR_PTR(-ENODEV); } + +static inline int of_regulator_bulk_get_all(struct device *dev, struct device_node *np, + struct regulator_bulk_data **consumers) +{ + return 0; +} + #endif int regulator_register_supply_alias(struct device *dev, const char *id, @@ -223,8 +232,6 @@ int regulator_disable_deferred(struct regulator *regulator, int ms); int __must_check regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data *consumers); -int __must_check of_regulator_bulk_get_all(struct device *dev, struct device_node *np, - struct regulator_bulk_data **consumers); int __must_check devm_regulator_bulk_get(struct device *dev, int num_consumers, struct regulator_bulk_data *consumers); void devm_regulator_bulk_put(struct regulator_bulk_data *consumers); @@ -483,12 +490,6 @@ static inline int devm_regulator_bulk_get(struct device *dev, int num_consumers, return 0; } -static inline int of_regulator_bulk_get_all(struct device *dev, struct device_node *np, - struct regulator_bulk_data **consumers) -{ - return 0; -} - static inline int devm_regulator_bulk_get_const( struct device *dev, int num_consumers, const struct regulator_bulk_data *in_consumers, -- cgit v1.2.3 From 907af7d6e0c8cf4086b1bc5218281b2ca09f130b Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Sat, 4 Jan 2025 17:20:58 +0530 Subject: regulator: Move OF_ API declarations/definitions outside CONFIG_REGULATOR Since these are hidden inside CONFIG_REGULATOR, building the consumer drivers without CONFIG_REGULATOR will result in the following build error: >> drivers/pci/pwrctrl/slot.c:39:15: error: implicit declaration of function 'of_regulator_bulk_get_all'; did you mean 'regulator_bulk_get'? [-Werror=implicit-function-declaration] 39 | ret = of_regulator_bulk_get_all(dev, dev_of_node(dev), | ^~~~~~~~~~~~~~~~~~~~~~~~~ | regulator_bulk_get cc1: some warnings being treated as errors This also removes the duplicated definitions that were possibly added to fix the build issues. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501020407.HmQQQKa0-lkp@intel.com/ Fixes: 27b9ecc7a9ba ("regulator: Add of_regulator_bulk_get_all") Signed-off-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20250104115058.19216-3-manivannan.sadhasivam@linaro.org Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 78 ++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 85be83c8fa17..bcba3935c6f9 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -168,38 +168,6 @@ int devm_regulator_get_enable_read_voltage(struct device *dev, const char *id); void regulator_put(struct regulator *regulator); void devm_regulator_put(struct regulator *regulator); -#if IS_ENABLED(CONFIG_OF) -struct regulator *__must_check of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id); -struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id); -int __must_check of_regulator_bulk_get_all(struct device *dev, struct device_node *np, - struct regulator_bulk_data **consumers); -#else -static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id) -{ - return ERR_PTR(-ENODEV); -} - -static inline struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id) -{ - return ERR_PTR(-ENODEV); -} - -static inline int of_regulator_bulk_get_all(struct device *dev, struct device_node *np, - struct regulator_bulk_data **consumers) -{ - return 0; -} - -#endif - int regulator_register_supply_alias(struct device *dev, const char *id, struct device *alias_dev, const char *alias_id); @@ -380,20 +348,6 @@ devm_regulator_get_optional(struct device *dev, const char *id) return ERR_PTR(-ENODEV); } -static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id) -{ - return ERR_PTR(-ENODEV); -} - -static inline struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, - struct device_node *node, - const char *id) -{ - return ERR_PTR(-ENODEV); -} - static inline void regulator_put(struct regulator *regulator) { } @@ -701,6 +655,38 @@ regulator_is_equal(struct regulator *reg1, struct regulator *reg2) } #endif +#if IS_ENABLED(CONFIG_OF) && IS_ENABLED(CONFIG_REGULATOR) +struct regulator *__must_check of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id); +struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id); +int __must_check of_regulator_bulk_get_all(struct device *dev, struct device_node *np, + struct regulator_bulk_data **consumers); +#else +static inline struct regulator *__must_check of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id) +{ + return ERR_PTR(-ENODEV); +} + +static inline struct regulator *__must_check devm_of_regulator_get_optional(struct device *dev, + struct device_node *node, + const char *id) +{ + return ERR_PTR(-ENODEV); +} + +static inline int of_regulator_bulk_get_all(struct device *dev, struct device_node *np, + struct regulator_bulk_data **consumers) +{ + return 0; +} + +#endif + static inline int regulator_set_voltage_triplet(struct regulator *regulator, int min_uV, int target_uV, int max_uV) -- cgit v1.2.3 From 344bac8f0d73fe970cd9f5b2f132906317d29e8b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 15 Dec 2024 21:17:05 +0100 Subject: fs: kill MNT_ONRB Move mnt->mnt_node into the union with mnt->mnt_rcu and mnt->mnt_llist instead of keeping it with mnt->mnt_list. This allows us to use RB_CLEAR_NODE(&mnt->mnt_node) in umount_tree() as well as list_empty(&mnt->mnt_node). That in turn allows us to remove MNT_ONRB. This also fixes the bug reported in [1] where seemingly MNT_ONRB wasn't set in @mnt->mnt_flags even though the mount was present in the mount rbtree of the mount namespace. The root cause is the following race. When a btrfs subvolume is mounted a temporary mount is created: btrfs_get_tree_subvol() { mnt = fc_mount() // Register the newly allocated mount with sb->mounts: lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); } and registered on sb->s_mounts. Later it is added to an anonymous mount namespace via mount_subvol(): -> mount_subvol() -> mount_subtree() -> alloc_mnt_ns() mnt_add_to_ns() vfs_path_lookup() put_mnt_ns() The mnt_add_to_ns() call raises MNT_ONRB in @mnt->mnt_flags. If someone concurrently does a ro remount: reconfigure_super() -> sb_prepare_remount_readonly() { list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { } all mounts registered in sb->s_mounts are visited and first MNT_WRITE_HOLD is raised, then MNT_READONLY is raised, and finally MNT_WRITE_HOLD is removed again. The flag modification for MNT_WRITE_HOLD/MNT_READONLY and MNT_ONRB race so MNT_ONRB might be lost. Fixes: 2eea9ce4310d ("mounts: keep list of mounts in an rbtree") Cc: # v6.8+ Link: https://lore.kernel.org/r/20241215-vfs-6-14-mount-work-v1-1-fd55922c4af8@kernel.org Link: https://lore.kernel.org/r/ec6784ed-8722-4695-980a-4400d4e7bd1a@gmx.com [1] Signed-off-by: Christian Brauner --- fs/mount.h | 15 +++++++++------ fs/namespace.c | 14 ++++++-------- include/linux/mount.h | 3 +-- 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/fs/mount.h b/fs/mount.h index 185fc56afc13..179f690a0c72 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -38,6 +38,7 @@ struct mount { struct dentry *mnt_mountpoint; struct vfsmount mnt; union { + struct rb_node mnt_node; /* node in the ns->mounts rbtree */ struct rcu_head mnt_rcu; struct llist_node mnt_llist; }; @@ -51,10 +52,7 @@ struct mount { struct list_head mnt_child; /* and going through their mnt_child */ struct list_head mnt_instance; /* mount instance on sb->s_mounts */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ - union { - struct rb_node mnt_node; /* Under ns->mounts */ - struct list_head mnt_list; - }; + struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ struct list_head mnt_share; /* circular list of shared mounts */ struct list_head mnt_slave_list;/* list of slave mounts */ @@ -145,11 +143,16 @@ static inline bool is_anon_ns(struct mnt_namespace *ns) return ns->seq == 0; } +static inline bool mnt_ns_attached(const struct mount *mnt) +{ + return !RB_EMPTY_NODE(&mnt->mnt_node); +} + static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) { - WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB)); - mnt->mnt.mnt_flags &= ~MNT_ONRB; + WARN_ON(!mnt_ns_attached(mnt)); rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts); + RB_CLEAR_NODE(&mnt->mnt_node); list_add_tail(&mnt->mnt_list, dt_list); } diff --git a/fs/namespace.c b/fs/namespace.c index 23e81c2a1e3f..847fa8443e8a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -344,6 +344,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); + RB_CLEAR_NODE(&mnt->mnt_node); mnt->mnt.mnt_idmap = &nop_mnt_idmap; } return mnt; @@ -1124,7 +1125,7 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) struct rb_node **link = &ns->mounts.rb_node; struct rb_node *parent = NULL; - WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB); + WARN_ON(mnt_ns_attached(mnt)); mnt->mnt_ns = ns; while (*link) { parent = *link; @@ -1135,7 +1136,6 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) } rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); - mnt->mnt.mnt_flags |= MNT_ONRB; } /* @@ -1305,7 +1305,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } mnt->mnt.mnt_flags = old->mnt.mnt_flags; - mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB); + mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); atomic_inc(&sb->s_active); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); @@ -1763,7 +1763,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) /* Gather the mounts to umount */ for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt.mnt_flags |= MNT_UMOUNT; - if (p->mnt.mnt_flags & MNT_ONRB) + if (mnt_ns_attached(p)) move_from_ns(p, &tmp_list); else list_move(&p->mnt_list, &tmp_list); @@ -1912,16 +1912,14 @@ static int do_umount(struct mount *mnt, int flags) event++; if (flags & MNT_DETACH) { - if (mnt->mnt.mnt_flags & MNT_ONRB || - !list_empty(&mnt->mnt_list)) + if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { - if (mnt->mnt.mnt_flags & MNT_ONRB || - !list_empty(&mnt->mnt_list)) + if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } diff --git a/include/linux/mount.h b/include/linux/mount.h index c34c18b4e8f3..04213d8ef837 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -50,7 +50,7 @@ struct path; #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB) + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) #define MNT_INTERNAL 0x4000 @@ -64,7 +64,6 @@ struct path; #define MNT_SYNC_UMOUNT 0x2000000 #define MNT_MARKED 0x4000000 #define MNT_UMOUNT 0x8000000 -#define MNT_ONRB 0x10000000 struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ -- cgit v1.2.3 From 155c5bf26f983e9988333eeb0ef217138304d13b Mon Sep 17 00:00:00 2001 From: guanjing Date: Fri, 20 Dec 2024 09:33:35 +0100 Subject: firewall: remove misplaced semicolon from stm32_firewall_get_firewall Remove misplaced colon in stm32_firewall_get_firewall() which results in a syntax error when the code is compiled without CONFIG_STM32_FIREWALL. Fixes: 5c9668cfc6d7 ("firewall: introduce stm32_firewall framework") Signed-off-by: guanjing Reviewed-by: Gatien Chevallier Signed-off-by: Alexandre Torgue Signed-off-by: Arnd Bergmann --- include/linux/bus/stm32_firewall_device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bus/stm32_firewall_device.h b/include/linux/bus/stm32_firewall_device.h index 18e0a2fc3816..5178b72bc920 100644 --- a/include/linux/bus/stm32_firewall_device.h +++ b/include/linux/bus/stm32_firewall_device.h @@ -115,7 +115,7 @@ void stm32_firewall_release_access_by_id(struct stm32_firewall *firewall, u32 su #else /* CONFIG_STM32_FIREWALL */ int stm32_firewall_get_firewall(struct device_node *np, struct stm32_firewall *firewall, - unsigned int nb_firewall); + unsigned int nb_firewall) { return -ENODEV; } -- cgit v1.2.3 From cacd9ae4bf801ff4125d8961bb9a3ba955e51680 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 7 Jan 2025 17:27:17 +0100 Subject: poll_wait: add mb() to fix theoretical race between waitqueue_active() and .poll() As the comment above waitqueue_active() explains, it can only be used if both waker and waiter have mb()'s that pair with each other. However __pollwait() is broken in this respect. This is not pipe-specific, but let's look at pipe_poll() for example: poll_wait(...); // -> __pollwait() -> add_wait_queue() LOAD(pipe->head); LOAD(pipe->head); In theory these LOAD()'s can leak into the critical section inside add_wait_queue() and can happen before list_add(entry, wq_head), in this case pipe_poll() can race with wakeup_pipe_readers/writers which do smp_mb(); if (waitqueue_active(wq_head)) wake_up_interruptible(wq_head); There are more __pollwait()-like functions (grep init_poll_funcptr), and it seems that at least ep_ptable_queue_proc() has the same problem, so the patch adds smp_mb() into poll_wait(). Link: https://lore.kernel.org/all/20250102163320.GA17691@redhat.com/ Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250107162717.GA18922@redhat.com Signed-off-by: Christian Brauner --- include/linux/poll.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/poll.h b/include/linux/poll.h index d1ea4f3714a8..fc641b50f129 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -41,8 +41,16 @@ typedef struct poll_table_struct { static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { - if (p && p->_qproc && wait_address) + if (p && p->_qproc && wait_address) { p->_qproc(filp, wait_address, p); + /* + * This memory barrier is paired in the wq_has_sleeper(). + * See the comment above prepare_to_wait(), we need to + * ensure that subsequent tests in this thread can't be + * reordered with __add_wait_queue() in _qproc() paths. + */ + smp_mb(); + } } /* -- cgit v1.2.3 From 10b02a2cfec2f106db4897ad87732db56d71e6fd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 7 Jan 2025 17:27:24 +0100 Subject: poll_wait: kill the obsolete wait_address check This check is historical and no longer needed, wait_address is never NULL. These days we rely on the poll_table->_qproc check. NULL if select/poll is not going to sleep, or it already has a data to report, or all waiters have already been registered after the 1st iteration. However, poll_table *p can be NULL, see p9_fd_poll() for example, so we can't remove the "p != NULL" check. Link: https://lore.kernel.org/all/20250106180325.GF7233@redhat.com/ Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250107162724.GA18926@redhat.com Signed-off-by: Christian Brauner --- include/linux/poll.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/poll.h b/include/linux/poll.h index fc641b50f129..57b6d1ccd8bf 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -41,7 +41,7 @@ typedef struct poll_table_struct { static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { - if (p && p->_qproc && wait_address) { + if (p && p->_qproc) { p->_qproc(filp, wait_address, p); /* * This memory barrier is paired in the wq_has_sleeper(). -- cgit v1.2.3 From f005bf18a57aadf3af1e85a0f0151cb3688ee606 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 7 Jan 2025 17:27:43 +0100 Subject: poll: kill poll_does_not_wait() It no longer has users. Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20250107162743.GA18947@redhat.com Signed-off-by: Christian Brauner --- include/linux/poll.h | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/poll.h b/include/linux/poll.h index 57b6d1ccd8bf..12bb18e8b978 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -25,14 +25,14 @@ struct poll_table_struct; -/* +/* * structures and helpers for f_op->poll implementations */ typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *); /* - * Do not touch the structure directly, use the access functions - * poll_does_not_wait() and poll_requested_events() instead. + * Do not touch the structure directly, use the access function + * poll_requested_events() instead. */ typedef struct poll_table_struct { poll_queue_proc _qproc; @@ -53,16 +53,6 @@ static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_addres } } -/* - * Return true if it is guaranteed that poll will not wait. This is the case - * if the poll() of another file descriptor in the set got an event, so there - * is no need for waiting. - */ -static inline bool poll_does_not_wait(const poll_table *p) -{ - return p == NULL || p->_qproc == NULL; -} - /* * Return the set of events that the application wants to poll for. * This is useful for drivers that need to know whether a DMA transfer has -- cgit v1.2.3