From 57df858a46f0a4cc104716e0ec88864e5c386ca4 Mon Sep 17 00:00:00 2001 From: Jassi Brar Date: Mon, 9 Feb 2026 17:36:19 -0600 Subject: mailbox: add API to query available TX queue slots Clients sometimes need to know whether the mailbox TX queue has room before posting a new message. Rather than exposing internal queue state through a struct field, provide a proper accessor function that returns the number of available slots for a given channel. This lets clients choose to back off when the queue is full instead of hitting the -ENOBUFS error path and the misleading "Try increasing MBOX_TX_QUEUE_LEN" warning. Tested-by: Tanmay Shah Signed-off-by: Jassi Brar --- include/linux/mailbox_client.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mailbox_client.h b/include/linux/mailbox_client.h index c6eea9afb943..e5997120f45c 100644 --- a/include/linux/mailbox_client.h +++ b/include/linux/mailbox_client.h @@ -45,6 +45,7 @@ int mbox_send_message(struct mbox_chan *chan, void *mssg); int mbox_flush(struct mbox_chan *chan, unsigned long timeout); void mbox_client_txdone(struct mbox_chan *chan, int r); /* atomic */ bool mbox_client_peek_data(struct mbox_chan *chan); /* atomic */ +unsigned int mbox_chan_tx_slots_available(struct mbox_chan *chan); /* atomic */ void mbox_free_channel(struct mbox_chan *chan); /* may sleep */ #endif /* __MAILBOX_CLIENT_H */ -- cgit v1.2.3 From 89e5d7d616009e5fada5da081b1d79cdd59150ab Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 27 Mar 2026 16:10:21 +0100 Subject: mailbox: remove superfluous internal header Quite some controller drivers use the defines from the internal header already. This prevents controller drivers outside the mailbox directory. Move the defines to the public controller header to allow this again as the defines are not strictly internal anyhow. Signed-off-by: Wolfram Sang Reviewed-by: Sudeep Holla Reviewed-by: Daniel Baluta Signed-off-by: Jassi Brar --- drivers/mailbox/cix-mailbox.c | 2 -- drivers/mailbox/hi3660-mailbox.c | 2 -- drivers/mailbox/imx-mailbox.c | 2 -- drivers/mailbox/mailbox-sti.c | 2 -- drivers/mailbox/mailbox.c | 2 -- drivers/mailbox/mailbox.h | 12 ------------ drivers/mailbox/omap-mailbox.c | 2 -- drivers/mailbox/pcc.c | 2 -- drivers/mailbox/tegra-hsp.c | 2 -- include/linux/mailbox_controller.h | 5 +++++ 10 files changed, 5 insertions(+), 28 deletions(-) delete mode 100644 drivers/mailbox/mailbox.h (limited to 'include') diff --git a/drivers/mailbox/cix-mailbox.c b/drivers/mailbox/cix-mailbox.c index 443620e8ae37..864f98f21fc3 100644 --- a/drivers/mailbox/cix-mailbox.c +++ b/drivers/mailbox/cix-mailbox.c @@ -12,8 +12,6 @@ #include #include -#include "mailbox.h" - /* * The maximum transmission size is 32 words or 128 bytes. */ diff --git a/drivers/mailbox/hi3660-mailbox.c b/drivers/mailbox/hi3660-mailbox.c index 17c29e960fbf..9b727a2b54a5 100644 --- a/drivers/mailbox/hi3660-mailbox.c +++ b/drivers/mailbox/hi3660-mailbox.c @@ -15,8 +15,6 @@ #include #include -#include "mailbox.h" - #define MBOX_CHAN_MAX 32 #define MBOX_RX 0x0 diff --git a/drivers/mailbox/imx-mailbox.c b/drivers/mailbox/imx-mailbox.c index 003f9236c35e..22331b579489 100644 --- a/drivers/mailbox/imx-mailbox.c +++ b/drivers/mailbox/imx-mailbox.c @@ -23,8 +23,6 @@ #include #include -#include "mailbox.h" - #define IMX_MU_CHANS 24 /* TX0/RX0/RXDB[0-3] */ #define IMX_MU_SCU_CHANS 6 diff --git a/drivers/mailbox/mailbox-sti.c b/drivers/mailbox/mailbox-sti.c index b4b5bdd503cf..b6c9ecbbc8ec 100644 --- a/drivers/mailbox/mailbox-sti.c +++ b/drivers/mailbox/mailbox-sti.c @@ -21,8 +21,6 @@ #include #include -#include "mailbox.h" - #define STI_MBOX_INST_MAX 4 /* RAM saving: Max supported instances */ #define STI_MBOX_CHAN_MAX 20 /* RAM saving: Max supported channels */ diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c index 03473ae41ed1..13de3d047853 100644 --- a/drivers/mailbox/mailbox.c +++ b/drivers/mailbox/mailbox.c @@ -18,8 +18,6 @@ #include #include -#include "mailbox.h" - static LIST_HEAD(mbox_cons); static DEFINE_MUTEX(con_mutex); diff --git a/drivers/mailbox/mailbox.h b/drivers/mailbox/mailbox.h deleted file mode 100644 index e1ec4efab693..000000000000 --- a/drivers/mailbox/mailbox.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ - -#ifndef __MAILBOX_H -#define __MAILBOX_H - -#include - -#define TXDONE_BY_IRQ BIT(0) /* controller has remote RTR irq */ -#define TXDONE_BY_POLL BIT(1) /* controller can read status of last TX */ -#define TXDONE_BY_ACK BIT(2) /* S/W ACK received by Client ticks the TX */ - -#endif /* __MAILBOX_H */ diff --git a/drivers/mailbox/omap-mailbox.c b/drivers/mailbox/omap-mailbox.c index d9f100c18895..5772c6b9886a 100644 --- a/drivers/mailbox/omap-mailbox.c +++ b/drivers/mailbox/omap-mailbox.c @@ -22,8 +22,6 @@ #include #include -#include "mailbox.h" - #define MAILBOX_REVISION 0x000 #define MAILBOX_MESSAGE(m) (0x040 + 4 * (m)) #define MAILBOX_FIFOSTATUS(m) (0x080 + 4 * (m)) diff --git a/drivers/mailbox/pcc.c b/drivers/mailbox/pcc.c index 22e70af1ae5d..636879ae1db7 100644 --- a/drivers/mailbox/pcc.c +++ b/drivers/mailbox/pcc.c @@ -59,8 +59,6 @@ #include #include -#include "mailbox.h" - #define MBOX_IRQ_NAME "pcc-mbox" /** diff --git a/drivers/mailbox/tegra-hsp.c b/drivers/mailbox/tegra-hsp.c index ed9a0bb2bcd8..2231050bb5a9 100644 --- a/drivers/mailbox/tegra-hsp.c +++ b/drivers/mailbox/tegra-hsp.c @@ -16,8 +16,6 @@ #include -#include "mailbox.h" - #define HSP_INT_IE(x) (0x100 + ((x) * 4)) #define HSP_INT_IV 0x300 #define HSP_INT_IR 0x304 diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index 80a427c7ca29..16fef421c30c 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -3,6 +3,7 @@ #ifndef __MAILBOX_CONTROLLER_H #define __MAILBOX_CONTROLLER_H +#include #include #include #include @@ -11,6 +12,10 @@ struct mbox_chan; +#define TXDONE_BY_IRQ BIT(0) /* controller has remote RTR irq */ +#define TXDONE_BY_POLL BIT(1) /* controller can read status of last TX */ +#define TXDONE_BY_ACK BIT(2) /* S/W ACK received by Client ticks the TX */ + /** * struct mbox_chan_ops - methods to control mailbox channels * @send_data: The API asks the MBOX controller driver, in atomic -- cgit v1.2.3 From c58e9456e30c7098cbcd9f04571992be8a2e4e63 Mon Sep 17 00:00:00 2001 From: Jassi Brar Date: Fri, 27 Mar 2026 17:00:40 -0500 Subject: mailbox: Fix NULL message support in mbox_send_message() The active_req field serves double duty as both the "is a TX in flight" flag (NULL means idle) and the storage for the in-flight message pointer. When a client sends NULL via mbox_send_message(), active_req is set to NULL, which the framework misinterprets as "no active request". This breaks the TX state machine by: - tx_tick() short-circuits on (!mssg), skipping the tx_done callback and the tx_complete completion - txdone_hrtimer() skips the channel entirely since active_req is NULL, so poll-based TX-done detection never fires. Fix this by introducing a MBOX_NO_MSG sentinel value that means "no active request," freeing NULL to be valid message data. The sentinel is defined in the subsystem-internal mailbox.h so that controller drivers within drivers/mailbox/ can reference it, but it is not exposed to clients outside the subsystem. Fifteen in-tree callers send NULL (doorbell-style IPCs on Qualcomm, Tegra, TI, Xilinx, i.MX, SCMI, and PCC platforms). All were audited for regression: - Most already work around the bug via knows_txdone=true with a manual mbox_client_txdone() call, making the framework's tracking irrelevant. These are unaffected. - Poll-based callers (Xilinx zynqmp/r5) are strictly better off: the poll timer now correctly detects NULL-active channels instead of silently skipping them. - irq-qcom-mpm.c was a pre-existing bug -- the only Qualcomm caller that omitted the knows_txdone + mbox_client_txdone() pattern. Fixed in a companion commit ("irqchip/qcom-mpm: Fix missing mailbox TX done acknowledgment"). - No caller sets both a tx_done callback and sends NULL, nor combines tx_block=true with NULL sends, so the newly reachable callback/completion paths are never exercised. Also update tegra-hsp's flush callback, which directly inspects active_req to wait for the channel to drain: the old "!= NULL" check becomes "!= MBOX_NO_MSG", otherwise flush spins until timeout since the sentinel is non-NULL. The only tradeoff is that 'MBOX_NO_MSG' can not be used as a message by clients. Reported-by: Joonwon Kang Reviewed-by: Douglas Anderson Signed-off-by: Jassi Brar --- drivers/mailbox/mailbox.c | 15 ++++++++------- drivers/mailbox/tegra-hsp.c | 2 +- include/linux/mailbox_controller.h | 3 +++ 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c index 13de3d047853..138ffbcd4fde 100644 --- a/drivers/mailbox/mailbox.c +++ b/drivers/mailbox/mailbox.c @@ -50,7 +50,7 @@ static void msg_submit(struct mbox_chan *chan) int err = -EBUSY; scoped_guard(spinlock_irqsave, &chan->lock) { - if (!chan->msg_count || chan->active_req) + if (!chan->msg_count || chan->active_req != MBOX_NO_MSG) break; count = chan->msg_count; @@ -85,13 +85,13 @@ static void tx_tick(struct mbox_chan *chan, int r) scoped_guard(spinlock_irqsave, &chan->lock) { mssg = chan->active_req; - chan->active_req = NULL; + chan->active_req = MBOX_NO_MSG; } /* Submit next message */ msg_submit(chan); - if (!mssg) + if (mssg == MBOX_NO_MSG) return; /* Notify the client */ @@ -112,7 +112,7 @@ static enum hrtimer_restart txdone_hrtimer(struct hrtimer *hrtimer) for (i = 0; i < mbox->num_chans; i++) { struct mbox_chan *chan = &mbox->chans[i]; - if (chan->active_req && chan->cl) { + if (chan->active_req != MBOX_NO_MSG && chan->cl) { txdone = chan->mbox->ops->last_tx_done(chan); if (txdone) tx_tick(chan, 0); @@ -267,7 +267,7 @@ int mbox_send_message(struct mbox_chan *chan, void *mssg) { int t; - if (!chan || !chan->cl) + if (!chan || !chan->cl || mssg == MBOX_NO_MSG) return -EINVAL; t = add_to_rbuf(chan, mssg); @@ -340,7 +340,7 @@ static int __mbox_bind_client(struct mbox_chan *chan, struct mbox_client *cl) scoped_guard(spinlock_irqsave, &chan->lock) { chan->msg_free = 0; chan->msg_count = 0; - chan->active_req = NULL; + chan->active_req = MBOX_NO_MSG; chan->cl = cl; init_completion(&chan->tx_complete); @@ -498,7 +498,7 @@ void mbox_free_channel(struct mbox_chan *chan) /* The queued TX requests are simply aborted, no callbacks are made */ scoped_guard(spinlock_irqsave, &chan->lock) { chan->cl = NULL; - chan->active_req = NULL; + chan->active_req = MBOX_NO_MSG; if (chan->txdone_method == TXDONE_BY_ACK) chan->txdone_method = TXDONE_BY_POLL; } @@ -553,6 +553,7 @@ int mbox_controller_register(struct mbox_controller *mbox) chan->cl = NULL; chan->mbox = mbox; + chan->active_req = MBOX_NO_MSG; chan->txdone_method = txdone; spin_lock_init(&chan->lock); } diff --git a/drivers/mailbox/tegra-hsp.c b/drivers/mailbox/tegra-hsp.c index 2231050bb5a9..7b1e1b83ea29 100644 --- a/drivers/mailbox/tegra-hsp.c +++ b/drivers/mailbox/tegra-hsp.c @@ -495,7 +495,7 @@ static int tegra_hsp_mailbox_flush(struct mbox_chan *chan, mbox_chan_txdone(chan, 0); /* Wait until channel is empty */ - if (chan->active_req != NULL) + if (chan->active_req != MBOX_NO_MSG) continue; return 0; diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index 16fef421c30c..e3896b08f22e 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -12,6 +12,9 @@ struct mbox_chan; +/* Sentinel value distinguishing "no active request" from "NULL message data" */ +#define MBOX_NO_MSG ((void *)-1) + #define TXDONE_BY_IRQ BIT(0) /* controller has remote RTR irq */ #define TXDONE_BY_POLL BIT(1) /* controller can read status of last TX */ #define TXDONE_BY_ACK BIT(2) /* S/W ACK received by Client ticks the TX */ -- cgit v1.2.3 From 0bd75b7abafb3ed199df830c539c57ef9b62c2a2 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 10 Apr 2026 14:49:12 +0200 Subject: mailbox: prefix new constants with MBOX_ Commit 89e5d7d61600 ("mailbox: remove superfluous internal header") moved some constants to a public header but forgot to add a mailbox specific prefix. Add this now to prevent future collisions on a too generic naming. Link: https://sashiko.dev/#/patchset/20260327151112.5202-2-wsa%2Brenesas%40sang-engineering.com Signed-off-by: Wolfram Sang Reviewed-by: Sudeep Holla Signed-off-by: Jassi Brar --- drivers/mailbox/cix-mailbox.c | 2 +- drivers/mailbox/imx-mailbox.c | 2 +- drivers/mailbox/mailbox.c | 22 +++++++++++----------- drivers/mailbox/mtk-cmdq-mailbox.c | 2 +- drivers/mailbox/omap-mailbox.c | 2 +- drivers/mailbox/tegra-hsp.c | 2 +- include/linux/mailbox_controller.h | 6 +++--- 7 files changed, 19 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/drivers/mailbox/cix-mailbox.c b/drivers/mailbox/cix-mailbox.c index 8cfaa91b75bd..43c76cdab24a 100644 --- a/drivers/mailbox/cix-mailbox.c +++ b/drivers/mailbox/cix-mailbox.c @@ -413,7 +413,7 @@ static int cix_mbox_startup(struct mbox_chan *chan) switch (cp->type) { case CIX_MBOX_TYPE_DB: /* Overwrite txdone_method for DB channel */ - chan->txdone_method = TXDONE_BY_ACK; + chan->txdone_method = MBOX_TXDONE_BY_ACK; fallthrough; case CIX_MBOX_TYPE_REG: if (priv->dir == CIX_MBOX_TX) { diff --git a/drivers/mailbox/imx-mailbox.c b/drivers/mailbox/imx-mailbox.c index 22331b579489..246a9a9e3952 100644 --- a/drivers/mailbox/imx-mailbox.c +++ b/drivers/mailbox/imx-mailbox.c @@ -732,7 +732,7 @@ static struct mbox_chan * imx_mu_xlate(struct mbox_controller *mbox, p_chan = &mbox->chans[chan]; if (type == IMX_MU_TYPE_TXDB_V2) - p_chan->txdone_method = TXDONE_BY_ACK; + p_chan->txdone_method = MBOX_TXDONE_BY_ACK; return p_chan; } diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c index 138ffbcd4fde..30eafdf3a91e 100644 --- a/drivers/mailbox/mailbox.c +++ b/drivers/mailbox/mailbox.c @@ -72,7 +72,7 @@ static void msg_submit(struct mbox_chan *chan) } } - if (!err && (chan->txdone_method & TXDONE_BY_POLL)) { + if (!err && (chan->txdone_method & MBOX_TXDONE_BY_POLL)) { /* kick start the timer immediately to avoid delays */ scoped_guard(spinlock_irqsave, &chan->mbox->poll_hrt_lock) hrtimer_start(&chan->mbox->poll_hrt, 0, HRTIMER_MODE_REL); @@ -162,7 +162,7 @@ EXPORT_SYMBOL_GPL(mbox_chan_received_data); */ void mbox_chan_txdone(struct mbox_chan *chan, int r) { - if (unlikely(!(chan->txdone_method & TXDONE_BY_IRQ))) { + if (unlikely(!(chan->txdone_method & MBOX_TXDONE_BY_IRQ))) { dev_err(chan->mbox->dev, "Controller can't run the TX ticker\n"); return; @@ -183,7 +183,7 @@ EXPORT_SYMBOL_GPL(mbox_chan_txdone); */ void mbox_client_txdone(struct mbox_chan *chan, int r) { - if (unlikely(!(chan->txdone_method & TXDONE_BY_ACK))) { + if (unlikely(!(chan->txdone_method & MBOX_TXDONE_BY_ACK))) { dev_err(chan->mbox->dev, "Client can't run the TX ticker\n"); return; } @@ -344,8 +344,8 @@ static int __mbox_bind_client(struct mbox_chan *chan, struct mbox_client *cl) chan->cl = cl; init_completion(&chan->tx_complete); - if (chan->txdone_method == TXDONE_BY_POLL && cl->knows_txdone) - chan->txdone_method = TXDONE_BY_ACK; + if (chan->txdone_method == MBOX_TXDONE_BY_POLL && cl->knows_txdone) + chan->txdone_method = MBOX_TXDONE_BY_ACK; } if (chan->mbox->ops->startup) { @@ -499,8 +499,8 @@ void mbox_free_channel(struct mbox_chan *chan) scoped_guard(spinlock_irqsave, &chan->lock) { chan->cl = NULL; chan->active_req = MBOX_NO_MSG; - if (chan->txdone_method == TXDONE_BY_ACK) - chan->txdone_method = TXDONE_BY_POLL; + if (chan->txdone_method == MBOX_TXDONE_BY_ACK) + chan->txdone_method = MBOX_TXDONE_BY_POLL; } module_put(chan->mbox->dev->driver->owner); @@ -531,13 +531,13 @@ int mbox_controller_register(struct mbox_controller *mbox) return -EINVAL; if (mbox->txdone_irq) - txdone = TXDONE_BY_IRQ; + txdone = MBOX_TXDONE_BY_IRQ; else if (mbox->txdone_poll) - txdone = TXDONE_BY_POLL; + txdone = MBOX_TXDONE_BY_POLL; else /* It has to be ACK then */ - txdone = TXDONE_BY_ACK; + txdone = MBOX_TXDONE_BY_ACK; - if (txdone == TXDONE_BY_POLL) { + if (txdone == MBOX_TXDONE_BY_POLL) { if (!mbox->ops->last_tx_done) { dev_err(mbox->dev, "last_tx_done method is absent\n"); diff --git a/drivers/mailbox/mtk-cmdq-mailbox.c b/drivers/mailbox/mtk-cmdq-mailbox.c index 547a10a8fad3..e523c84b4808 100644 --- a/drivers/mailbox/mtk-cmdq-mailbox.c +++ b/drivers/mailbox/mtk-cmdq-mailbox.c @@ -728,7 +728,7 @@ static int cmdq_probe(struct platform_device *pdev) cmdq->mbox.ops = &cmdq_mbox_chan_ops; cmdq->mbox.of_xlate = cmdq_xlate; - /* make use of TXDONE_BY_ACK */ + /* make use of MBOX_TXDONE_BY_ACK */ cmdq->mbox.txdone_irq = false; cmdq->mbox.txdone_poll = false; diff --git a/drivers/mailbox/omap-mailbox.c b/drivers/mailbox/omap-mailbox.c index 5772c6b9886a..535ca8020877 100644 --- a/drivers/mailbox/omap-mailbox.c +++ b/drivers/mailbox/omap-mailbox.c @@ -238,7 +238,7 @@ static int omap_mbox_startup(struct omap_mbox *mbox) } if (mbox->send_no_irq) - mbox->chan->txdone_method = TXDONE_BY_ACK; + mbox->chan->txdone_method = MBOX_TXDONE_BY_ACK; omap_mbox_enable_irq(mbox, IRQ_RX); diff --git a/drivers/mailbox/tegra-hsp.c b/drivers/mailbox/tegra-hsp.c index 7b1e1b83ea29..500fa77c7d53 100644 --- a/drivers/mailbox/tegra-hsp.c +++ b/drivers/mailbox/tegra-hsp.c @@ -514,7 +514,7 @@ static int tegra_hsp_mailbox_startup(struct mbox_chan *chan) struct tegra_hsp *hsp = mb->channel.hsp; unsigned long flags; - chan->txdone_method = TXDONE_BY_IRQ; + chan->txdone_method = MBOX_TXDONE_BY_IRQ; /* * Shared mailboxes start out as consumers by default. FULL and EMPTY diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index e3896b08f22e..a49ee687d4cf 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -15,9 +15,9 @@ struct mbox_chan; /* Sentinel value distinguishing "no active request" from "NULL message data" */ #define MBOX_NO_MSG ((void *)-1) -#define TXDONE_BY_IRQ BIT(0) /* controller has remote RTR irq */ -#define TXDONE_BY_POLL BIT(1) /* controller can read status of last TX */ -#define TXDONE_BY_ACK BIT(2) /* S/W ACK received by Client ticks the TX */ +#define MBOX_TXDONE_BY_IRQ BIT(0) /* controller has remote RTR irq */ +#define MBOX_TXDONE_BY_POLL BIT(1) /* controller can read status of last TX */ +#define MBOX_TXDONE_BY_ACK BIT(2) /* S/W ACK received by Client ticks the TX */ /** * struct mbox_chan_ops - methods to control mailbox channels -- cgit v1.2.3 From 7746e3bd4cc19b5092e00d32d676e329bfcb6900 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 10 Apr 2026 16:49:47 +0200 Subject: fanotify: fix false positive on permission events fsnotify_get_mark_safe() may return false for a mark on an unrelated group, which results in bypassing the permission check. Fix by skipping over detached marks that are not in the current group. CC: stable@vger.kernel.org Fixes: abc77577a669 ("fsnotify: Provide framework for dropping SRCU lock in ->handle_event") Signed-off-by: Miklos Szeredi Link: https://patch.msgid.link/20260410144950.156160-1-mszeredi@redhat.com Signed-off-by: Jan Kara --- fs/notify/fsnotify.c | 2 +- fs/notify/mark.c | 18 +++++++++++------- include/linux/fsnotify_backend.h | 1 + 3 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 9995de1710e5..b646a861a84c 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -388,7 +388,7 @@ static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector return hlist_entry_safe(node, struct fsnotify_mark, obj_list); } -static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark) +struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark) { struct hlist_node *node = NULL; diff --git a/fs/notify/mark.c b/fs/notify/mark.c index c2ed5b11b0fe..622f05977f86 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -457,9 +457,6 @@ EXPORT_SYMBOL_GPL(fsnotify_put_mark); */ static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark) { - if (!mark) - return true; - if (refcount_inc_not_zero(&mark->refcnt)) { spin_lock(&mark->lock); if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) { @@ -500,15 +497,22 @@ bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) int type; fsnotify_foreach_iter_type(type) { + struct fsnotify_mark *mark = iter_info->marks[type]; + /* This can fail if mark is being removed */ - if (!fsnotify_get_mark_safe(iter_info->marks[type])) { - __release(&fsnotify_mark_srcu); - goto fail; + while (mark && !fsnotify_get_mark_safe(mark)) { + if (mark->group == iter_info->current_group) { + __release(&fsnotify_mark_srcu); + goto fail; + } + /* This is a mark in an unrelated group, skip */ + mark = fsnotify_next_mark(mark); + iter_info->marks[type] = mark; } } /* - * Now that both marks are pinned by refcount in the inode / vfsmount + * Now that all marks are pinned by refcount in the inode / vfsmount / etc * lists, we can drop SRCU lock, and safely resume the list iteration * once userspace returns. */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 95985400d3d8..e5cde39d6e85 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -915,6 +915,7 @@ extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int obj_type); extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); +struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark); extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info); extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info); -- cgit v1.2.3 From a068c4d42c035c63b26ff91c394e6dc2cb7dc5d0 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 13 Apr 2026 12:42:39 +0200 Subject: mailbox: update kdoc for struct mbox_controller Add field for missing lock around the hrtimer. Add 'Required' where the core checks for valid entries. Signed-off-by: Wolfram Sang Reviewed-by: Geert Uytterhoeven Signed-off-by: Jassi Brar --- include/linux/mailbox_controller.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h index a49ee687d4cf..dc93287a2a01 100644 --- a/include/linux/mailbox_controller.h +++ b/include/linux/mailbox_controller.h @@ -62,10 +62,10 @@ struct mbox_chan_ops { /** * struct mbox_controller - Controller of a class of communication channels - * @dev: Device backing this controller - * @ops: Operators that work on each communication chan - * @chans: Array of channels - * @num_chans: Number of channels in the 'chans' array. + * @dev: Device backing this controller. Required. + * @ops: Operators that work on each communication chan. Required. + * @chans: Array of channels. Required. + * @num_chans: Number of channels in the 'chans' array. Required. * @txdone_irq: Indicates if the controller can report to API when * the last transmitted data was read by the remote. * Eg, if it has some TX ACK irq. @@ -78,6 +78,7 @@ struct mbox_chan_ops { * @of_xlate: Controller driver specific mapping of channel via DT * @poll_hrt: API private. hrtimer used to poll for TXDONE on all * channels. + * @poll_hrt_lock: API private. Lock protecting access to poll_hrt. * @node: API private. To hook into list of controllers. */ struct mbox_controller { -- cgit v1.2.3 From 73bd1227787bfe73eea3d04c63a89cb55db9c23e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 18 Apr 2026 09:41:21 +0800 Subject: rhashtable: Restore insecure_elasticity toggle Some users of rhashtable cannot handle insertion failures, and are happy to accept the consequences of a hash table that having very long chains. Restore the insecure_elasticity toggle for these users. In addition to disabling the chain length checks, this also removes the emergency resize that would otherwise occur when the hash table occupancy hits 100% (an async resize is still scheduled at 75%). Signed-off-by: Herbert Xu Signed-off-by: Tejun Heo --- include/linux/rhashtable-types.h | 2 ++ include/linux/rhashtable.h | 5 +++-- lib/rhashtable.c | 5 +++-- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h index 015c8298bebc..72082428d6c6 100644 --- a/include/linux/rhashtable-types.h +++ b/include/linux/rhashtable-types.h @@ -49,6 +49,7 @@ typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, * @head_offset: Offset of rhash_head in struct to be hashed * @max_size: Maximum size while expanding * @min_size: Minimum size while shrinking + * @insecure_elasticity: Set to true to disable chain length checks * @automatic_shrinking: Enable automatic shrinking of tables * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) * @obj_hashfn: Function to hash object @@ -61,6 +62,7 @@ struct rhashtable_params { u16 head_offset; unsigned int max_size; u16 min_size; + bool insecure_elasticity; bool automatic_shrinking; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 0480509a6339..7def3f0f556b 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -821,14 +821,15 @@ slow_path: goto out; } - if (elasticity <= 0) + if (elasticity <= 0 && !params.insecure_elasticity) goto slow_path; data = ERR_PTR(-E2BIG); if (unlikely(rht_grow_above_max(ht, tbl))) goto out_unlock; - if (unlikely(rht_grow_above_100(ht, tbl))) + if (unlikely(rht_grow_above_100(ht, tbl)) && + !params.insecure_elasticity) goto slow_path; /* Inserting at head of list makes unlocking free. */ diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 6074ed5f66f3..fb2b7bc137ba 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -538,7 +538,7 @@ static void *rhashtable_lookup_one(struct rhashtable *ht, return NULL; } - if (elasticity <= 0) + if (elasticity <= 0 && !ht->p.insecure_elasticity) return ERR_PTR(-EAGAIN); return ERR_PTR(-ENOENT); @@ -568,7 +568,8 @@ static struct bucket_table *rhashtable_insert_one( if (unlikely(rht_grow_above_max(ht, tbl))) return ERR_PTR(-E2BIG); - if (unlikely(rht_grow_above_100(ht, tbl))) + if (unlikely(rht_grow_above_100(ht, tbl)) && + !ht->p.insecure_elasticity) return ERR_PTR(-EAGAIN); head = rht_ptr(bkt, tbl, hash); -- cgit v1.2.3 From 4fe985292709eeb6a4653c71660f893e26c2f2dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 20 Apr 2026 20:03:26 -1000 Subject: rhashtable: Bounce deferred worker kick through irq_work Inserts past 75% load call schedule_work(&ht->run_work) to kick an async resize. If a caller holds a raw spinlock (e.g. an insecure_elasticity user), schedule_work() under that lock records caller_lock -> pool->lock -> pi_lock -> rq->__lock A cycle forms if any of these locks is acquired in the reverse direction elsewhere. sched_ext, the only current insecure_elasticity user, hits this: it holds scx_sched_lock across rhashtable inserts of sub-schedulers, while scx_bypass() takes rq->__lock -> scx_sched_lock. Exercising the resize path produces: Chain exists of: &pool->lock --> &rq->__lock --> scx_sched_lock Bounce the kick from the insert paths through irq_work so schedule_work() runs from hard IRQ context with the caller's lock no longer held. rht_deferred_worker()'s self-rearm on error stays on schedule_work(&ht->run_work) - the worker runs in process context with no caller lock held, and keeping the self-requeue on @run_work lets cancel_work_sync() in rhashtable_free_and_destroy() drain it. v3: Keep rht_deferred_worker()'s self-rearm on schedule_work(&run_work). Routing it through irq_work in v2 broke cancel_work_sync()'s self-requeue handling - an irq_work queued after irq_work_sync() returned but while cancel_work_sync() was still waiting could fire post-teardown. v2: Bounce unconditionally instead of gating on insecure_elasticity, as suggested by Herbert. Signed-off-by: Tejun Heo Acked-by: Herbert Xu --- include/linux/rhashtable-types.h | 3 +++ include/linux/rhashtable.h | 3 ++- lib/rhashtable.c | 31 ++++++++++++++++++++++++++++--- 3 files changed, 33 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h index 72082428d6c6..fc2f596a6df1 100644 --- a/include/linux/rhashtable-types.h +++ b/include/linux/rhashtable-types.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -77,6 +78,7 @@ struct rhashtable_params { * @p: Configuration parameters * @rhlist: True if this is an rhltable * @run_work: Deferred worker to expand/shrink asynchronously + * @run_irq_work: Bounces the @run_work kick through hard IRQ context. * @mutex: Mutex to protect current/future table swapping * @lock: Spin lock to protect walker list * @nelems: Number of elements in table @@ -88,6 +90,7 @@ struct rhashtable { struct rhashtable_params p; bool rhlist; struct work_struct run_work; + struct irq_work run_irq_work; struct mutex mutex; spinlock_t lock; atomic_t nelems; diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 7def3f0f556b..ef5230cece36 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -847,7 +848,7 @@ slow_path: rht_assign_unlock(tbl, bkt, obj, flags); if (rht_grow_above_75(ht, tbl)) - schedule_work(&ht->run_work); + irq_work_queue(&ht->run_irq_work); data = NULL; out: diff --git a/lib/rhashtable.c b/lib/rhashtable.c index fb2b7bc137ba..7a67ef5b67b6 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -441,10 +441,33 @@ static void rht_deferred_worker(struct work_struct *work) mutex_unlock(&ht->mutex); + /* + * Re-arm via @run_work, not @run_irq_work. + * rhashtable_free_and_destroy() drains async work as irq_work_sync() + * followed by cancel_work_sync(). If this site queued irq_work while + * cancel_work_sync() was waiting for us, irq_work_sync() would already + * have returned and the stale irq_work could fire post-teardown. + * cancel_work_sync() natively handles self-requeue on @run_work. + */ if (err) schedule_work(&ht->run_work); } +/* + * Insert-path callers can run under a raw spinlock (e.g. an insecure_elasticity + * user). Calling schedule_work() under that lock records caller_lock -> + * pool->lock -> pi_lock -> rq->__lock, closing a locking cycle if any of + * these is acquired in the reverse direction elsewhere. Bounce through + * irq_work so the schedule_work() runs with the caller's lock no longer held. + */ +static void rht_deferred_irq_work(struct irq_work *irq_work) +{ + struct rhashtable *ht = container_of(irq_work, struct rhashtable, + run_irq_work); + + schedule_work(&ht->run_work); +} + static int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl) { @@ -477,7 +500,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht, if (err == -EEXIST) err = 0; } else - schedule_work(&ht->run_work); + irq_work_queue(&ht->run_irq_work); return err; @@ -488,7 +511,7 @@ fail: /* Schedule async rehash to retry allocation in process context. */ if (err == -ENOMEM) - schedule_work(&ht->run_work); + irq_work_queue(&ht->run_irq_work); return err; } @@ -630,7 +653,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key, rht_unlock(tbl, bkt, flags); if (inserted && rht_grow_above_75(ht, tbl)) - schedule_work(&ht->run_work); + irq_work_queue(&ht->run_irq_work); } } while (!IS_ERR_OR_NULL(new_tbl)); @@ -1085,6 +1108,7 @@ int rhashtable_init_noprof(struct rhashtable *ht, RCU_INIT_POINTER(ht->tbl, tbl); INIT_WORK(&ht->run_work, rht_deferred_worker); + init_irq_work(&ht->run_irq_work, rht_deferred_irq_work); return 0; } @@ -1150,6 +1174,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, struct bucket_table *tbl, *next_tbl; unsigned int i; + irq_work_sync(&ht->run_irq_work); cancel_work_sync(&ht->run_work); mutex_lock(&ht->mutex); -- cgit v1.2.3 From f902877b635551513729bdf9a8d1422c4aab7741 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 15 Apr 2026 17:56:02 +0200 Subject: rculist: add list_splice_rcu() for private lists This patch adds a helper function, list_splice_rcu(), to safely splice a private (non-RCU-protected) list into an RCU-protected list. The function ensures that only the pointer visible to RCU readers (prev->next) is updated using rcu_assign_pointer(), while the rest of the list manipulations are performed with regular assignments, as the source list is private and not visible to concurrent RCU readers. This is useful for moving elements from a private list into a global RCU-protected list, ensuring safe publication for RCU readers. Subsystems with some sort of batching mechanism from userspace can benefit from this new function. The function __list_splice_rcu() has been added for clarity and to follow the same pattern as in the existing list_splice*() interfaces, where there is a check to ensure that the list to splice is not empty. Note that __list_splice_rcu() has no documentation for this reason. Reviewed-by: Paul E. McKenney Signed-off-by: Pablo Neira Ayuso --- include/linux/rculist.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 2abba7552605..e3bc44225692 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -261,6 +261,35 @@ static inline void list_replace_rcu(struct list_head *old, old->prev = LIST_POISON2; } +static inline void __list_splice_rcu(struct list_head *list, + struct list_head *prev, + struct list_head *next) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + + last->next = next; + first->prev = prev; + next->prev = last; + rcu_assign_pointer(list_next_rcu(prev), first); +} + +/** + * list_splice_rcu - splice a non-RCU list into an RCU-protected list, + * designed for stacks. + * @list: the non RCU-protected list to splice + * @head: the place in the existing RCU-protected list to splice + * + * The list pointed to by @head can be RCU-read traversed concurrently with + * this function. + */ +static inline void list_splice_rcu(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice_rcu(list, head, head->next); +} + /** * __list_splice_init_rcu - join an RCU-protected list into an existing list. * @list: the RCU-protected list to splice -- cgit v1.2.3 From 10f79dbd7719d1da9f5884d13060322d8729f091 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 15 Apr 2026 22:58:23 +0200 Subject: netfilter: nf_tables: add hook transactions for device deletions Restore the flag that indicates that the hook is going away, ie. NFT_HOOK_REMOVE, but add a new transaction object to track deletion of hooks without altering the basechain/flowtable hook_list during the preparation phase. The existing approach that moves the hook from the basechain/flowtable hook_list to transaction hook_list breaks netlink dump path readers of this RCU-protected list. It should be possible use an array for nft_trans_hook to store the deleted hooks to compact the representation but I am not expecting many hook object, specially now that wildcard support for devices is in place. Note that the nft_trans_chain_hooks() list contains a list of struct nft_trans_hook objects for DELCHAIN and DELFLOWTABLE commands, while this list stores struct nft_hook objects for NEWCHAIN and NEWFLOWTABLE. Note that new commands can be updated to use nft_trans_hook for consistency. This patch also adapts the event notification path to deal with the list of hook transactions. Fixes: 7d937b107108 ("netfilter: nf_tables: support for deleting devices in an existing netdev chain") Fixes: b6d9014a3335 ("netfilter: nf_tables: delete flowtable hooks via transaction list") Reported-by: Xiang Mei Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 13 ++ net/netfilter/nf_tables_api.c | 264 +++++++++++++++++++++++++++++--------- 2 files changed, 217 insertions(+), 60 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2c0173d9309c..cff7b773e972 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1204,12 +1204,15 @@ struct nft_stats { struct u64_stats_sync syncp; }; +#define NFT_HOOK_REMOVE (1 << 0) + struct nft_hook { struct list_head list; struct list_head ops_list; struct rcu_head rcu; char ifname[IFNAMSIZ]; u8 ifnamelen; + u8 flags; }; struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, @@ -1664,6 +1667,16 @@ struct nft_trans { u8 put_net:1; }; +/** + * struct nft_trans_hook - nf_tables hook update in transaction + * @list: used internally + * @hook: struct nft_hook with the device hook + */ +struct nft_trans_hook { + struct list_head list; + struct nft_hook *hook; +}; + /** * struct nft_trans_binding - nf_tables object with binding support in transaction * @nft_trans: base structure, MUST be first member diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index ae10116af923..d20ce5c36d31 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -380,6 +380,32 @@ static void nft_netdev_hook_unlink_free_rcu(struct nft_hook *hook) nft_netdev_hook_free_rcu(hook); } +static void nft_trans_hook_destroy(struct nft_trans_hook *trans_hook) +{ + list_del(&trans_hook->list); + kfree(trans_hook); +} + +static void nft_netdev_unregister_trans_hook(struct net *net, + const struct nft_table *table, + struct list_head *hook_list) +{ + struct nft_trans_hook *trans_hook, *next; + struct nf_hook_ops *ops; + struct nft_hook *hook; + + list_for_each_entry_safe(trans_hook, next, hook_list, list) { + hook = trans_hook->hook; + + if (!(table->flags & NFT_TABLE_F_DORMANT)) { + list_for_each_entry(ops, &hook->ops_list, list) + nf_unregister_net_hook(net, ops); + } + nft_netdev_hook_unlink_free_rcu(hook); + nft_trans_hook_destroy(trans_hook); + } +} + static void nft_netdev_unregister_hooks(struct net *net, struct list_head *hook_list, bool release_netdev) @@ -1946,15 +1972,69 @@ static int nft_nla_put_hook_dev(struct sk_buff *skb, struct nft_hook *hook) return nla_put_string(skb, attr, hook->ifname); } +struct nft_hook_dump_ctx { + struct nft_hook *first; + int n; +}; + +static int nft_dump_basechain_hook_one(struct sk_buff *skb, + struct nft_hook *hook, + struct nft_hook_dump_ctx *dump_ctx) +{ + if (!dump_ctx->first) + dump_ctx->first = hook; + + if (nft_nla_put_hook_dev(skb, hook)) + return -1; + + dump_ctx->n++; + + return 0; +} + +static int nft_dump_basechain_hook_list(struct sk_buff *skb, + const struct net *net, + const struct list_head *hook_list, + struct nft_hook_dump_ctx *dump_ctx) +{ + struct nft_hook *hook; + int err; + + list_for_each_entry_rcu(hook, hook_list, list, + lockdep_commit_lock_is_held(net)) { + err = nft_dump_basechain_hook_one(skb, hook, dump_ctx); + if (err < 0) + return err; + } + + return 0; +} + +static int nft_dump_basechain_trans_hook_list(struct sk_buff *skb, + const struct list_head *trans_hook_list, + struct nft_hook_dump_ctx *dump_ctx) +{ + struct nft_trans_hook *trans_hook; + int err; + + list_for_each_entry(trans_hook, trans_hook_list, list) { + err = nft_dump_basechain_hook_one(skb, trans_hook->hook, dump_ctx); + if (err < 0) + return err; + } + + return 0; +} + static int nft_dump_basechain_hook(struct sk_buff *skb, const struct net *net, int family, const struct nft_base_chain *basechain, - const struct list_head *hook_list) + const struct list_head *hook_list, + const struct list_head *trans_hook_list) { const struct nf_hook_ops *ops = &basechain->ops; - struct nft_hook *hook, *first = NULL; + struct nft_hook_dump_ctx dump_hook_ctx = {}; struct nlattr *nest, *nest_devs; - int n = 0; nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK); if (nest == NULL) @@ -1969,23 +2049,23 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, if (!nest_devs) goto nla_put_failure; - if (!hook_list) + if (!hook_list && !trans_hook_list) hook_list = &basechain->hook_list; - list_for_each_entry_rcu(hook, hook_list, list, - lockdep_commit_lock_is_held(net)) { - if (!first) - first = hook; - - if (nft_nla_put_hook_dev(skb, hook)) - goto nla_put_failure; - n++; + if (hook_list && + nft_dump_basechain_hook_list(skb, net, hook_list, &dump_hook_ctx)) { + goto nla_put_failure; + } else if (trans_hook_list && + nft_dump_basechain_trans_hook_list(skb, trans_hook_list, + &dump_hook_ctx)) { + goto nla_put_failure; } + nla_nest_end(skb, nest_devs); - if (n == 1 && - !hook_is_prefix(first) && - nla_put_string(skb, NFTA_HOOK_DEV, first->ifname)) + if (dump_hook_ctx.n == 1 && + !hook_is_prefix(dump_hook_ctx.first) && + nla_put_string(skb, NFTA_HOOK_DEV, dump_hook_ctx.first->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -1999,7 +2079,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, - const struct list_head *hook_list) + const struct list_head *hook_list, + const struct list_head *trans_hook_list) { struct nlmsghdr *nlh; @@ -2015,7 +2096,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, NFTA_CHAIN_PAD)) goto nla_put_failure; - if (!hook_list && + if (!hook_list && !trans_hook_list && (event == NFT_MSG_DELCHAIN || event == NFT_MSG_DESTROYCHAIN)) { nlmsg_end(skb, nlh); @@ -2026,7 +2107,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, const struct nft_base_chain *basechain = nft_base_chain(chain); struct nft_stats __percpu *stats; - if (nft_dump_basechain_hook(skb, net, family, basechain, hook_list)) + if (nft_dump_basechain_hook(skb, net, family, basechain, + hook_list, trans_hook_list)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CHAIN_POLICY, @@ -2062,7 +2144,8 @@ nla_put_failure: } static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event, - const struct list_head *hook_list) + const struct list_head *hook_list, + const struct list_head *trans_hook_list) { struct nftables_pernet *nft_net; struct sk_buff *skb; @@ -2082,7 +2165,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event, err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, ctx->family, ctx->table, - ctx->chain, hook_list); + ctx->chain, hook_list, trans_hook_list); if (err < 0) { kfree_skb(skb); goto err; @@ -2128,7 +2211,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb, NFT_MSG_NEWCHAIN, NLM_F_MULTI, table->family, table, - chain, NULL) < 0) + chain, NULL, NULL) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -2182,7 +2265,7 @@ static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info, err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN, - 0, family, table, chain, NULL); + 0, family, table, chain, NULL, NULL); if (err < 0) goto err_fill_chain_info; @@ -2345,8 +2428,12 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list, list_for_each_entry(hook, hook_list, list) { if (!strncmp(hook->ifname, this->ifname, - min(hook->ifnamelen, this->ifnamelen))) + min(hook->ifnamelen, this->ifnamelen))) { + if (hook->flags & NFT_HOOK_REMOVE) + continue; + return hook; + } } return NULL; @@ -3105,6 +3192,32 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, return nf_tables_addchain(&ctx, family, policy, flags, extack); } +static int nft_trans_delhook(struct nft_hook *hook, + struct list_head *del_list) +{ + struct nft_trans_hook *trans_hook; + + trans_hook = kmalloc_obj(*trans_hook, GFP_KERNEL); + if (!trans_hook) + return -ENOMEM; + + trans_hook->hook = hook; + list_add_tail(&trans_hook->list, del_list); + hook->flags |= NFT_HOOK_REMOVE; + + return 0; +} + +static void nft_trans_delhook_abort(struct list_head *del_list) +{ + struct nft_trans_hook *trans_hook, *next; + + list_for_each_entry_safe(trans_hook, next, del_list, list) { + trans_hook->hook->flags &= ~NFT_HOOK_REMOVE; + nft_trans_hook_destroy(trans_hook); + } +} + static int nft_delchain_hook(struct nft_ctx *ctx, struct nft_base_chain *basechain, struct netlink_ext_ack *extack) @@ -3131,7 +3244,10 @@ static int nft_delchain_hook(struct nft_ctx *ctx, err = -ENOENT; goto err_chain_del_hook; } - list_move(&hook->list, &chain_del_list); + if (nft_trans_delhook(hook, &chain_del_list) < 0) { + err = -ENOMEM; + goto err_chain_del_hook; + } } trans = nft_trans_alloc_chain(ctx, NFT_MSG_DELCHAIN); @@ -3151,7 +3267,7 @@ static int nft_delchain_hook(struct nft_ctx *ctx, return 0; err_chain_del_hook: - list_splice(&chain_del_list, &basechain->hook_list); + nft_trans_delhook_abort(&chain_del_list); nft_chain_release_hook(&chain_hook); return err; @@ -8941,6 +9057,24 @@ static void nft_hooks_destroy(struct list_head *hook_list) nft_netdev_hook_unlink_free_rcu(hook); } +static void nft_flowtable_unregister_trans_hook(struct net *net, + struct nft_flowtable *flowtable, + struct list_head *hook_list) +{ + struct nft_trans_hook *trans_hook, *next; + struct nf_hook_ops *ops; + struct nft_hook *hook; + + list_for_each_entry_safe(trans_hook, next, hook_list, list) { + hook = trans_hook->hook; + list_for_each_entry(ops, &hook->ops_list, list) + nft_unregister_flowtable_ops(net, flowtable, ops); + + nft_netdev_hook_unlink_free_rcu(hook); + nft_trans_hook_destroy(trans_hook); + } +} + static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, struct nft_flowtable *flowtable, struct netlink_ext_ack *extack) @@ -9199,7 +9333,10 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx, err = -ENOENT; goto err_flowtable_del_hook; } - list_move(&hook->list, &flowtable_del_list); + if (nft_trans_delhook(hook, &flowtable_del_list) < 0) { + err = -ENOMEM; + goto err_flowtable_del_hook; + } } trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE, @@ -9220,7 +9357,7 @@ static int nft_delflowtable_hook(struct nft_ctx *ctx, return 0; err_flowtable_del_hook: - list_splice(&flowtable_del_list, &flowtable->hook_list); + nft_trans_delhook_abort(&flowtable_del_list); nft_flowtable_hook_release(&flowtable_hook); return err; @@ -9285,8 +9422,10 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, struct nft_flowtable *flowtable, - struct list_head *hook_list) + struct list_head *hook_list, + struct list_head *trans_hook_list) { + struct nft_trans_hook *trans_hook; struct nlattr *nest, *nest_devs; struct nft_hook *hook; struct nlmsghdr *nlh; @@ -9303,7 +9442,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, NFTA_FLOWTABLE_PAD)) goto nla_put_failure; - if (!hook_list && + if (!hook_list && !trans_hook_list && (event == NFT_MSG_DELFLOWTABLE || event == NFT_MSG_DESTROYFLOWTABLE)) { nlmsg_end(skb, nlh); @@ -9325,13 +9464,20 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, if (!nest_devs) goto nla_put_failure; - if (!hook_list) + if (!hook_list && !trans_hook_list) hook_list = &flowtable->hook_list; - list_for_each_entry_rcu(hook, hook_list, list, - lockdep_commit_lock_is_held(net)) { - if (nft_nla_put_hook_dev(skb, hook)) - goto nla_put_failure; + if (hook_list) { + list_for_each_entry_rcu(hook, hook_list, list, + lockdep_commit_lock_is_held(net)) { + if (nft_nla_put_hook_dev(skb, hook)) + goto nla_put_failure; + } + } else if (trans_hook_list) { + list_for_each_entry(trans_hook, trans_hook_list, list) { + if (nft_nla_put_hook_dev(skb, trans_hook->hook)) + goto nla_put_failure; + } } nla_nest_end(skb, nest_devs); nla_nest_end(skb, nest); @@ -9385,7 +9531,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, NFT_MSG_NEWFLOWTABLE, NLM_F_MULTI | NLM_F_APPEND, table->family, - flowtable, NULL) < 0) + flowtable, NULL, NULL) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -9485,7 +9631,7 @@ static int nf_tables_getflowtable(struct sk_buff *skb, err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWFLOWTABLE, 0, family, - flowtable, NULL); + flowtable, NULL, NULL); if (err < 0) goto err_fill_flowtable_info; @@ -9498,7 +9644,9 @@ err_fill_flowtable_info: static void nf_tables_flowtable_notify(struct nft_ctx *ctx, struct nft_flowtable *flowtable, - struct list_head *hook_list, int event) + struct list_head *hook_list, + struct list_head *trans_hook_list, + int event) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct sk_buff *skb; @@ -9518,7 +9666,8 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, - ctx->family, flowtable, hook_list); + ctx->family, flowtable, + hook_list, trans_hook_list); if (err < 0) { kfree_skb(skb); goto err; @@ -10052,9 +10201,7 @@ static void nft_commit_release(struct nft_trans *trans) break; case NFT_MSG_DELCHAIN: case NFT_MSG_DESTROYCHAIN: - if (nft_trans_chain_update(trans)) - nft_hooks_destroy(&nft_trans_chain_hooks(trans)); - else + if (!nft_trans_chain_update(trans)) nf_tables_chain_destroy(nft_trans_chain(trans)); break; case NFT_MSG_DELRULE: @@ -10075,9 +10222,7 @@ static void nft_commit_release(struct nft_trans *trans) break; case NFT_MSG_DELFLOWTABLE: case NFT_MSG_DESTROYFLOWTABLE: - if (nft_trans_flowtable_update(trans)) - nft_hooks_destroy(&nft_trans_flowtable_hooks(trans)); - else + if (!nft_trans_flowtable_update(trans)) nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); break; } @@ -10837,31 +10982,28 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) if (nft_trans_chain_update(trans)) { nft_chain_commit_update(nft_trans_container_chain(trans)); nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, - &nft_trans_chain_hooks(trans)); + &nft_trans_chain_hooks(trans), NULL); list_splice_rcu(&nft_trans_chain_hooks(trans), &nft_trans_basechain(trans)->hook_list); /* trans destroyed after rcu grace period */ } else { nft_chain_commit_drop_policy(nft_trans_container_chain(trans)); nft_clear(net, nft_trans_chain(trans)); - nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, NULL); + nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, NULL, NULL); nft_trans_destroy(trans); } break; case NFT_MSG_DELCHAIN: case NFT_MSG_DESTROYCHAIN: if (nft_trans_chain_update(trans)) { - nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN, + nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN, NULL, &nft_trans_chain_hooks(trans)); - if (!(table->flags & NFT_TABLE_F_DORMANT)) { - nft_netdev_unregister_hooks(net, - &nft_trans_chain_hooks(trans), - true); - } + nft_netdev_unregister_trans_hook(net, table, + &nft_trans_chain_hooks(trans)); } else { nft_chain_del(nft_trans_chain(trans)); nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN, - NULL); + NULL, NULL); nf_tables_unregister_hook(ctx.net, ctx.table, nft_trans_chain(trans)); } @@ -10967,6 +11109,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans), + NULL, NFT_MSG_NEWFLOWTABLE); list_splice_rcu(&nft_trans_flowtable_hooks(trans), &nft_trans_flowtable(trans)->hook_list); @@ -10975,6 +11118,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), NULL, + NULL, NFT_MSG_NEWFLOWTABLE); } nft_trans_destroy(trans); @@ -10984,16 +11128,18 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) if (nft_trans_flowtable_update(trans)) { nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), + NULL, &nft_trans_flowtable_hooks(trans), trans->msg_type); - nft_unregister_flowtable_net_hooks(net, - nft_trans_flowtable(trans), - &nft_trans_flowtable_hooks(trans)); + nft_flowtable_unregister_trans_hook(net, + nft_trans_flowtable(trans), + &nft_trans_flowtable_hooks(trans)); } else { list_del_rcu(&nft_trans_flowtable(trans)->list); nf_tables_flowtable_notify(&ctx, nft_trans_flowtable(trans), NULL, + NULL, trans->msg_type); nft_unregister_flowtable_net_hooks(net, nft_trans_flowtable(trans), @@ -11157,8 +11303,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DELCHAIN: case NFT_MSG_DESTROYCHAIN: if (nft_trans_chain_update(trans)) { - list_splice(&nft_trans_chain_hooks(trans), - &nft_trans_basechain(trans)->hook_list); + nft_trans_delhook_abort(&nft_trans_chain_hooks(trans)); } else { nft_use_inc_restore(&table->use); nft_clear(trans->net, nft_trans_chain(trans)); @@ -11272,8 +11417,7 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_DELFLOWTABLE: case NFT_MSG_DESTROYFLOWTABLE: if (nft_trans_flowtable_update(trans)) { - list_splice(&nft_trans_flowtable_hooks(trans), - &nft_trans_flowtable(trans)->hook_list); + nft_trans_delhook_abort(&nft_trans_flowtable_hooks(trans)); } else { nft_use_inc_restore(&table->use); nft_clear(trans->net, nft_trans_flowtable(trans)); -- cgit v1.2.3 From bd7b7ce96db4487bb77692a85ee4489fd2c395df Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Wed, 22 Apr 2026 12:06:36 -0700 Subject: nvme-auth: Hash DH shared secret to create session key The NVMe Base Specification 8.3.5.5.9 states that the session key Ks shall be computed from the ephemeral DH key by applying the hash function selected by the HashID parameter. The current implementation stores the raw DH shared secret as the session key without hashing it. This causes redundant hash operations: 1. Augmented challenge computation (section 8.3.5.5.4) requires Ca = HMAC(H(g^xy mod p), C). The code compensates by hashing the unhashed session key in nvme_auth_augmented_challenge() to produce the correct result. 2. PSK generation (section 8.3.5.5.9) requires PSK = HMAC(Ks, C1 || C2) where Ks should already be H(g^xy mod p). As the DH shared secret is always larger than the HMAC block size, HMAC internally hashes it before use, accidentally producing the correct result. When using secure channel concatenation with bidirectional authentication, this results in hashing the DH value three times: twice for augmented challenge calculations and once during PSK generation. Fix this by: - Modifying nvme_auth_gen_shared_secret() to hash the DH shared secret once after computation: Ks = H(g^xy mod p) - Removing the hash operation from nvme_auth_augmented_challenge() as the session key is now already hashed - Updating session key buffer size from DH key size to hash output size - Adding specification references in comments This avoid storing the raw DH shared secret and reduces the number of hash operations from three to one when using secure channel concatenation. Reviewed-by: Hannes Reinecke Reviewed-by: Eric Biggers Signed-off-by: Chris Leech Signed-off-by: Keith Busch --- drivers/nvme/common/auth.c | 94 ++++++++++++++++++++++++++++++++++++---------- drivers/nvme/host/auth.c | 13 ++++--- drivers/nvme/target/auth.c | 15 ++++---- include/linux/nvme-auth.h | 6 +-- 4 files changed, 92 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 2d325fb93083..77f1d22512f8 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -351,18 +351,29 @@ struct nvme_dhchap_key *nvme_auth_transform_key( } EXPORT_SYMBOL_GPL(nvme_auth_transform_key); +/** + * nvme_auth_augmented_challenge() - Compute the augmented DH-HMAC-CHAP challenge + * @hmac_id: Hash algorithm identifier + * @skey: Session key + * @skey_len: Length of @skey + * @challenge: Challenge value + * @aug: Output buffer for the augmented challenge + * @hlen: Hash output length (length of @challenge and @aug) + * + * NVMe base specification 8.3.5.5.4: The augmented challenge is computed + * applying the HMAC function using the hash function H() selected by the + * HashID parameter ... with the hash of the ephemeral DH key ... as HMAC key + * to the challenge C (i.e., Ca = HMAC(H(g^xy mod p), C)). + * + * As the session key skey is already H(g^xy mod p) per section 8.3.5.5.9, use + * it directly as the HMAC key without additional hashing. + * + * Return: 0 on success, negative errno on failure. + */ int nvme_auth_augmented_challenge(u8 hmac_id, const u8 *skey, size_t skey_len, const u8 *challenge, u8 *aug, size_t hlen) { - u8 hashed_key[NVME_AUTH_MAX_DIGEST_SIZE]; - int ret; - - ret = nvme_auth_hash(hmac_id, skey, skey_len, hashed_key); - if (ret) - return ret; - ret = nvme_auth_hmac(hmac_id, hashed_key, hlen, challenge, hlen, aug); - memzero_explicit(hashed_key, sizeof(hashed_key)); - return ret; + return nvme_auth_hmac(hmac_id, skey, skey_len, challenge, hlen, aug); } EXPORT_SYMBOL_GPL(nvme_auth_augmented_challenge); @@ -403,33 +414,76 @@ int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm, } EXPORT_SYMBOL_GPL(nvme_auth_gen_pubkey); -int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, - const u8 *ctrl_key, size_t ctrl_key_len, - u8 *sess_key, size_t sess_key_len) +/** + * nvme_auth_gen_session_key() - Generate an ephemeral session key + * @dh_tfm: Diffie-Hellman transform with local private key already set + * @public_key: Peer's public key + * @public_key_len: Length of @public_key + * @sess_key: Output buffer for the session key + * @sess_key_len: Size of @sess_key buffer + * @hash_id: Hash algorithm identifier + * + * NVMe base specification 8.3.5.5.9: The session key Ks shall be computed from + * the ephemeral DH key (i.e., g^xy mod p) ... by applying the hash function + * H() selected by the HashID parameter ... (i.e., Ks = H(g^xy mod p)). + * + * Return: 0 on success, negative errno on failure. + */ +int nvme_auth_gen_session_key(struct crypto_kpp *dh_tfm, + const u8 *public_key, size_t public_key_len, + u8 *sess_key, size_t sess_key_len, u8 hash_id) { struct kpp_request *req; struct crypto_wait wait; struct scatterlist src, dst; + u8 *dh_secret; + size_t dh_secret_len, hash_len; int ret; - req = kpp_request_alloc(dh_tfm, GFP_KERNEL); - if (!req) + hash_len = nvme_auth_hmac_hash_len(hash_id); + if (!hash_len) { + pr_warn("%s: invalid hash algorithm %d\n", __func__, hash_id); + return -EINVAL; + } + + if (sess_key_len != hash_len) { + pr_warn("%s: sess_key buffer missized (%zu != %zu)\n", + __func__, sess_key_len, hash_len); + return -EINVAL; + } + + dh_secret_len = crypto_kpp_maxsize(dh_tfm); + dh_secret = kzalloc(dh_secret_len, GFP_KERNEL); + if (!dh_secret) return -ENOMEM; + req = kpp_request_alloc(dh_tfm, GFP_KERNEL); + if (!req) { + ret = -ENOMEM; + goto out_free_secret; + } + crypto_init_wait(&wait); - sg_init_one(&src, ctrl_key, ctrl_key_len); - kpp_request_set_input(req, &src, ctrl_key_len); - sg_init_one(&dst, sess_key, sess_key_len); - kpp_request_set_output(req, &dst, sess_key_len); + sg_init_one(&src, public_key, public_key_len); + kpp_request_set_input(req, &src, public_key_len); + sg_init_one(&dst, dh_secret, dh_secret_len); + kpp_request_set_output(req, &dst, dh_secret_len); kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &wait); ret = crypto_wait_req(crypto_kpp_compute_shared_secret(req), &wait); - kpp_request_free(req); + + if (ret) + goto out_free_secret; + + ret = nvme_auth_hash(hash_id, dh_secret, dh_secret_len, sess_key); + +out_free_secret: + kfree_sensitive(dh_secret); return ret; } -EXPORT_SYMBOL_GPL(nvme_auth_gen_shared_secret); +EXPORT_SYMBOL_GPL(nvme_auth_gen_session_key); int nvme_auth_parse_key(const char *secret, struct nvme_dhchap_key **ret_key) { diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 63f543e80998..16de4499a8e7 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -588,7 +588,7 @@ static int nvme_auth_dhchap_exponential(struct nvme_ctrl *ctrl, } gen_sesskey: - chap->sess_key_len = chap->host_key_len; + chap->sess_key_len = chap->hash_len; chap->sess_key = kmalloc(chap->sess_key_len, GFP_KERNEL); if (!chap->sess_key) { chap->sess_key_len = 0; @@ -596,16 +596,17 @@ gen_sesskey: return -ENOMEM; } - ret = nvme_auth_gen_shared_secret(chap->dh_tfm, - chap->ctrl_key, chap->ctrl_key_len, - chap->sess_key, chap->sess_key_len); + ret = nvme_auth_gen_session_key(chap->dh_tfm, + chap->ctrl_key, chap->ctrl_key_len, + chap->sess_key, chap->sess_key_len, + chap->hash_id); if (ret) { dev_dbg(ctrl->device, - "failed to generate shared secret, error %d\n", ret); + "failed to generate session key, error %d\n", ret); chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return ret; } - dev_dbg(ctrl->device, "shared secret %*ph\n", + dev_dbg(ctrl->device, "session key %*ph\n", (int)chap->sess_key_len, chap->sess_key); return 0; } diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index c35c427ca2ac..9a2eccdc8b13 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -447,18 +447,19 @@ int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, struct nvmet_ctrl *ctrl = req->sq->ctrl; int ret; - req->sq->dhchap_skey_len = ctrl->dh_keysize; + req->sq->dhchap_skey_len = nvme_auth_hmac_hash_len(ctrl->shash_id); req->sq->dhchap_skey = kzalloc(req->sq->dhchap_skey_len, GFP_KERNEL); if (!req->sq->dhchap_skey) return -ENOMEM; - ret = nvme_auth_gen_shared_secret(ctrl->dh_tfm, - pkey, pkey_size, - req->sq->dhchap_skey, - req->sq->dhchap_skey_len); + ret = nvme_auth_gen_session_key(ctrl->dh_tfm, + pkey, pkey_size, + req->sq->dhchap_skey, + req->sq->dhchap_skey_len, + ctrl->shash_id); if (ret) - pr_debug("failed to compute shared secret, err %d\n", ret); + pr_debug("failed to compute session key, err %d\n", ret); else - pr_debug("%s: shared secret %*ph\n", __func__, + pr_debug("%s: session key %*ph\n", __func__, (int)req->sq->dhchap_skey_len, req->sq->dhchap_skey); diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index 184a1f9510fa..89902ae8b929 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -49,9 +49,9 @@ int nvme_auth_augmented_challenge(u8 hmac_id, const u8 *skey, size_t skey_len, int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid); int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm, u8 *host_key, size_t host_key_len); -int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, - const u8 *ctrl_key, size_t ctrl_key_len, - u8 *sess_key, size_t sess_key_len); +int nvme_auth_gen_session_key(struct crypto_kpp *dh_tfm, + const u8 *public_key, size_t public_key_len, + u8 *sess_key, size_t sess_key_len, u8 hash_id); int nvme_auth_generate_psk(u8 hmac_id, const u8 *skey, size_t skey_len, const u8 *c1, const u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len); -- cgit v1.2.3 From 0b13173d27fa15679463b62a10cfa8b3d6c3a71c Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 15 Apr 2026 13:41:01 +0800 Subject: dma-buf: fix stale @lock references in struct dma_buf documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kernel-doc comments for vmapping_counter and vmap_ptr in struct dma_buf reference "@lock" as the protecting lock, but struct dma_buf no longer has a "lock" member. The mutex was removed in favor of using the dma_resv lock exclusively. The implementation correctly uses dma_resv_assert_held(dmabuf->resv) in dma_buf_vmap() and dma_buf_vunmap(), so update the documentation to reference @resv instead. Signed-off-by: gaoxiang17 Reviewed-by: Christian König Signed-off-by: Christian König Link: https://lore.kernel.org/r/20260415054101.535520-1-gxxa03070307@gmail.com --- include/linux/dma-buf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 133b9e637b55..ef6d93fd7a2c 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -322,13 +322,13 @@ struct dma_buf { * @vmapping_counter: * * Used internally to refcnt the vmaps returned by dma_buf_vmap(). - * Protected by @lock. + * Protected by @resv. */ unsigned vmapping_counter; /** * @vmap_ptr: - * The current vmap ptr if @vmapping_counter > 0. Protected by @lock. + * The current vmap ptr if @vmapping_counter > 0. Protected by @resv. */ struct iosys_map vmap_ptr; -- cgit v1.2.3 From 619eab23e1ce7c97e54bfc5a417306d94b3f6f13 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 21 Apr 2026 11:21:50 +0100 Subject: mm/vma: do not try to unmap a VMA if mmap_prepare() invoked from mmap() The mmap_prepare hook functionality includes the ability to invoke mmap_prepare() from the mmap() hook of existing 'stacked' drivers, that is ones which are capable of calling the mmap hooks of other drivers/file systems (e.g. overlayfs, shm). As part of the mmap_prepare action functionality, we deal with errors by unmapping the VMA should one arise. This works in the usual mmap_prepare case, as we invoke this action at the last moment, when the VMA is established in the maple tree. However, the mmap() hook passes a not-fully-established VMA pointer to the caller (which is the motivation behind the mmap_prepare() work), which is detached. So attempting to unmap a VMA in this state will be problematic, with the most obvious symptom being a warning in vma_mark_detached(), because the VMA is already detached. It's also unncessary - the mmap() handler will clean up the VMA on error. So to fix this issue, this patch propagates whether or not an mmap action is being completed via the compatibility layer or directly. If the former, then we do not attempt VMA cleanup, if the latter, then we do. This patch also updates the userland VMA tests to reflect the change. Link: https://lore.kernel.org/20260421102150.189982-1-ljs@kernel.org Fixes: ac0a3fc9c07d ("mm: add ability to take further action in vm_area_desc") Signed-off-by: Lorenzo Stoakes Reported-by: syzbot+db390288d141a1dccf96@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/69e69734.050a0220.24bfd3.0027.GAE@google.com/ Cc: David Hildenbrand Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Pedro Falcato Cc: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/util.c | 26 +++++++++++++++++--------- mm/vma.c | 3 ++- tools/testing/vma/include/dup.h | 2 +- tools/testing/vma/include/stubs.h | 3 ++- 5 files changed, 23 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0b776907152e..af23453e9dbd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4391,7 +4391,7 @@ static inline void mmap_action_map_kernel_pages_full(struct vm_area_desc *desc, int mmap_action_prepare(struct vm_area_desc *desc); int mmap_action_complete(struct vm_area_struct *vma, - struct mmap_action *action); + struct mmap_action *action, bool is_compat); /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, diff --git a/mm/util.c b/mm/util.c index 232c3930a662..3cc949a0b7ed 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1232,7 +1232,7 @@ int __compat_vma_mmap(struct vm_area_desc *desc, /* Update the VMA from the descriptor. */ compat_set_vma_from_desc(vma, desc); /* Complete any specified mmap actions. */ - return mmap_action_complete(vma, &desc->action); + return mmap_action_complete(vma, &desc->action, /*is_compat=*/true); } EXPORT_SYMBOL(__compat_vma_mmap); @@ -1389,7 +1389,8 @@ static int call_vma_mapped(struct vm_area_struct *vma) } static int mmap_action_finish(struct vm_area_struct *vma, - struct mmap_action *action, int err) + struct mmap_action *action, int err, + bool is_compat) { size_t len; @@ -1400,8 +1401,12 @@ static int mmap_action_finish(struct vm_area_struct *vma, /* do_munmap() might take rmap lock, so release if held. */ maybe_rmap_unlock_action(vma, action); - if (!err) - return 0; + /* + * If this is invoked from the compatibility layer, post-mmap() hook + * logic will handle cleanup for us. + */ + if (!err || is_compat) + return err; /* * If an error occurs, unmap the VMA altogether and return an error. We @@ -1451,13 +1456,15 @@ EXPORT_SYMBOL(mmap_action_prepare); * mmap_action_complete - Execute VMA descriptor action. * @vma: The VMA to perform the action upon. * @action: The action to perform. + * @is_compat: Is this being invoked from the compatibility layer? * * Similar to mmap_action_prepare(). * - * Return: 0 on success, or error, at which point the VMA will be unmapped. + * Return: 0 on success, or error, at which point the VMA will be unmapped if + * !@is_compat. */ int mmap_action_complete(struct vm_area_struct *vma, - struct mmap_action *action) + struct mmap_action *action, bool is_compat) { int err = 0; @@ -1478,7 +1485,7 @@ int mmap_action_complete(struct vm_area_struct *vma, break; } - return mmap_action_finish(vma, action, err); + return mmap_action_finish(vma, action, err, is_compat); } EXPORT_SYMBOL(mmap_action_complete); #else @@ -1500,7 +1507,8 @@ int mmap_action_prepare(struct vm_area_desc *desc) EXPORT_SYMBOL(mmap_action_prepare); int mmap_action_complete(struct vm_area_struct *vma, - struct mmap_action *action) + struct mmap_action *action, + bool is_compat) { int err = 0; @@ -1517,7 +1525,7 @@ int mmap_action_complete(struct vm_area_struct *vma, break; } - return mmap_action_finish(vma, action, err); + return mmap_action_finish(vma, action, err, is_compat); } EXPORT_SYMBOL(mmap_action_complete); #endif diff --git a/mm/vma.c b/mm/vma.c index 377321b48734..d90791b00a7b 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2780,7 +2780,8 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, __mmap_complete(&map, vma); if (have_mmap_prepare && allocated_new) { - error = mmap_action_complete(vma, &desc.action); + error = mmap_action_complete(vma, &desc.action, + /*is_compat=*/false); if (error) return error; } diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h index b4864aad2db0..9e0dfd3a85b0 100644 --- a/tools/testing/vma/include/dup.h +++ b/tools/testing/vma/include/dup.h @@ -1330,7 +1330,7 @@ static inline int __compat_vma_mmap(struct vm_area_desc *desc, /* Update the VMA from the descriptor. */ compat_set_vma_from_desc(vma, desc); /* Complete any specified mmap actions. */ - return mmap_action_complete(vma, &desc->action); + return mmap_action_complete(vma, &desc->action, /*is_compat=*/true); } static inline int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/tools/testing/vma/include/stubs.h b/tools/testing/vma/include/stubs.h index a30b8bc84955..64164e25658f 100644 --- a/tools/testing/vma/include/stubs.h +++ b/tools/testing/vma/include/stubs.h @@ -87,7 +87,8 @@ static inline int mmap_action_prepare(struct vm_area_desc *desc) } static inline int mmap_action_complete(struct vm_area_struct *vma, - struct mmap_action *action) + struct mmap_action *action, + bool is_compat) { return 0; } -- cgit v1.2.3 From 77a50e9652ac3c669c6690088bce97d960f5fd17 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 22 Apr 2026 14:43:10 -0400 Subject: MAINTAINERS: update Liam's email address Switching to private email address. Update all contact information Add an entry to mailmap at the same time. Link: https://lore.kernel.org/20260422184310.2682901-1-liam@infradead.org Signed-off-by: Liam R. Howlett Signed-off-by: Andrew Morton --- .mailmap | 1 + MAINTAINERS | 20 ++++++++++---------- include/linux/maple_tree.h | 2 +- lib/maple_tree.c | 2 +- lib/test_maple_tree.c | 4 ++-- tools/testing/radix-tree/maple.c | 2 +- 6 files changed, 16 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/.mailmap b/.mailmap index a8fd13adf226..f1eafd91d2c2 100644 --- a/.mailmap +++ b/.mailmap @@ -496,6 +496,7 @@ Leon Romanovsky Leon Romanovsky Leon Romanovsky Leo Yan +Liam R. Howlett Liam Mark Linas Vepstas Linus Lüssing diff --git a/MAINTAINERS b/MAINTAINERS index d6f017581d54..be2e017b3a9d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15399,7 +15399,7 @@ F: include/net/netns/mctp.h F: net/mctp/ MAPLE TREE -M: Liam R. Howlett +M: Liam R. Howlett R: Alice Ryhl R: Andrew Ballance L: maple-tree@lists.infradead.org @@ -16759,7 +16759,7 @@ MEMORY MANAGEMENT - CORE M: Andrew Morton M: David Hildenbrand R: Lorenzo Stoakes -R: Liam R. Howlett +R: Liam R. Howlett R: Vlastimil Babka R: Mike Rapoport R: Suren Baghdasaryan @@ -16895,7 +16895,7 @@ MEMORY MANAGEMENT - MISC M: Andrew Morton M: David Hildenbrand R: Lorenzo Stoakes -R: Liam R. Howlett +R: Liam R. Howlett R: Vlastimil Babka R: Mike Rapoport R: Suren Baghdasaryan @@ -16997,7 +16997,7 @@ M: Andrew Morton M: David Hildenbrand M: Lorenzo Stoakes R: Rik van Riel -R: Liam R. Howlett +R: Liam R. Howlett R: Vlastimil Babka R: Harry Yoo R: Jann Horn @@ -17044,7 +17044,7 @@ M: David Hildenbrand M: Lorenzo Stoakes R: Zi Yan R: Baolin Wang -R: Liam R. Howlett +R: Liam R. Howlett R: Nico Pache R: Ryan Roberts R: Dev Jain @@ -17082,7 +17082,7 @@ F: tools/testing/selftests/mm/uffd-*.[ch] MEMORY MANAGEMENT - RUST M: Alice Ryhl R: Lorenzo Stoakes -R: Liam R. Howlett +R: Liam R. Howlett L: linux-mm@kvack.org L: rust-for-linux@vger.kernel.org S: Maintained @@ -17096,7 +17096,7 @@ F: rust/kernel/page.rs MEMORY MAPPING M: Andrew Morton -M: Liam R. Howlett +M: Liam R. Howlett M: Lorenzo Stoakes R: Vlastimil Babka R: Jann Horn @@ -17128,7 +17128,7 @@ F: tools/testing/vma/ MEMORY MAPPING - LOCKING M: Andrew Morton M: Suren Baghdasaryan -M: Liam R. Howlett +M: Liam R. Howlett M: Lorenzo Stoakes R: Vlastimil Babka R: Shakeel Butt @@ -17143,7 +17143,7 @@ F: mm/mmap_lock.c MEMORY MAPPING - MADVISE (MEMORY ADVICE) M: Andrew Morton -M: Liam R. Howlett +M: Liam R. Howlett M: Lorenzo Stoakes M: David Hildenbrand R: Vlastimil Babka @@ -23370,7 +23370,7 @@ RUST [ALLOC] M: Danilo Krummrich R: Lorenzo Stoakes R: Vlastimil Babka -R: Liam R. Howlett +R: Liam R. Howlett R: Uladzislau Rezki L: rust-for-linux@vger.kernel.org S: Maintained diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 0c464eade1d6..4a5631906aff 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -4,7 +4,7 @@ /* * Maple Tree - An RCU-safe adaptive tree for storing ranges * Copyright (c) 2018-2022 Oracle - * Authors: Liam R. Howlett + * Authors: Liam R. Howlett * Matthew Wilcox */ diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d18d7ed9ab67..60ae5e6fc1ee 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2,7 +2,7 @@ /* * Maple Tree implementation * Copyright (c) 2018-2022 Oracle Corporation - * Authors: Liam R. Howlett + * Authors: Liam R. Howlett * Matthew Wilcox * Copyright (c) 2023 ByteDance * Author: Peng Zhang diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 434d8a2fdd99..b9367c61e8b5 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -2,7 +2,7 @@ /* * test_maple_tree.c: Test the maple tree API * Copyright (c) 2018-2022 Oracle Corporation - * Author: Liam R. Howlett + * Author: Liam R. Howlett * * Any tests that only require the interface of the tree. */ @@ -4021,6 +4021,6 @@ static void __exit maple_tree_harvest(void) module_init(maple_tree_seed); module_exit(maple_tree_harvest); -MODULE_AUTHOR("Liam R. Howlett "); +MODULE_AUTHOR("Liam R. Howlett "); MODULE_DESCRIPTION("maple tree API test module"); MODULE_LICENSE("GPL"); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index feedd5ab7058..0607913a3022 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -2,7 +2,7 @@ /* * maple_tree.c: Userspace testing for maple tree test-suite * Copyright (c) 2018-2022 Oracle Corporation - * Author: Liam R. Howlett + * Author: Liam R. Howlett * * Any tests that require internal knowledge of the tree or threads and other * difficult to handle in kernel tests. -- cgit v1.2.3 From 5e25407b68f460142539536e31fa20338db6146f Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 10 Apr 2026 19:41:03 +0200 Subject: mtd: spinand: Add support for packed read data ODTR commands Some devices stuff address bits in the double byte opcode (in place of the repeated byte) in order to be able to increase the size of the devices, without adding extra address bytes. Create a flag to identify those devices. When the flag is set, use the "packed" variant for the read data operation. Signed-off-by: Miquel Raynal --- drivers/mtd/nand/spi/core.c | 24 +++++++++++++++++++++--- include/linux/mtd/spinand.h | 7 +++++++ 2 files changed, 28 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c index 8aa3753aaaa1..0b076790bd9d 100644 --- a/drivers/mtd/nand/spi/core.c +++ b/drivers/mtd/nand/spi/core.c @@ -100,6 +100,17 @@ spinand_fill_page_read_op(struct spinand_device *spinand, u64 addr) return op; } +static struct spi_mem_op +spinand_fill_page_read_packed_op(struct spinand_device *spinand, u64 addr) +{ + struct spi_mem_op op = spinand->op_templates->page_read; + + op.cmd.opcode |= addr >> 16; + op.addr.val = addr & 0xFFFF; + + return op; +} + struct spi_mem_op spinand_fill_prog_exec_op(struct spinand_device *spinand, u64 addr) { @@ -453,7 +464,10 @@ static int spinand_load_page_op(struct spinand_device *spinand, { struct nand_device *nand = spinand_to_nand(spinand); unsigned int row = nanddev_pos_to_row(nand, &req->pos); - struct spi_mem_op op = SPINAND_OP(spinand, page_read, row); + bool packed = spinand->flags & SPINAND_ODTR_PACKED_PAGE_READ; + struct spi_mem_op op = packed ? + SPINAND_OP(spinand, page_read_packed, row) : + SPINAND_OP(spinand, page_read, row); return spi_mem_exec_op(spinand->spimem, &op); } @@ -1489,9 +1503,13 @@ static int spinand_init_odtr_instruction_set(struct spinand_device *spinand) if (!spi_mem_supports_op(spinand->spimem, &tmpl->blk_erase)) return -EOPNOTSUPP; - tmpl->page_read = (struct spi_mem_op)SPINAND_PAGE_READ_8D_8D_0_OP(0); - if (!spi_mem_supports_op(spinand->spimem, &tmpl->page_read)) + if (spinand->flags & SPINAND_ODTR_PACKED_PAGE_READ) + tmpl->page_read = (struct spi_mem_op)SPINAND_PAGE_READ_PACKED_8D_8D_0_OP(0); + else + tmpl->page_read = (struct spi_mem_op)SPINAND_PAGE_READ_8D_8D_0_OP(0); + if (!spi_mem_supports_op(spinand->spimem, &tmpl->page_read)) { return -EOPNOTSUPP; + } tmpl->prog_exec = (struct spi_mem_op)SPINAND_PROG_EXEC_8D_8D_0_OP(0); if (!spi_mem_supports_op(spinand->spimem, &tmpl->prog_exec)) diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 58abd306ebe3..782984ba3a20 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -290,6 +290,12 @@ SPI_MEM_OP_NO_DUMMY, \ SPI_MEM_OP_NO_DATA) +#define SPINAND_PAGE_READ_PACKED_8D_8D_0_OP(addr) \ + SPI_MEM_OP(SPI_MEM_DTR_OP_PACKED_CMD(0x13, addr >> 16, 8), \ + SPI_MEM_DTR_OP_ADDR(2, addr & 0xffff, 8), \ + SPI_MEM_OP_NO_DUMMY, \ + SPI_MEM_OP_NO_DATA) + #define SPINAND_PAGE_READ_FROM_CACHE_8D_8D_8D_OP(addr, ndummy, buf, len, freq) \ SPI_MEM_OP(SPI_MEM_DTR_OP_RPT_CMD(0x9d, 8), \ SPI_MEM_DTR_OP_ADDR(2, addr, 8), \ @@ -483,6 +489,7 @@ struct spinand_ecc_info { #define SPINAND_HAS_PROG_PLANE_SELECT_BIT BIT(2) #define SPINAND_HAS_READ_PLANE_SELECT_BIT BIT(3) #define SPINAND_NO_RAW_ACCESS BIT(4) +#define SPINAND_ODTR_PACKED_PAGE_READ BIT(5) /** * struct spinand_ondie_ecc_conf - private SPI-NAND on-die ECC engine structure -- cgit v1.2.3 From 1f6008538384453eb4c13a3d7ff9e37ee8aee6b9 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 21 Apr 2026 08:02:15 -0700 Subject: ACPICA: Provide #defines for EINJV2 error types EINJV2 defined new error types by moving the severity (correctable, uncorrectable non-fatal, uncorrectable fatal) out of the "type". ACPI 6.5 introduced EINJV2 and defined a vendor defined error type using bit 31. This was dropped in ACPI 6.6. Link: https://github.com/acpica/acpica/commit/e82d2d2fd145 Signed-off-by: Tony Luck Link: https://patch.msgid.link/20260421150216.11666-2-tony.luck@intel.com Signed-off-by: Rafael J. Wysocki --- include/acpi/actbl1.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h index 4e15583e0d25..f72e00517eb3 100644 --- a/include/acpi/actbl1.h +++ b/include/acpi/actbl1.h @@ -1386,6 +1386,12 @@ enum acpi_einj_command_status { #define ACPI_EINJ_CXL_MEM_FATAL (1<<17) #define ACPI_EINJ_VENDOR_DEFINED (1<<31) +/* EINJV2 error types from EINJV2_GET_ERROR_TYPE (ACPI 6.6) */ + +#define ACPI_EINJV2_PROCESSOR (1) +#define ACPI_EINJV2_MEMORY (1<<1) +#define ACPI_EINJV2_PCIE (1<<2) + /******************************************************************************* * * ERST - Error Record Serialization Table (ACPI 4.0) -- cgit v1.2.3 From ea216d3ae7305ad2c8256524e65b7219492d8685 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 27 Apr 2026 13:22:38 +0200 Subject: ACPI: bus: add missing forward declaration to acpi_bus.h The header references struct notifier_block but neither includes linux/notifier.h nor contains the relevant forward declaration. Add the latter for correctness. Signed-off-by: Bartosz Golaszewski [ rjw: Subject tweak ] Link: https://patch.msgid.link/20260427112238.132419-1-bartosz.golaszewski@oss.qualcomm.com Signed-off-by: Rafael J. Wysocki --- include/acpi/acpi_bus.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index b701b5f972cb..c41d9a7565cf 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -17,6 +17,8 @@ #include #include +struct notifier_block; + struct acpi_handle_list { u32 count; acpi_handle *handles; -- cgit v1.2.3 From 0898a817621a2f0cddca8122d9b974003fe5036d Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Mon, 27 Apr 2026 22:01:39 +0100 Subject: cdrom, scsi: sr: propagate read-only status to block layer via set_disk_ro() The cdrom core never calls set_disk_ro() for a registered device, so BLKROGET on a CD-ROM device always returns 0 (writable), even when the drive has no write capabilities and writes will inevitably fail. This causes problems for userspace that relies on BLKROGET to determine whether a block device is read-only. For example, systemd's loop device setup uses BLKROGET to decide whether to create a loop device with LO_FLAGS_READ_ONLY. Without the read-only flag, writes pass through the loop device to the CD-ROM and fail with I/O errors. systemd-fsck similarly checks BLKROGET to decide whether to run fsck in no-repair mode (-n). The write-capability bits in cdi->mask come from two different sources: CDC_DVD_RAM and CDC_CD_RW are populated by the driver from the MODE SENSE capabilities page (page 0x2A) before register_cdrom() is called, while CDC_MRW_W and CDC_RAM require the MMC GET CONFIGURATION command and were only probed by cdrom_open_write() at device open time. This meant that any attempt to compute the writable state from the full mask at probe time was incorrect, because the GET CONFIGURATION bits were still unset (and cdi->mask is initialized such that capabilities are assumed present). Fix this by factoring the GET CONFIGURATION probing out of cdrom_open_write() into a new exported helper, cdrom_probe_write_features(), and having sr call it from sr_probe() right after get_capabilities() has populated the MODE SENSE bits. register_cdrom() then calls set_disk_ro() based on the full write-capability mask (CDC_DVD_RAM | CDC_MRW_W | CDC_RAM | CDC_CD_RW) so the block layer reflects the drive's actual write support. The feature queries used (CDF_MRW and CDF_RWRT via GET CONFIGURATION with RT=00) report drive-level capabilities that are persistent across media, so a single probe before register_cdrom() is sufficient and the redundant probe at open time is dropped. With set_disk_ro() now accurate, the long-vestigial cd->writeable flag in sr can go: get_capabilities() used to set cd->writeable based on the same four mask bits, but because CDC_MRW_W and CDC_RAM default to "capability present" in cdi->mask and aren't touched by MODE SENSE, the condition that gated cd->writeable was always true, making it unconditionally 1. Replace the corresponding gate in sr_init_command() with get_disk_ro(cd->disk), which turns a previously no-op check into a real one and also catches kernel-internal bio writers that bypass blkdev_write_iter()'s bdev_read_only() check. The sd driver (SCSI disks) does not have this problem because it checks the MODE SENSE Write Protect bit and calls set_disk_ro() accordingly. The sr driver cannot use the same approach because the MMC specification does not define the WP bit in the MODE SENSE device-specific parameter byte for CD-ROM devices. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Daan De Meyer Reviewed-by: Phillip Potter Reviewed-by: Martin K. Petersen Signed-off-by: Phillip Potter Link: https://patch.msgid.link/20260427210139.1400-2-phil@philpotter.co.uk Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 73 +++++++++++++++++++++++++++++++++------------------ drivers/scsi/sr.c | 11 ++------ drivers/scsi/sr.h | 1 - include/linux/cdrom.h | 1 + 4 files changed, 51 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index fc049612d6dc..62934cf4b10d 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -631,6 +631,16 @@ int register_cdrom(struct gendisk *disk, struct cdrom_device_info *cdi) WARN_ON(!cdo->generic_packet); + /* + * Propagate the drive's write support to the block layer so BLKROGET + * reflects actual write capability. Drivers that use GET CONFIGURATION + * features (CDC_MRW_W, CDC_RAM) must have called + * cdrom_probe_write_features() before register_cdrom() so the mask is + * complete here. + */ + set_disk_ro(disk, !CDROM_CAN(CDC_DVD_RAM | CDC_MRW_W | CDC_RAM | + CDC_CD_RW)); + cd_dbg(CD_REG_UNREG, "drive \"/dev/%s\" registered\n", cdi->name); mutex_lock(&cdrom_mutex); list_add(&cdi->list, &cdrom_list); @@ -742,6 +752,44 @@ static int cdrom_is_random_writable(struct cdrom_device_info *cdi, int *write) return 0; } +/* + * Probe write-related MMC features via GET CONFIGURATION and update + * cdi->mask accordingly. Drivers that populate cdi->mask from the MODE SENSE + * capabilities page (e.g. sr) should call this after those MODE SENSE bits + * have been set but before register_cdrom(), so that the full set of + * write-capability bits is known by the time register_cdrom() decides on the + * initial read-only state of the disk. + */ +void cdrom_probe_write_features(struct cdrom_device_info *cdi) +{ + int mrw, mrw_write, ram_write; + + mrw = 0; + if (!cdrom_is_mrw(cdi, &mrw_write)) + mrw = 1; + + if (CDROM_CAN(CDC_MO_DRIVE)) + ram_write = 1; + else + (void) cdrom_is_random_writable(cdi, &ram_write); + + if (mrw) + cdi->mask &= ~CDC_MRW; + else + cdi->mask |= CDC_MRW; + + if (mrw_write) + cdi->mask &= ~CDC_MRW_W; + else + cdi->mask |= CDC_MRW_W; + + if (ram_write) + cdi->mask &= ~CDC_RAM; + else + cdi->mask |= CDC_RAM; +} +EXPORT_SYMBOL(cdrom_probe_write_features); + static int cdrom_media_erasable(struct cdrom_device_info *cdi) { disc_information di; @@ -894,33 +942,8 @@ static int cdrom_is_dvd_rw(struct cdrom_device_info *cdi) */ static int cdrom_open_write(struct cdrom_device_info *cdi) { - int mrw, mrw_write, ram_write; int ret = 1; - mrw = 0; - if (!cdrom_is_mrw(cdi, &mrw_write)) - mrw = 1; - - if (CDROM_CAN(CDC_MO_DRIVE)) - ram_write = 1; - else - (void) cdrom_is_random_writable(cdi, &ram_write); - - if (mrw) - cdi->mask &= ~CDC_MRW; - else - cdi->mask |= CDC_MRW; - - if (mrw_write) - cdi->mask &= ~CDC_MRW_W; - else - cdi->mask |= CDC_MRW_W; - - if (ram_write) - cdi->mask &= ~CDC_RAM; - else - cdi->mask |= CDC_RAM; - if (CDROM_CAN(CDC_MRW_W)) ret = cdrom_mrw_open_write(cdi); else if (CDROM_CAN(CDC_DVD_RAM)) diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 7adb2573f50d..c36c54ecd354 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -395,7 +395,7 @@ static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt) switch (req_op(rq)) { case REQ_OP_WRITE: - if (!cd->writeable) + if (get_disk_ro(cd->disk)) goto out; SCpnt->cmnd[0] = WRITE_10; cd->cdi.media_written = 1; @@ -681,6 +681,7 @@ static int sr_probe(struct scsi_device *sdev) error = -ENOMEM; if (get_capabilities(cd)) goto fail_minor; + cdrom_probe_write_features(&cd->cdi); sr_vendor_init(cd); set_capacity(disk, cd->capacity); @@ -899,14 +900,6 @@ static int get_capabilities(struct scsi_cd *cd) /*else I don't think it can close its tray cd->cdi.mask |= CDC_CLOSE_TRAY; */ - /* - * if DVD-RAM, MRW-W or CD-RW, we are randomly writable - */ - if ((cd->cdi.mask & (CDC_DVD_RAM | CDC_MRW_W | CDC_RAM | CDC_CD_RW)) != - (CDC_DVD_RAM | CDC_MRW_W | CDC_RAM | CDC_CD_RW)) { - cd->writeable = 1; - } - kfree(buffer); return 0; } diff --git a/drivers/scsi/sr.h b/drivers/scsi/sr.h index dc899277b3a4..2d92f9cb6fec 100644 --- a/drivers/scsi/sr.h +++ b/drivers/scsi/sr.h @@ -35,7 +35,6 @@ typedef struct scsi_cd { struct scsi_device *device; unsigned int vendor; /* vendor code, see sr_vendor.c */ unsigned long ms_offset; /* for reading multisession-CD's */ - unsigned writeable : 1; unsigned use:1; /* is this device still supportable */ unsigned xa_flag:1; /* CD has XA sectors ? */ unsigned readcd_known:1; /* drive supports READ_CD (0xbe) */ diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index b907e6c2307d..260d7968cf72 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -108,6 +108,7 @@ int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev, extern unsigned int cdrom_check_events(struct cdrom_device_info *cdi, unsigned int clearing); +extern void cdrom_probe_write_features(struct cdrom_device_info *cdi); extern int register_cdrom(struct gendisk *disk, struct cdrom_device_info *cdi); extern void unregister_cdrom(struct cdrom_device_info *cdi); -- cgit v1.2.3 From b3b6babf47517fde6b6de2493dea28e8831b9347 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 23 Apr 2026 05:34:54 +0000 Subject: ipmr: Free mr_table after RCU grace period. With CONFIG_IP_MROUTE_MULTIPLE_TABLES=n, ipmr_fib_lookup() does not check if net->ipv4.mrt is NULL. Since default_device_exit_batch() is called after ->exit_rtnl(), a device could receive IGMP packets and access net->ipv4.mrt during/after ipmr_rules_exit_rtnl(). If ipmr_rules_exit_rtnl() had already cleared it and freed the memory, the access would trigger null-ptr-deref or use-after-free. Let's fix it by using RCU helper and free mrt after RCU grace period. In addition, check_net(net) is added to mroute_clean_tables() and ipmr_cache_unresolved() to synchronise via mfc_unres_lock. This prevents ipmr_cache_unresolved() from putting skb into c->_c.mfc_un.unres.unresolved after mroute_clean_tables() purges it. For the same reason, timer_shutdown_sync() is moved after mroute_clean_tables(). Since rhltable_destroy() holds mutex internally, rcu_work is used, and it is placed as the first member because rcu_head must be placed within <4K offset. mr_table is alraedy 3864 bytes without rcu_work. Note that IP6MR is not yet converted to ->exit_rtnl(), so this change is not needed for now but will be. Fixes: b22b01867406 ("ipmr: Convert ipmr_net_exit_batch() to ->exit_rtnl().") Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20260423053456.4097409-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/mroute_base.h | 3 ++ net/ipv4/ipmr.c | 108 ++++++++++++++++++++++++-------------------- net/ipv4/ipmr_base.c | 16 +++++++ 3 files changed, 77 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index cf3374580f74..5d75cc5b057e 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -226,6 +226,7 @@ struct mr_table_ops { /** * struct mr_table - a multicast routing table + * @work: used for table destruction * @list: entry within a list of multicast routing tables * @net: net where this table belongs * @ops: protocol specific operations @@ -243,6 +244,7 @@ struct mr_table_ops { * @mroute_reg_vif_num: PIM-device vif index */ struct mr_table { + struct rcu_work work; struct list_head list; possible_net_t net; struct mr_table_ops ops; @@ -274,6 +276,7 @@ void vif_device_init(struct vif_device *v, unsigned short flags, unsigned short get_iflink_mask); +void mr_table_free(struct mr_table *mrt); struct mr_table * mr_table_alloc(struct net *net, u32 id, struct mr_table_ops *ops, diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 8a08d09b4c30..2058ca860294 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -151,16 +151,6 @@ static struct mr_table *__ipmr_get_table(struct net *net, u32 id) return NULL; } -static struct mr_table *ipmr_get_table(struct net *net, u32 id) -{ - struct mr_table *mrt; - - rcu_read_lock(); - mrt = __ipmr_get_table(net, id); - rcu_read_unlock(); - return mrt; -} - static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { @@ -293,7 +283,7 @@ static void __net_exit ipmr_rules_exit_rtnl(struct net *net, struct mr_table *mrt, *next; list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { - list_del(&mrt->list); + list_del_rcu(&mrt->list); ipmr_free_table(mrt, dev_kill_list); } } @@ -315,28 +305,30 @@ bool ipmr_rule_default(const struct fib_rule *rule) } EXPORT_SYMBOL(ipmr_rule_default); #else -#define ipmr_for_each_table(mrt, net) \ - for (mrt = net->ipv4.mrt; mrt; mrt = NULL) - static struct mr_table *ipmr_mr_table_iter(struct net *net, struct mr_table *mrt) { if (!mrt) - return net->ipv4.mrt; + return rcu_dereference(net->ipv4.mrt); return NULL; } -static struct mr_table *ipmr_get_table(struct net *net, u32 id) +static struct mr_table *__ipmr_get_table(struct net *net, u32 id) { - return net->ipv4.mrt; + return rcu_dereference_check(net->ipv4.mrt, + lockdep_rtnl_is_held() || + !rcu_access_pointer(net->ipv4.mrt)); } -#define __ipmr_get_table ipmr_get_table +#define ipmr_for_each_table(mrt, net) \ + for (mrt = __ipmr_get_table(net, 0); mrt; mrt = NULL) static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4, struct mr_table **mrt) { - *mrt = net->ipv4.mrt; + *mrt = rcu_dereference(net->ipv4.mrt); + if (!*mrt) + return -EAGAIN; return 0; } @@ -347,7 +339,8 @@ static int __net_init ipmr_rules_init(struct net *net) mrt = ipmr_new_table(net, RT_TABLE_DEFAULT); if (IS_ERR(mrt)) return PTR_ERR(mrt); - net->ipv4.mrt = mrt; + + rcu_assign_pointer(net->ipv4.mrt, mrt); return 0; } @@ -358,9 +351,10 @@ static void __net_exit ipmr_rules_exit(struct net *net) static void __net_exit ipmr_rules_exit_rtnl(struct net *net, struct list_head *dev_kill_list) { - ipmr_free_table(net->ipv4.mrt, dev_kill_list); + struct mr_table *mrt = rcu_dereference_protected(net->ipv4.mrt, 1); - net->ipv4.mrt = NULL; + RCU_INIT_POINTER(net->ipv4.mrt, NULL); + ipmr_free_table(mrt, dev_kill_list); } static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, @@ -381,6 +375,17 @@ bool ipmr_rule_default(const struct fib_rule *rule) EXPORT_SYMBOL(ipmr_rule_default); #endif +static struct mr_table *ipmr_get_table(struct net *net, u32 id) +{ + struct mr_table *mrt; + + rcu_read_lock(); + mrt = __ipmr_get_table(net, id); + rcu_read_unlock(); + + return mrt; +} + static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { @@ -441,12 +446,11 @@ static void ipmr_free_table(struct mr_table *mrt, struct list_head *dev_kill_lis WARN_ON_ONCE(!mr_can_free_table(net)); - timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC, &ipmr_dev_kill_list); - rhltable_destroy(&mrt->mfc_hash); - kfree(mrt); + timer_shutdown_sync(&mrt->ipmr_expire_timer); + mr_table_free(mrt); WARN_ON_ONCE(!net_initialized(net) && !list_empty(&ipmr_dev_kill_list)); list_splice(&ipmr_dev_kill_list, dev_kill_list); @@ -1135,12 +1139,19 @@ static int ipmr_cache_report(const struct mr_table *mrt, static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb, struct net_device *dev) { + struct net *net = read_pnet(&mrt->net); const struct iphdr *iph = ip_hdr(skb); - struct mfc_cache *c; + struct mfc_cache *c = NULL; bool found = false; int err; spin_lock_bh(&mfc_unres_lock); + + if (!check_net(net)) { + err = -EINVAL; + goto err; + } + list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (c->mfc_mcastgrp == iph->daddr && c->mfc_origin == iph->saddr) { @@ -1153,10 +1164,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, /* Create a new entry if allowable */ c = ipmr_cache_alloc_unres(); if (!c) { - spin_unlock_bh(&mfc_unres_lock); - - kfree_skb(skb); - return -ENOBUFS; + err = -ENOBUFS; + goto err; } /* Fill in the new cache entry */ @@ -1166,17 +1175,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, /* Reflect first query at mrouted. */ err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); - - if (err < 0) { - /* If the report failed throw the cache entry - out - Brad Parker - */ - spin_unlock_bh(&mfc_unres_lock); - - ipmr_cache_free(c); - kfree_skb(skb); - return err; - } + if (err < 0) + goto err; atomic_inc(&mrt->cache_resolve_queue_len); list_add(&c->_c.list, &mrt->mfc_unres_queue); @@ -1189,18 +1189,26 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, /* See if we can append the packet */ if (c->_c.mfc_un.unres.unresolved.qlen > 3) { - kfree_skb(skb); + c = NULL; err = -ENOBUFS; - } else { - if (dev) { - skb->dev = dev; - skb->skb_iif = dev->ifindex; - } - skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); - err = 0; + goto err; + } + + if (dev) { + skb->dev = dev; + skb->skb_iif = dev->ifindex; } + skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); + spin_unlock_bh(&mfc_unres_lock); + return 0; + +err: + spin_unlock_bh(&mfc_unres_lock); + if (c) + ipmr_cache_free(c); + kfree_skb(skb); return err; } @@ -1346,7 +1354,7 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags, } if (flags & MRT_FLUSH_MFC) { - if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { + if (atomic_read(&mrt->cache_resolve_queue_len) != 0 || !check_net(net)) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 37a3c144276c..3930d612c3de 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -28,6 +28,20 @@ void vif_device_init(struct vif_device *v, v->link = dev->ifindex; } +static void __mr_free_table(struct work_struct *work) +{ + struct mr_table *mrt = container_of(to_rcu_work(work), + struct mr_table, work); + + rhltable_destroy(&mrt->mfc_hash); + kfree(mrt); +} + +void mr_table_free(struct mr_table *mrt) +{ + queue_rcu_work(system_unbound_wq, &mrt->work); +} + struct mr_table * mr_table_alloc(struct net *net, u32 id, struct mr_table_ops *ops, @@ -50,6 +64,8 @@ mr_table_alloc(struct net *net, u32 id, kfree(mrt); return ERR_PTR(err); } + + INIT_RCU_WORK(&mrt->work, __mr_free_table); INIT_LIST_HEAD(&mrt->mfc_cache_list); INIT_LIST_HEAD(&mrt->mfc_unres_queue); -- cgit v1.2.3 From 5ec07d5204b4544271f32f6261ee097fe53cb081 Mon Sep 17 00:00:00 2001 From: Sheng Che Peng Date: Wed, 22 Apr 2026 10:18:19 +0800 Subject: tracepoint: Fix typo in tracepoint.h comment Change "my" to "may" in the description of subsystem configurations. Link: https://patch.msgid.link/20260422021819.1788091-1-synte4028@gmail.com Signed-off-by: Sheng Che Peng Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 578e520b6ee6..763eea4d80d8 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -202,7 +202,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define TP_CONDITION(args...) args /* - * Individual subsystem my have a separate configuration to + * Individual subsystem may have a separate configuration to * enable their tracepoints. By default, this file will create * the tracepoints if CONFIG_TRACEPOINTS is defined. If a subsystem * wants to be able to disable its tracepoints from being created -- cgit v1.2.3 From 927011b65a875302d08709bbe82eaf4d0d96c5d5 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 27 Apr 2026 22:49:41 -0400 Subject: drm/amdgpu: fix build for CONFIG_DRM_FBDEV_EMULATION=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The merge-commit 02e778f12359 ("Merge tag 'amd-drm-next-7.1-2026-03-12' of https://gitlab.freedesktop.org/agd5f/linux into drm-next") removes the stub for drm_fb_helper_gem_is_fb(), so the buld gets broken if DRM_FBDEV_EMULATION is not set. ‘drm_fb_helper_gem_is_fb’; did you mean ‘drm_fb_helper_from_client’? [-Wimplicit-function-declaration] 1777 | if (!drm_fb_helper_gem_is_fb(dev->fb_helper, fb->obj[0])) { | ^~~~~~~~~~~~~~~~~~~~~~~ | drm_fb_helper_from_client Restore it. Fixes: 02e778f12359 ("Merge tag 'amd-drm-next-7.1-2026-03-12' of https://gitlab.freedesktop.org/agd5f/linux into drm-next") Reviewed-by: Thomas Zimmermann Signed-off-by: Yury Norov Signed-off-by: Alex Deucher (cherry picked from commit 7b81bc38e92c2522484c42671401eaa023ae8831) --- include/drm/drm_fb_helper.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/drm/drm_fb_helper.h b/include/drm/drm_fb_helper.h index bf391903443d..0c5e5ed7b5e7 100644 --- a/include/drm/drm_fb_helper.h +++ b/include/drm/drm_fb_helper.h @@ -273,6 +273,12 @@ int drm_fb_helper_hotplug_event(struct drm_fb_helper *fb_helper); int drm_fb_helper_initial_config(struct drm_fb_helper *fb_helper); bool drm_fb_helper_gem_is_fb(const struct drm_fb_helper *fb_helper, const struct drm_gem_object *obj); +#else +static inline bool drm_fb_helper_gem_is_fb(const struct drm_fb_helper *fb_helper, + const struct drm_gem_object *obj) +{ + return false; +} #endif #endif -- cgit v1.2.3 From 7deba791ad495ce1d7921683f4f7d1190fa210d1 Mon Sep 17 00:00:00 2001 From: Martin Michaelis Date: Thu, 23 Apr 2026 15:54:11 -0600 Subject: io_uring/kbuf: support min length left for incremental buffers Incrementally consumed buffer rings are generally fully consumed, but it's quite possible that the application has a minimum size it needs to meet to avoid truncation. Currently that minimum limit is 1 byte, but this should be a setting that is the hands of the application. For recvmsg multishot, a prime use case for incrementally consumed buffers, the application may get spurious -EFAULT returned at the end of an incrementally consumed buffer, as less space is available than the headers need. Grab a u32 field in struct io_uring_buf_reg, which the application can use to inform the kernel of the minimum size that should be available in an incrementally consumed buffer. If less than that is available, the current buffer is fully processed and the next one will be picked. Cc: stable@vger.kernel.org Fixes: ae98dbf43d75 ("io_uring/kbuf: add support for incremental buffer consumption") Link: https://github.com/axboe/liburing/issues/1433 Signed-off-by: Martin Michaelis [axboe: write commit message, change io_buffer_list member name] Reviewed-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 3 ++- io_uring/kbuf.c | 8 +++++++- io_uring/kbuf.h | 7 +++++++ 3 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 17ac1b785440..909fb7aea638 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -905,7 +905,8 @@ struct io_uring_buf_reg { __u32 ring_entries; __u16 bgid; __u16 flags; - __u64 resv[3]; + __u32 min_left; + __u32 resv[5]; }; /* argument for IORING_REGISTER_PBUF_STATUS */ diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 43e4f8615fe8..63061aa1cab9 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -47,7 +47,7 @@ static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len) this_len = min_t(u32, len, buf_len); buf_len -= this_len; /* Stop looping for invalid buffer length of 0 */ - if (buf_len || !this_len) { + if (buf_len > bl->min_left_sub_one || !this_len) { WRITE_ONCE(buf->addr, READ_ONCE(buf->addr) + this_len); WRITE_ONCE(buf->len, buf_len); return false; @@ -637,6 +637,10 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (reg.ring_entries >= 65536) return -EINVAL; + /* minimum left byte count is a property of incremental buffers */ + if (!(reg.flags & IOU_PBUF_RING_INC) && reg.min_left) + return -EINVAL; + bl = io_buffer_get_list(ctx, reg.bgid); if (bl) { /* if mapped buffer ring OR classic exists, don't allow */ @@ -683,6 +687,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) bl->mask = reg.ring_entries - 1; bl->flags |= IOBL_BUF_RING; bl->buf_ring = br; + if (reg.min_left) + bl->min_left_sub_one = reg.min_left - 1; if (reg.flags & IOU_PBUF_RING_INC) bl->flags |= IOBL_INC; ret = io_buffer_add_list(ctx, bl, reg.bgid); diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index abf7052b556e..401773e1ef80 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -32,6 +32,13 @@ struct io_buffer_list { __u16 flags; + /* + * minimum required amount to be left to reuse an incrementally + * consumed buffer. If less than this is left at consumption time, + * buffer is done and head is incremented to the next buffer. + */ + __u32 min_left_sub_one; + struct io_mapped_region region; }; -- cgit v1.2.3 From 735a309b4bfb9e1e26636ff4a3e8a146f53c54f9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 27 Apr 2026 19:53:20 -0700 Subject: net: add net_iov_init() and use it to initialize ->page_type Commit db359fccf212 ("mm: introduce a new page type for page pool in page type") added a page_type field to struct net_iov at the same offset as struct page::page_type, so that page_pool_set_pp_info() can call __SetPageNetpp() uniformly on both pages and net_iovs. The page-type API requires the field to hold the UINT_MAX "no type" sentinel before a type can be set; for real struct page that invariant is established by the page allocator on free. struct net_iov is not allocated through the page allocator, so the field is left as zero (io_uring zcrx, which uses __GFP_ZERO) or as slab garbage (devmem, which uses kvmalloc_objs() without zeroing). When the page pool then calls page_pool_set_pp_info() on a freshly-bound niov, __SetPageNetpp()'s VM_BUG_ON_PAGE(page->page_type != UINT_MAX) fires and the kernel BUGs. Triggered in selftests by io_uring zcrx setup through the fbnic queue restart path: kernel BUG at ./include/linux/page-flags.h:1062! RIP: 0010:page_pool_set_pp_info (./include/linux/page-flags.h:1062 net/core/page_pool.c:716) Call Trace: net_mp_niov_set_page_pool (net/core/page_pool.c:1360) io_pp_zc_alloc_netmems (io_uring/zcrx.c:1089 io_uring/zcrx.c:1110) fbnic_fill_bdq (./include/net/page_pool/helpers.h:160 drivers/net/ethernet/meta/fbnic/fbnic_txrx.c:906) __fbnic_nv_restart (drivers/net/ethernet/meta/fbnic/fbnic_txrx.c:2470 drivers/net/ethernet/meta/fbnic/fbnic_txrx.c:2874) fbnic_queue_start (drivers/net/ethernet/meta/fbnic/fbnic_txrx.c:2903) netdev_rx_queue_reconfig (net/core/netdev_rx_queue.c:137) __netif_mp_open_rxq (net/core/netdev_rx_queue.c:234) io_register_zcrx (io_uring/zcrx.c:818 io_uring/zcrx.c:903) __io_uring_register (io_uring/register.c:931) __do_sys_io_uring_register (io_uring/register.c:1029) do_syscall_64 (arch/x86/entry/syscall_64.c:63 arch/x86/entry/syscall_64.c:94) The same path is reachable through devmem dmabuf binding via netdev_nl_bind_rx_doit() -> net_devmem_bind_dmabuf_to_queue(). Add a net_iov_init() helper that stamps ->owner, ->type and the ->page_type sentinel, and use it from both the devmem and io_uring zcrx niov init loops. Fixes: db359fccf212 ("mm: introduce a new page type for page pool in page type") Acked-by: Vlastimil Babka (SUSE) Acked-by: Byungchul Park Reviewed-by: Jens Axboe Acked-by: Pavel Begunkov Link: https://patch.msgid.link/20260428025320.853452-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/netmem.h | 15 +++++++++++++++ io_uring/zcrx.c | 3 +-- net/core/devmem.c | 3 +-- 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netmem.h b/include/net/netmem.h index 507b74c9f52d..78fe51e5756b 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -127,6 +127,21 @@ static inline unsigned int net_iov_idx(const struct net_iov *niov) return niov - net_iov_owner(niov)->niovs; } +/* Initialize a niov: stamp the owning area, the memory provider type, + * and the page_type "no type" sentinel expected by the page-type API + * (see PAGE_TYPE_OPS in ) so that + * page_pool_set_pp_info() can later call __SetPageNetpp() on a niov + * cast to struct page. + */ +static inline void net_iov_init(struct net_iov *niov, + struct net_iov_area *owner, + enum net_iov_type type) +{ + niov->owner = owner; + niov->type = type; + niov->page_type = UINT_MAX; +} + /* netmem */ /** diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 7b93c87b8371..19837e0b5e91 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -495,10 +495,9 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, for (i = 0; i < nr_iovs; i++) { struct net_iov *niov = &area->nia.niovs[i]; - niov->owner = &area->nia; + net_iov_init(niov, &area->nia, NET_IOV_IOURING); area->freelist[i] = i; atomic_set(&area->user_refs[i], 0); - niov->type = NET_IOV_IOURING; } if (ifq->dev) { diff --git a/net/core/devmem.c b/net/core/devmem.c index cde4c89bc146..468344739db2 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -297,8 +297,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, for (i = 0; i < owner->area.num_niovs; i++) { niov = &owner->area.niovs[i]; - niov->type = NET_IOV_DMABUF; - niov->owner = &owner->area; + net_iov_init(niov, &owner->area, NET_IOV_DMABUF); page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); if (direction == DMA_TO_DEVICE) -- cgit v1.2.3 From c4f050ce06c56cfb5993268af4a5cb66ed1cd04e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Apr 2026 12:32:07 +0000 Subject: bonding: 3ad: implement proper RCU rules for port->aggregator syzbot found a data-race in bond_3ad_get_active_agg_info / bond_3ad_state_machine_handler [1] which hints at lack of proper RCU implementation. Add __rcu qualifier to port->aggregator, and add proper RCU API. [1] BUG: KCSAN: data-race in bond_3ad_get_active_agg_info / bond_3ad_state_machine_handler write to 0xffff88813cf5c4b0 of 8 bytes by task 36 on cpu 0: ad_port_selection_logic drivers/net/bonding/bond_3ad.c:1659 [inline] bond_3ad_state_machine_handler+0x9d5/0x2d60 drivers/net/bonding/bond_3ad.c:2569 process_one_work kernel/workqueue.c:3302 [inline] process_scheduled_works+0x4f0/0x9c0 kernel/workqueue.c:3385 worker_thread+0x58a/0x780 kernel/workqueue.c:3466 kthread+0x22a/0x280 kernel/kthread.c:436 ret_from_fork+0x146/0x330 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 read to 0xffff88813cf5c4b0 of 8 bytes by task 22063 on cpu 1: __bond_3ad_get_active_agg_info drivers/net/bonding/bond_3ad.c:2858 [inline] bond_3ad_get_active_agg_info+0x8c/0x230 drivers/net/bonding/bond_3ad.c:2881 bond_fill_info+0xe0f/0x10f0 drivers/net/bonding/bond_netlink.c:853 rtnl_link_info_fill net/core/rtnetlink.c:906 [inline] rtnl_link_fill+0x1d7/0x4e0 net/core/rtnetlink.c:927 rtnl_fill_ifinfo+0xf8e/0x1380 net/core/rtnetlink.c:2168 rtmsg_ifinfo_build_skb+0x11c/0x1b0 net/core/rtnetlink.c:4453 rtmsg_ifinfo_event net/core/rtnetlink.c:4486 [inline] rtmsg_ifinfo+0x6d/0x110 net/core/rtnetlink.c:4495 __dev_notify_flags+0x76/0x390 net/core/dev.c:9790 netif_change_flags+0xac/0xd0 net/core/dev.c:9823 do_setlink+0x905/0x2950 net/core/rtnetlink.c:3180 rtnl_group_changelink net/core/rtnetlink.c:3813 [inline] __rtnl_newlink net/core/rtnetlink.c:3981 [inline] rtnl_newlink+0xf55/0x1400 net/core/rtnetlink.c:4109 rtnetlink_rcv_msg+0x64b/0x720 net/core/rtnetlink.c:6995 netlink_rcv_skb+0x123/0x220 net/netlink/af_netlink.c:2550 rtnetlink_rcv+0x1c/0x30 net/core/rtnetlink.c:7022 netlink_unicast_kernel net/netlink/af_netlink.c:1318 [inline] netlink_unicast+0x5a8/0x680 net/netlink/af_netlink.c:1344 netlink_sendmsg+0x5c8/0x6f0 net/netlink/af_netlink.c:1894 sock_sendmsg_nosec net/socket.c:787 [inline] __sock_sendmsg net/socket.c:802 [inline] ____sys_sendmsg+0x563/0x5b0 net/socket.c:2698 ___sys_sendmsg+0x195/0x1e0 net/socket.c:2752 __sys_sendmsg net/socket.c:2784 [inline] __do_sys_sendmsg net/socket.c:2789 [inline] __se_sys_sendmsg net/socket.c:2787 [inline] __x64_sys_sendmsg+0xd4/0x160 net/socket.c:2787 x64_sys_call+0x194c/0x3020 arch/x86/include/generated/asm/syscalls_64.h:47 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x12c/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f value changed: 0x0000000000000000 -> 0xffff88813cf5c400 Reported by Kernel Concurrency Sanitizer on: CPU: 1 UID: 0 PID: 22063 Comm: syz.0.31122 Tainted: G W syzkaller #0 PREEMPT(full) Tainted: [W]=WARN Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/18/2026 Fixes: 47e91f56008b ("bonding: use RCU protection for 3ad xmit path") Reported-by: syzbot+9bb2ff2a4ab9e17307e1@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/69f0a82f.050a0220.3aadc4.0000.GAE@google.com/ Signed-off-by: Eric Dumazet Cc: Jay Vosburgh Cc: Andrew Lunn Link: https://patch.msgid.link/20260428123207.3809211-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_3ad.c | 109 ++++++++++++++++++--------------- drivers/net/bonding/bond_main.c | 8 ++- drivers/net/bonding/bond_netlink.c | 16 +++-- drivers/net/bonding/bond_procfs.c | 3 +- drivers/net/bonding/bond_sysfs_slave.c | 17 +++-- include/net/bond_3ad.h | 2 +- 6 files changed, 89 insertions(+), 66 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index af7f74cfdc08..f0aa7d2f2171 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -1029,6 +1029,7 @@ static void ad_cond_set_peer_notif(struct port *port) static void ad_mux_machine(struct port *port, bool *update_slave_arr) { struct bonding *bond = __get_bond_by_port(port); + struct aggregator *aggregator; mux_states_t last_state; /* keep current State Machine state to compare later if it was @@ -1036,6 +1037,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) */ last_state = port->sm_mux_state; + aggregator = rcu_dereference(port->aggregator); if (port->sm_vars & AD_PORT_BEGIN) { port->sm_mux_state = AD_MUX_DETACHED; } else { @@ -1055,7 +1057,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) * cycle to update ready variable, we check * READY_N and update READY here */ - __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); + __set_agg_ports_ready(aggregator, __agg_ports_are_ready(aggregator)); port->sm_mux_state = AD_MUX_DETACHED; break; } @@ -1070,7 +1072,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) * update ready variable, we check READY_N and update * READY here */ - __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); + __set_agg_ports_ready(aggregator, __agg_ports_are_ready(aggregator)); /* if the wait_while_timer expired, and the port is * in READY state, move to ATTACHED state @@ -1086,7 +1088,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) if ((port->sm_vars & AD_PORT_SELECTED) && (port->partner_oper.port_state & LACP_STATE_SYNCHRONIZATION) && !__check_agg_selection_timer(port)) { - if (port->aggregator->is_active) { + if (aggregator->is_active) { int state = AD_MUX_COLLECTING_DISTRIBUTING; if (!bond->params.coupled_control) @@ -1102,9 +1104,9 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) * cycle to update ready variable, we check * READY_N and update READY here */ - __set_agg_ports_ready(port->aggregator, __agg_ports_are_ready(port->aggregator)); + __set_agg_ports_ready(aggregator, __agg_ports_are_ready(aggregator)); port->sm_mux_state = AD_MUX_DETACHED; - } else if (port->aggregator->is_active) { + } else if (aggregator->is_active) { port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; } @@ -1115,7 +1117,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) * sure that a collecting distributing * port in an active aggregator is enabled */ - if (port->aggregator->is_active && + if (aggregator->is_active && !__port_is_collecting_distributing(port)) { __enable_port(port); *update_slave_arr = true; @@ -1134,7 +1136,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) */ struct slave *slave = port->slave; - if (port->aggregator->is_active && + if (aggregator->is_active && bond_is_slave_rx_disabled(slave)) { ad_enable_collecting(port); *update_slave_arr = true; @@ -1154,8 +1156,8 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) * sure that a collecting distributing * port in an active aggregator is enabled */ - if (port->aggregator && - port->aggregator->is_active && + if (aggregator && + aggregator->is_active && !__port_is_collecting_distributing(port)) { __enable_port(port); *update_slave_arr = true; @@ -1187,7 +1189,7 @@ static void ad_mux_machine(struct port *port, bool *update_slave_arr) port->sm_mux_timer_counter = __ad_timer_to_ticks(AD_WAIT_WHILE_TIMER, 0); break; case AD_MUX_ATTACHED: - if (port->aggregator->is_active) + if (aggregator->is_active) port->actor_oper_port_state |= LACP_STATE_SYNCHRONIZATION; else @@ -1561,9 +1563,9 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) bond = __get_bond_by_port(port); /* if the port is connected to other aggregator, detach it */ - if (port->aggregator) { + temp_aggregator = rcu_dereference(port->aggregator); + if (temp_aggregator) { /* detach the port from its former aggregator */ - temp_aggregator = port->aggregator; for (curr_port = temp_aggregator->lag_ports; curr_port; last_port = curr_port, curr_port = curr_port->next_port_in_aggregator) { @@ -1586,7 +1588,7 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) /* clear the port's relations to this * aggregator */ - port->aggregator = NULL; + RCU_INIT_POINTER(port->aggregator, NULL); port->next_port_in_aggregator = NULL; port->actor_port_aggregator_identifier = 0; @@ -1609,7 +1611,7 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) port->slave->bond->dev->name, port->slave->dev->name, port->actor_port_number, - port->aggregator->aggregator_identifier); + temp_aggregator->aggregator_identifier); } } /* search on all aggregators for a suitable aggregator for this port */ @@ -1633,15 +1635,15 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) ) ) { /* attach to the founded aggregator */ - port->aggregator = aggregator; + rcu_assign_pointer(port->aggregator, aggregator); port->actor_port_aggregator_identifier = - port->aggregator->aggregator_identifier; + aggregator->aggregator_identifier; port->next_port_in_aggregator = aggregator->lag_ports; - port->aggregator->num_of_ports++; + aggregator->num_of_ports++; aggregator->lag_ports = port; slave_dbg(bond->dev, slave->dev, "Port %d joined LAG %d (existing LAG)\n", port->actor_port_number, - port->aggregator->aggregator_identifier); + aggregator->aggregator_identifier); /* mark this port as selected */ port->sm_vars |= AD_PORT_SELECTED; @@ -1656,39 +1658,40 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) if (!found) { if (free_aggregator) { /* assign port a new aggregator */ - port->aggregator = free_aggregator; port->actor_port_aggregator_identifier = - port->aggregator->aggregator_identifier; + free_aggregator->aggregator_identifier; /* update the new aggregator's parameters * if port was responsed from the end-user */ if (port->actor_oper_port_key & AD_DUPLEX_KEY_MASKS) /* if port is full duplex */ - port->aggregator->is_individual = false; + free_aggregator->is_individual = false; else - port->aggregator->is_individual = true; + free_aggregator->is_individual = true; - port->aggregator->actor_admin_aggregator_key = + free_aggregator->actor_admin_aggregator_key = port->actor_admin_port_key; - port->aggregator->actor_oper_aggregator_key = + free_aggregator->actor_oper_aggregator_key = port->actor_oper_port_key; - port->aggregator->partner_system = + free_aggregator->partner_system = port->partner_oper.system; - port->aggregator->partner_system_priority = + free_aggregator->partner_system_priority = port->partner_oper.system_priority; - port->aggregator->partner_oper_aggregator_key = port->partner_oper.key; - port->aggregator->receive_state = 1; - port->aggregator->transmit_state = 1; - port->aggregator->lag_ports = port; - port->aggregator->num_of_ports++; + free_aggregator->partner_oper_aggregator_key = port->partner_oper.key; + free_aggregator->receive_state = 1; + free_aggregator->transmit_state = 1; + free_aggregator->lag_ports = port; + free_aggregator->num_of_ports++; + + rcu_assign_pointer(port->aggregator, free_aggregator); /* mark this port as selected */ port->sm_vars |= AD_PORT_SELECTED; slave_dbg(bond->dev, port->slave->dev, "Port %d joined LAG %d (new LAG)\n", port->actor_port_number, - port->aggregator->aggregator_identifier); + free_aggregator->aggregator_identifier); } else { slave_err(bond->dev, port->slave->dev, "Port %d did not find a suitable aggregator\n", @@ -1700,13 +1703,12 @@ static void ad_port_selection_logic(struct port *port, bool *update_slave_arr) * in all aggregator's ports, else set ready=FALSE in all * aggregator's ports */ - __set_agg_ports_ready(port->aggregator, - __agg_ports_are_ready(port->aggregator)); + aggregator = rcu_dereference(port->aggregator); + __set_agg_ports_ready(aggregator, __agg_ports_are_ready(aggregator)); - aggregator = __get_first_agg(port); - ad_agg_selection_logic(aggregator, update_slave_arr); + ad_agg_selection_logic(__get_first_agg(port), update_slave_arr); - if (!port->aggregator->is_active) + if (!aggregator->is_active) port->actor_oper_port_state &= ~LACP_STATE_SYNCHRONIZATION; } @@ -2075,13 +2077,15 @@ static void ad_initialize_port(struct port *port, const struct bond_params *bond */ static void ad_enable_collecting(struct port *port) { - if (port->aggregator->is_active) { + struct aggregator *aggregator = rcu_dereference(port->aggregator); + + if (aggregator->is_active) { struct slave *slave = port->slave; slave_dbg(slave->bond->dev, slave->dev, "Enabling collecting on port %d (LAG %d)\n", port->actor_port_number, - port->aggregator->aggregator_identifier); + aggregator->aggregator_identifier); __enable_collecting_port(port); } } @@ -2093,11 +2097,13 @@ static void ad_enable_collecting(struct port *port) */ static void ad_disable_distributing(struct port *port, bool *update_slave_arr) { - if (port->aggregator && __agg_has_partner(port->aggregator)) { + struct aggregator *aggregator = rcu_dereference(port->aggregator); + + if (aggregator && __agg_has_partner(aggregator)) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Disabling distributing on port %d (LAG %d)\n", port->actor_port_number, - port->aggregator->aggregator_identifier); + aggregator->aggregator_identifier); __disable_distributing_port(port); /* Slave array needs an update */ *update_slave_arr = true; @@ -2114,11 +2120,13 @@ static void ad_disable_distributing(struct port *port, bool *update_slave_arr) static void ad_enable_collecting_distributing(struct port *port, bool *update_slave_arr) { - if (port->aggregator->is_active) { + struct aggregator *aggregator = rcu_dereference(port->aggregator); + + if (aggregator->is_active) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Enabling port %d (LAG %d)\n", port->actor_port_number, - port->aggregator->aggregator_identifier); + aggregator->aggregator_identifier); __enable_port(port); /* Slave array needs update */ *update_slave_arr = true; @@ -2135,11 +2143,13 @@ static void ad_enable_collecting_distributing(struct port *port, static void ad_disable_collecting_distributing(struct port *port, bool *update_slave_arr) { - if (port->aggregator && __agg_has_partner(port->aggregator)) { + struct aggregator *aggregator = rcu_dereference(port->aggregator); + + if (aggregator && __agg_has_partner(aggregator)) { slave_dbg(port->slave->bond->dev, port->slave->dev, "Disabling port %d (LAG %d)\n", port->actor_port_number, - port->aggregator->aggregator_identifier); + aggregator->aggregator_identifier); __disable_port(port); /* Slave array needs an update */ *update_slave_arr = true; @@ -2379,7 +2389,7 @@ void bond_3ad_unbind_slave(struct slave *slave) */ for (temp_port = aggregator->lag_ports; temp_port; temp_port = temp_port->next_port_in_aggregator) { - temp_port->aggregator = new_aggregator; + rcu_assign_pointer(temp_port->aggregator, new_aggregator); temp_port->actor_port_aggregator_identifier = new_aggregator->aggregator_identifier; } @@ -2848,15 +2858,16 @@ out: int __bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info) { - struct aggregator *aggregator = NULL; + struct aggregator *aggregator = NULL, *tmp; struct list_head *iter; struct slave *slave; struct port *port; bond_for_each_slave_rcu(bond, slave, iter) { port = &(SLAVE_AD_INFO(slave)->port); - if (port->aggregator && port->aggregator->is_active) { - aggregator = port->aggregator; + tmp = rcu_dereference(port->aggregator); + if (tmp && tmp->is_active) { + aggregator = tmp; break; } } diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index c7baa5c4bf40..af82a3df2c5d 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1433,7 +1433,7 @@ static void bond_poll_controller(struct net_device *bond_dev) if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct aggregator *agg = - SLAVE_AD_INFO(slave)->port.aggregator; + rcu_dereference(SLAVE_AD_INFO(slave)->port.aggregator); if (agg && agg->aggregator_identifier != ad_info.aggregator_id) @@ -5179,15 +5179,16 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) spin_unlock_bh(&bond->mode_lock); agg_id = ad_info.aggregator_id; } + rcu_read_lock(); bond_for_each_slave(bond, slave, iter) { if (skipslave == slave) continue; all_slaves->arr[all_slaves->count++] = slave; if (BOND_MODE(bond) == BOND_MODE_8023AD) { - struct aggregator *agg; + const struct aggregator *agg; - agg = SLAVE_AD_INFO(slave)->port.aggregator; + agg = rcu_dereference(SLAVE_AD_INFO(slave)->port.aggregator); if (!agg || agg->aggregator_identifier != agg_id) continue; } @@ -5199,6 +5200,7 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) usable_slaves->arr[usable_slaves->count++] = slave; } + rcu_read_unlock(); bond_set_slave_arr(bond, usable_slaves, all_slaves); return ret; diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index ea1a80e658ae..c7d3e0602c83 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -66,27 +66,29 @@ static int bond_fill_slave_info(struct sk_buff *skb, const struct port *ad_port; ad_port = &SLAVE_AD_INFO(slave)->port; - agg = SLAVE_AD_INFO(slave)->port.aggregator; + rcu_read_lock(); + agg = rcu_dereference(SLAVE_AD_INFO(slave)->port.aggregator); if (agg) { if (nla_put_u16(skb, IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, agg->aggregator_identifier)) - goto nla_put_failure; + goto nla_put_failure_rcu; if (nla_put_u8(skb, IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, ad_port->actor_oper_port_state)) - goto nla_put_failure; + goto nla_put_failure_rcu; if (nla_put_u16(skb, IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, ad_port->partner_oper.port_state)) - goto nla_put_failure; + goto nla_put_failure_rcu; if (nla_put_u8(skb, IFLA_BOND_SLAVE_AD_CHURN_ACTOR_STATE, ad_port->sm_churn_actor_state)) - goto nla_put_failure; + goto nla_put_failure_rcu; if (nla_put_u8(skb, IFLA_BOND_SLAVE_AD_CHURN_PARTNER_STATE, ad_port->sm_churn_partner_state)) - goto nla_put_failure; + goto nla_put_failure_rcu; } + rcu_read_unlock(); if (nla_put_u16(skb, IFLA_BOND_SLAVE_ACTOR_PORT_PRIO, SLAVE_AD_INFO(slave)->port_priority)) @@ -95,6 +97,8 @@ static int bond_fill_slave_info(struct sk_buff *skb, return 0; +nla_put_failure_rcu: + rcu_read_unlock(); nla_put_failure: return -EMSGSIZE; } diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c index e34f80305191..3714aab1a3d9 100644 --- a/drivers/net/bonding/bond_procfs.c +++ b/drivers/net/bonding/bond_procfs.c @@ -188,6 +188,7 @@ static void bond_info_show_master(struct seq_file *seq) } } +/* Note: runs under rcu_read_lock() */ static void bond_info_show_slave(struct seq_file *seq, const struct slave *slave) { @@ -214,7 +215,7 @@ static void bond_info_show_slave(struct seq_file *seq, if (BOND_MODE(bond) == BOND_MODE_8023AD) { const struct port *port = &SLAVE_AD_INFO(slave)->port; - const struct aggregator *agg = port->aggregator; + const struct aggregator *agg = rcu_dereference(port->aggregator); if (agg) { seq_printf(seq, "Aggregator ID: %d\n", diff --git a/drivers/net/bonding/bond_sysfs_slave.c b/drivers/net/bonding/bond_sysfs_slave.c index 36d0e8440b5b..fc6fe7181789 100644 --- a/drivers/net/bonding/bond_sysfs_slave.c +++ b/drivers/net/bonding/bond_sysfs_slave.c @@ -62,10 +62,15 @@ static ssize_t ad_aggregator_id_show(struct slave *slave, char *buf) const struct aggregator *agg; if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { - agg = SLAVE_AD_INFO(slave)->port.aggregator; - if (agg) - return sysfs_emit(buf, "%d\n", - agg->aggregator_identifier); + rcu_read_lock(); + agg = rcu_dereference(SLAVE_AD_INFO(slave)->port.aggregator); + if (agg) { + ssize_t res = sysfs_emit(buf, "%d\n", + agg->aggregator_identifier); + rcu_read_unlock(); + return res; + } + rcu_read_unlock(); } return sysfs_emit(buf, "N/A\n"); @@ -78,7 +83,7 @@ static ssize_t ad_actor_oper_port_state_show(struct slave *slave, char *buf) if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { ad_port = &SLAVE_AD_INFO(slave)->port; - if (ad_port->aggregator) + if (rcu_access_pointer(ad_port->aggregator)) return sysfs_emit(buf, "%u\n", ad_port->actor_oper_port_state); } @@ -93,7 +98,7 @@ static ssize_t ad_partner_oper_port_state_show(struct slave *slave, char *buf) if (BOND_MODE(slave->bond) == BOND_MODE_8023AD) { ad_port = &SLAVE_AD_INFO(slave)->port; - if (ad_port->aggregator) + if (rcu_access_pointer(ad_port->aggregator)) return sysfs_emit(buf, "%u\n", ad_port->partner_oper.port_state); } diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index c92d4a976246..05572c19e14b 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -243,7 +243,7 @@ typedef struct port { churn_state_t sm_churn_actor_state; churn_state_t sm_churn_partner_state; struct slave *slave; /* pointer to the bond slave that this port belongs to */ - struct aggregator *aggregator; /* pointer to an aggregator that this port related to */ + struct aggregator __rcu *aggregator; /* pointer to an aggregator that this port related to */ struct port *next_port_in_aggregator; /* Next port on the linked list of the parent aggregator */ u32 transaction_id; /* continuous number for identification of Marker PDU's; */ struct lacpdu lacpdu; /* the lacpdu that will be sent for this port */ -- cgit v1.2.3 From 620055cb1036a6125fd912e7a14b47a6572b809b Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Mon, 27 Apr 2026 22:22:21 -0700 Subject: dpll: export __dpll_pin_change_ntf() for use under dpll_lock Export __dpll_pin_change_ntf() so that drivers can send pin change notifications from within pin callbacks, which are already called under dpll_lock. Using dpll_pin_change_ntf() in that context would deadlock. Add lockdep_assert_held() to catch misuse without the lock held. Acked-by: Vadim Fedorenko Signed-off-by: Ivan Vecera Signed-off-by: Petr Oros Tested-by: Alexander Nowlin Reviewed-by: Arkadiusz Kubalewski Signed-off-by: Jacob Keller Link: https://patch.msgid.link/20260427-jk-iwl-net-petr-oros-fixes-v1-9-cdcb48303fd8@intel.com Signed-off-by: Paolo Abeni --- drivers/dpll/dpll_netlink.c | 10 ++++++++++ drivers/dpll/dpll_netlink.h | 2 -- include/linux/dpll.h | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c index af7ce62ec55c..0ff1658c2dc1 100644 --- a/drivers/dpll/dpll_netlink.c +++ b/drivers/dpll/dpll_netlink.c @@ -900,11 +900,21 @@ int dpll_pin_delete_ntf(struct dpll_pin *pin) return dpll_pin_event_send(DPLL_CMD_PIN_DELETE_NTF, pin); } +/** + * __dpll_pin_change_ntf - notify that the pin has been changed + * @pin: registered pin pointer + * + * Context: caller must hold dpll_lock. Suitable for use inside pin + * callbacks which are already invoked under dpll_lock. + * Return: 0 if succeeds, error code otherwise. + */ int __dpll_pin_change_ntf(struct dpll_pin *pin) { + lockdep_assert_held(&dpll_lock); dpll_pin_notify(pin, DPLL_PIN_CHANGED); return dpll_pin_event_send(DPLL_CMD_PIN_CHANGE_NTF, pin); } +EXPORT_SYMBOL_GPL(__dpll_pin_change_ntf); /** * dpll_pin_change_ntf - notify that the pin has been changed diff --git a/drivers/dpll/dpll_netlink.h b/drivers/dpll/dpll_netlink.h index dd28b56d27c5..a9cfd55f57fc 100644 --- a/drivers/dpll/dpll_netlink.h +++ b/drivers/dpll/dpll_netlink.h @@ -11,5 +11,3 @@ int dpll_device_delete_ntf(struct dpll_device *dpll); int dpll_pin_create_ntf(struct dpll_pin *pin); int dpll_pin_delete_ntf(struct dpll_pin *pin); - -int __dpll_pin_change_ntf(struct dpll_pin *pin); diff --git a/include/linux/dpll.h b/include/linux/dpll.h index b7277a8b484d..f8037f1ab20b 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -286,6 +286,7 @@ int dpll_pin_ref_sync_pair_add(struct dpll_pin *pin, int dpll_device_change_ntf(struct dpll_device *dpll); +int __dpll_pin_change_ntf(struct dpll_pin *pin); int dpll_pin_change_ntf(struct dpll_pin *pin); int register_dpll_notifier(struct notifier_block *nb); -- cgit v1.2.3