From 48fcc895403cc97aa6c776cb65e6aa11290c0b44 Mon Sep 17 00:00:00 2001
From: Wei Liu <wei.liu@kernel.org>
Date: Thu, 23 Apr 2026 17:26:26 +0000
Subject: mshv: add a missing padding field
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

That was missed when importing the header.

Reported-by: Doru Blânzeanu <dblanzeanu@linux.microsoft.com>
Reported-by: Magnus Kulke <magnuskulke@linux.microsoft.com>
Fixes: e68bda71a2384 ("hyperv: Add new Hyper-V headers in include/hyperv")
Cc: stable@kernel.org
Reviewed-by: Easwar Hariharan <easwar.hariharan@linux.microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 include/hyperv/hvhdk.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h
index 5e83d3714966..0c89c62c9706 100644
--- a/include/hyperv/hvhdk.h
+++ b/include/hyperv/hvhdk.h
@@ -79,6 +79,7 @@ struct hv_vp_register_page {
 
 		u64 registers[18];
 	};
+	u8 reserved[8];
 	/* Volatile XMM registers (HV_X64_REGISTER_CLASS_XMM) */
 	union {
 		struct {
-- 
cgit v1.2.3


From c5c3ef8d49e15d2fc1cec4ad7c91d81b99977440 Mon Sep 17 00:00:00 2001
From: Michael Kelley <mhklinux@outlook.com>
Date: Tue, 17 Feb 2026 10:23:34 -0800
Subject: Drivers: hv: vmbus: Provide option to skip VMBus unload on panic

Currently, VMBus code initiates a VMBus unload in the panic path so
that if a kdump kernel is loaded, it can start fresh in setting up its
own VMBus connection. However, a driver for the VMBus virtual frame
buffer may need to flush dirty portions of the frame buffer back to
the Hyper-V host so that panic information is visible in the graphics
console. To support such flushing, provide exported functions for the
frame buffer driver to specify that the VMBus unload should not be
done by the VMBus driver, and to initiate the VMBus unload itself.
Together these allow a frame buffer driver to delay the VMBus unload
until after it has completed the flush.

Ideally, the VMBus driver could use its own panic-path callback to do
the unload after all frame buffer drivers have finished. But DRM frame
buffer drivers use the kmsg dump callback, and there are no callbacks
after that in the panic path. Hence this somewhat messy approach to
properly sequencing the frame buffer flush and the VMBus unload.

Fixes: 3671f3777758 ("drm/hyperv: Add support for drm_panic")
Signed-off-by: Michael Kelley <mhklinux@outlook.com>
Reviewed-by: Long Li <longli@microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/channel_mgmt.c |  1 +
 drivers/hv/hyperv_vmbus.h |  1 -
 drivers/hv/vmbus_drv.c    | 25 ++++++++++++++++++-------
 include/linux/hyperv.h    |  3 +++
 4 files changed, 22 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 84eb0a6a0b54..89d214dda360 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -952,6 +952,7 @@ void vmbus_initiate_unload(bool crash)
 	else
 		vmbus_wait_for_unload();
 }
+EXPORT_SYMBOL_GPL(vmbus_initiate_unload);
 
 static void vmbus_setup_channel_state(struct vmbus_channel *channel,
 				      struct vmbus_channel_offer_channel *offer)
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index 05a36854389a..eb8bdd8bb1f5 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -441,7 +441,6 @@ void hv_vss_deinit(void);
 int hv_vss_pre_suspend(void);
 int hv_vss_pre_resume(void);
 void hv_vss_onchannelcallback(void *context);
-void vmbus_initiate_unload(bool crash);
 
 static inline void hv_poll_channel(struct vmbus_channel *channel,
 				   void (*cb)(void *))
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index d28ff45d4cfd..c9eeb2ec365d 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -69,19 +69,29 @@ bool vmbus_is_confidential(void)
 }
 EXPORT_SYMBOL_GPL(vmbus_is_confidential);
 
+static bool skip_vmbus_unload;
+
+/*
+ * Allow a VMBus framebuffer driver to specify that in the case of a panic,
+ * it will do the VMBus unload operation once it has flushed any dirty
+ * portions of the framebuffer to the Hyper-V host.
+ */
+void vmbus_set_skip_unload(bool skip)
+{
+	skip_vmbus_unload = skip;
+}
+EXPORT_SYMBOL_GPL(vmbus_set_skip_unload);
+
 /*
  * The panic notifier below is responsible solely for unloading the
  * vmbus connection, which is necessary in a panic event.
- *
- * Notice an intrincate relation of this notifier with Hyper-V
- * framebuffer panic notifier exists - we need vmbus connection alive
- * there in order to succeed, so we need to order both with each other
- * [see hvfb_on_panic()] - this is done using notifiers' priorities.
  */
 static int hv_panic_vmbus_unload(struct notifier_block *nb, unsigned long val,
 			      void *args)
 {
-	vmbus_initiate_unload(true);
+	if (!skip_vmbus_unload)
+		vmbus_initiate_unload(true);
+
 	return NOTIFY_DONE;
 }
 static struct notifier_block hyperv_panic_vmbus_unload_block = {
@@ -2897,7 +2907,8 @@ static void hv_crash_handler(struct pt_regs *regs)
 {
 	int cpu;
 
-	vmbus_initiate_unload(true);
+	if (!skip_vmbus_unload)
+		vmbus_initiate_unload(true);
 	/*
 	 * In crash handler we can't schedule synic cleanup for all CPUs,
 	 * doing the cleanup for current CPU only. This should be sufficient
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 964f1be8150c..41a3d82f0722 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1336,6 +1336,9 @@ int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj,
 			bool fb_overlap_ok);
 void vmbus_free_mmio(resource_size_t start, resource_size_t size);
 
+void vmbus_initiate_unload(bool crash);
+void vmbus_set_skip_unload(bool skip);
+
 /*
  * GUID definitions of various offer types - services offered to the guest.
  */
-- 
cgit v1.2.3


From c15d7a2a11ea055bcecc0b538ae8ba79475637f9 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 4 Dec 2025 11:17:23 +0100
Subject: tee: fix tee_ioctl_object_invoke_arg padding

The tee_ioctl_object_invoke_arg structure has padding on some
architectures but not on x86-32 and a few others:

include/linux/tee.h:474:32: error: padding struct to align 'params' [-Werror=padded]

I expect that all current users of this are on architectures that do
have implicit padding here (arm64, arm, x86, riscv), so make the padding
explicit in order to avoid surprises if this later gets used elsewhere.

Fixes: d5b8b0fa1775 ("tee: add TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Jens Wiklander <jens.wiklander@linaro.org>
Tested-by: Harshal Dev <harshal.dev@oss.qualcomm.com>
Reviewed-by: Sumit Garg <sumit.garg@oss.qualcomm.com>
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
---
 include/uapi/linux/tee.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h
index cab5cadca8ef..5203977ed35d 100644
--- a/include/uapi/linux/tee.h
+++ b/include/uapi/linux/tee.h
@@ -470,6 +470,7 @@ struct tee_ioctl_object_invoke_arg {
 	__u32 op;
 	__u32 ret;
 	__u32 num_params;
+	__u32 :32;
 	/* num_params tells the actual number of element in params */
 	struct tee_ioctl_param params[];
 };
-- 
cgit v1.2.3


From 83eb00f31eb1b10735d48e469df72cc2b0e06f6d Mon Sep 17 00:00:00 2001
From: Dexuan Cui <decui@microsoft.com>
Date: Wed, 27 May 2026 12:21:01 -0700
Subject: hyperv: Clean up and fix the guest ID comment in hvgdk.h

Change the "64 bit" to "64-bit", and the "Os" to "OS".

Remove the obsolete paragraph since the guideline has been
published in the Hypervisor Top Level Functional Specification
for many years.

The "OS Type" is 0x1 for Linux, not 0x100.

No functional change.

Fixes: 83ba0c4f3f31 ("Drivers: hv: Cleanup the guest ID computation")
Signed-off-by: Dexuan Cui <decui@microsoft.com>
Reviewed-by: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 include/hyperv/hvgdk.h | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/hyperv/hvgdk.h b/include/hyperv/hvgdk.h
index 384c3f3ff4a5..f538144280ca 100644
--- a/include/hyperv/hvgdk.h
+++ b/include/hyperv/hvgdk.h
@@ -10,18 +10,12 @@
 
 /*
  * The guest OS needs to register the guest ID with the hypervisor.
- * The guest ID is a 64 bit entity and the structure of this ID is
+ * The guest ID is a 64-bit entity and the structure of this ID is
  * specified in the Hyper-V TLFS specification.
  *
- * While the current guideline does not specify how Linux guest ID(s)
- * need to be generated, our plan is to publish the guidelines for
- * Linux and other guest operating systems that currently are hosted
- * on Hyper-V. The implementation here conforms to this yet
- * unpublished guidelines.
- *
  * Bit(s)
  * 63 - Indicates if the OS is Open Source or not; 1 is Open Source
- * 62:56 - Os Type; Linux is 0x100
+ * 62:56 - OS Type; Linux is 0x1
  * 55:48 - Distro specific identification
  * 47:16 - Linux kernel version number
  * 15:0  - Distro specific identification
-- 
cgit v1.2.3


From 840b740a35bf969734e0f2e44c21289fdd03079e Mon Sep 17 00:00:00 2001
From: Michael Kelley <mhklinux@outlook.com>
Date: Tue, 26 May 2026 07:13:04 -0700
Subject: mshv: Add conditional VMBus dependency

When the VMBus driver is not part of the kernel (CONFIG_HYPERV_VMBUS=n),
the MSHV root driver fails to link:

ERROR: modpost: "hv_vmbus_exists" [drivers/hv/mshv_root.ko] undefined!

Fix this while meeting these requirements:
* It must be possible to include the MSHV root driver without the
  VMBus driver. In such case, the MSHV root driver can be built-in
  to the kernel image, or it can be built as a separate module.
* If both the MSHV root driver and the VMBus driver are present, the
  MSHV root driver and VMBus driver can both be built-in, or they can
  both be separate modules. Or the MSHV root driver can be a module
  while the VMBus driver can be built-in, but the reverse is
  disallowed. Regardless of the build choices, the VMBus driver must
  be loaded before the MSHV driver in order for the SynIC to be
  managed properly (see comments in the MSHV SynIC code).

The fix has two parts:
* Add a Kconfig entry for MSHV_ROOT to depend on HYPERV_VMBUS if
  HYPERV_VMBUS is present. The entry disallows MSHV_ROOT being
  built-in when HYPERV_VMBUS is a module, but without requiring that
  HYPERV_VMBUS be built.
* Add a stub implementation of hv_vmbus_exists() for when the
  VMBus driver is not present so that the MSHV root driver has
  no module dependency on VMBus. When the VMBus driver *is*
  present, the module dependency ensures that the VMBus driver
  loads first when both are built as modules.

Existing code ensures that the VMBus driver loads first if it is
built-in. The VMBus driver uses subsys_initcall(), which is
initcall level 4. The MSHV root driver uses module_init(), which
becomes device_initcall() when built-in, and device_initcall() is
initcall level 6.

Reported-by: Arnd Bergmann <arnd@arndb.de>
Closes: https://lore.kernel.org/all/20260520074044.923728-1-arnd@kernel.org/
Signed-off-by: Michael Kelley <mhklinux@outlook.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Jork Loeser <jloeser@linux.microsoft.com>
Reviewed-by: Hardik Garg <hargar@linux.microsoft.com>
Signed-off-by: Wei Liu <wei.liu@kernel.org>
---
 drivers/hv/Kconfig     | 1 +
 include/linux/hyperv.h | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig
index 2d0b3fcb0ff8..aa11bcefddf2 100644
--- a/drivers/hv/Kconfig
+++ b/drivers/hv/Kconfig
@@ -74,6 +74,7 @@ config MSHV_ROOT
 	# e.g. When withdrawing memory, the hypervisor gives back 4k pages in
 	# no particular order, making it impossible to reassemble larger pages
 	depends on PAGE_SIZE_4KB
+	depends on HYPERV_VMBUS if HYPERV_VMBUS
 	select EVENTFD
 	select VIRT_XFER_TO_GUEST_WORK
 	select HMM_MIRROR
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 41a3d82f0722..734b7ef98f4d 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1304,7 +1304,11 @@ static inline void *hv_get_drvdata(struct hv_device *dev)
 
 struct device *hv_get_vmbus_root_device(void);
 
+#if IS_ENABLED(CONFIG_HYPERV_VMBUS)
 bool hv_vmbus_exists(void);
+#else
+static inline bool hv_vmbus_exists(void) { return false; }
+#endif
 
 struct hv_ring_buffer_debug_info {
 	u32 current_interrupt_mask;
-- 
cgit v1.2.3


From 3c2d42b8ee345b17a4ba56b0f6492d1ff4c1178e Mon Sep 17 00:00:00 2001
From: Wupeng Ma <mawupeng1@huawei.com>
Date: Fri, 22 May 2026 09:03:05 +0800
Subject: mm/memory-failure: fix hugetlb_lock AA deadlock in
 get_huge_page_for_hwpoison

Two concurrent madvise(MADV_HWPOISON) calls on the same hugetlb page can
trigger a recursive spinlock self-deadlock (AA deadlock) on hugetlb_lock
when racing with a concurrent unmap:

  thread#0                              thread#1
  --------                              --------
  madvise(folio, MADV_HWPOISON)
    -> poisons the folio successfully
  madvise(folio, MADV_HWPOISON)         unmap(folio)
    try_memory_failure_hugetlb
      get_huge_page_for_hwpoison
        spin_lock_irq(&hugetlb_lock)    <- held
        __get_huge_page_for_hwpoison
          hugetlb_update_hwpoison()
            -> MF_HUGETLB_FOLIO_PRE_POISONED
          goto out:
            folio_put()
              refcount: 1 -> 0
              free_huge_folio()
                spin_lock_irqsave(&hugetlb_lock)
                  -> AA DEADLOCK!

The out: path in __get_huge_page_for_hwpoison() calls folio_put() to drop
the GUP reference while the hugetlb_lock is still held by the hugetlb.c
wrapper get_huge_page_for_hwpoison().  If concurrent unmap has released
the page table mapping reference, folio_put() drops the folio refcount to
zero, triggering free_huge_folio() which attempts to re-acquire the
non-recursive hugetlb_lock.

Fix this by moving hugetlb_lock acquisition from the hugetlb.c wrapper
into get_huge_page_for_hwpoison().  Place spin_unlock_irq() before the
folio_put() at the out: label so the folio is always released outside the
lock.

[akpm@linux-foundation.org: fix race, rename label per Miaohe]
  Link: https://sashiko.dev/#/patchset/20260522010305.4099834-1-mawupeng1@huawei.com
  Link: https://lore.kernel.org/f39f405e-4b4b-8f79-70fe-a2b5b62114eb@huawei.com
Link: https://lore.kernel.org/20260522010305.4099834-1-mawupeng1@huawei.com
Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()")
Signed-off-by: Wupeng Ma <mawupeng1@huawei.com>
Acked-by: Oscar Salvador (SUSE) <osalvador@kernel.org>
Acked-by: Muchun Song <muchun.song@linux.dev>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Acked-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <ljs@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@kernel.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h |  8 --------
 include/linux/mm.h      |  8 --------
 mm/hugetlb.c            | 11 -----------
 mm/memory-failure.c     | 19 ++++++++++---------
 4 files changed, 10 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5957bc25efa8..2abaf99321e9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -153,8 +153,6 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
 						long freed);
 bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
 int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
-int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-				bool *migratable_cleared);
 void folio_putback_hugetlb(struct folio *folio);
 void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
 void hugetlb_fix_reserve_counts(struct inode *inode);
@@ -421,12 +419,6 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb,
 	return 0;
 }
 
-static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-					bool *migratable_cleared)
-{
-	return 0;
-}
-
 static inline void folio_putback_hugetlb(struct folio *folio)
 {
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 06bbe9eba636..fc2acedf0b76 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4975,8 +4975,6 @@ extern int soft_offline_page(unsigned long pfn, int flags);
  */
 extern const struct attribute_group memory_failure_attr_group;
 extern void memory_failure_queue(unsigned long pfn, int flags);
-extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-					bool *migratable_cleared);
 void num_poisoned_pages_inc(unsigned long pfn);
 void num_poisoned_pages_sub(unsigned long pfn, long i);
 #else
@@ -4984,12 +4982,6 @@ static inline void memory_failure_queue(unsigned long pfn, int flags)
 {
 }
 
-static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-					bool *migratable_cleared)
-{
-	return 0;
-}
-
 static inline void num_poisoned_pages_inc(unsigned long pfn)
 {
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1b1d4f87a3a4..c921287489de 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7161,17 +7161,6 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison
 	return ret;
 }
 
-int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
-				bool *migratable_cleared)
-{
-	int ret;
-
-	spin_lock_irq(&hugetlb_lock);
-	ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
-	spin_unlock_irq(&hugetlb_lock);
-	return ret;
-}
-
 /**
  * folio_putback_hugetlb - unisolate a hugetlb folio
  * @folio: the isolated hugetlb folio
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ee42d4361309..d47aef256a32 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1966,20 +1966,19 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
 	folio_free_raw_hwp(folio, true);
 }
 
-/*
- * Called from hugetlb code with hugetlb_lock held.
- */
-int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
+static int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
 				 bool *migratable_cleared)
 {
 	struct page *page = pfn_to_page(pfn);
-	struct folio *folio = page_folio(page);
+	struct folio *folio;
 	bool count_increased = false;
 	int ret, rc;
 
+	spin_lock_irq(&hugetlb_lock);
+	folio = page_folio(page);
 	if (!folio_test_hugetlb(folio)) {
 		ret = MF_HUGETLB_NON_HUGEPAGE;
-		goto out;
+		goto out_unlock;
 	} else if (flags & MF_COUNT_INCREASED) {
 		ret = MF_HUGETLB_IN_USED;
 		count_increased = true;
@@ -1995,13 +1994,13 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
 	} else {
 		ret = MF_HUGETLB_RETRY;
 		if (!(flags & MF_NO_RETRY))
-			goto out;
+			goto out_unlock;
 	}
 
 	rc = hugetlb_update_hwpoison(folio, page);
 	if (rc >= MF_HUGETLB_FOLIO_PRE_POISONED) {
 		ret = rc;
-		goto out;
+		goto out_unlock;
 	}
 
 	/*
@@ -2013,8 +2012,10 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
 		*migratable_cleared = true;
 	}
 
+	spin_unlock_irq(&hugetlb_lock);
 	return ret;
-out:
+out_unlock:
+	spin_unlock_irq(&hugetlb_lock);
 	if (count_increased)
 		folio_put(folio);
 	return ret;
-- 
cgit v1.2.3


From 193989cc6d80dd8e0460fb3992e69fa03bf0ff9b Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Mon, 25 May 2026 07:07:44 +0300
Subject: ipvs: clear the svc scheduler ptr early on edit

ip_vs_edit_service() while unbinding the old scheduler clears
the svc->scheduler ptr after the scheduler module initiates
RCU callbacks. This can cause packets to use the old
scheduler at the time when svc->sched_data is already freed
after RCU grace period.

Fix it by clearing the ptr early in ip_vs_unbind_scheduler(),
before the done_service method schedules any RCU callbacks.

Also, if the new scheduler fails to initialize when replacing
the old scheduler, try to restore the old scheduler while still
returning the error code.

Link: https://sashiko.dev/#/patchset/20260519015506.634185-1-rosenp%40gmail.com
Fixes: 05f00505a89a ("ipvs: fix crash if scheduler is changed")
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/ip_vs.h              |  3 +--
 net/netfilter/ipvs/ip_vs_ctl.c   | 13 ++++++++-----
 net/netfilter/ipvs/ip_vs_sched.c | 14 +++++++-------
 3 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index a02e569813d2..e517eaaa177b 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1824,8 +1824,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler);
 int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler);
 int ip_vs_bind_scheduler(struct ip_vs_service *svc,
 			 struct ip_vs_scheduler *scheduler);
-void ip_vs_unbind_scheduler(struct ip_vs_service *svc,
-			    struct ip_vs_scheduler *sched);
+void ip_vs_unbind_scheduler(struct ip_vs_service *svc);
 struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name);
 void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler);
 struct ip_vs_conn *
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index bd9cae44d214..16daba8cac83 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1898,7 +1898,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
 	if (ret_hooks >= 0)
 		ip_vs_unregister_hooks(ipvs, u->af);
 	if (svc != NULL) {
-		ip_vs_unbind_scheduler(svc, sched);
+		ip_vs_unbind_scheduler(svc);
 		ip_vs_service_free(svc);
 	}
 	ip_vs_scheduler_put(sched);
@@ -1962,9 +1962,8 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 	old_sched = rcu_dereference_protected(svc->scheduler, 1);
 	if (sched != old_sched) {
 		if (old_sched) {
-			ip_vs_unbind_scheduler(svc, old_sched);
-			RCU_INIT_POINTER(svc->scheduler, NULL);
-			/* Wait all svc->sched_data users */
+			ip_vs_unbind_scheduler(svc);
+			/* Wait all svc->scheduler/sched_data users */
 			synchronize_rcu();
 		}
 		/* Bind the new scheduler */
@@ -1972,6 +1971,10 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
 			ret = ip_vs_bind_scheduler(svc, sched);
 			if (ret) {
 				ip_vs_scheduler_put(sched);
+				/* Try to restore the old_sched */
+				if (old_sched &&
+				    !ip_vs_bind_scheduler(svc, old_sched))
+					old_sched = NULL;
 				goto out;
 			}
 		}
@@ -2027,7 +2030,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
 
 	/* Unbind scheduler */
 	old_sched = rcu_dereference_protected(svc->scheduler, 1);
-	ip_vs_unbind_scheduler(svc, old_sched);
+	ip_vs_unbind_scheduler(svc);
 	ip_vs_scheduler_put(old_sched);
 
 	/* Unbind persistence engine, keep svc->pe */
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index c6e421c4e299..24adc38942a0 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -56,19 +56,19 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
 /*
  *  Unbind a service with its scheduler
  */
-void ip_vs_unbind_scheduler(struct ip_vs_service *svc,
-			    struct ip_vs_scheduler *sched)
+void ip_vs_unbind_scheduler(struct ip_vs_service *svc)
 {
-	struct ip_vs_scheduler *cur_sched;
+	struct ip_vs_scheduler *sched;
 
-	cur_sched = rcu_dereference_protected(svc->scheduler, 1);
-	/* This check proves that old 'sched' was installed */
-	if (!cur_sched)
+	sched = rcu_dereference_protected(svc->scheduler, 1);
+	if (!sched)
 		return;
 
+	/* Reset the scheduler before initiating any RCU callbacks */
+	rcu_assign_pointer(svc->scheduler, NULL);
+	smp_wmb();	/* paired with smp_rmb() in ip_vs_schedule() */
 	if (sched->done_service)
 		sched->done_service(svc);
-	/* svc->scheduler can be set to NULL only by caller */
 }
 
 
-- 
cgit v1.2.3


From 5057e1aca011e51ef51498c940ef96f3d3e8a305 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Sun, 31 May 2026 12:08:12 -0400
Subject: net/sched: act_api: use RCU with deferred freeing for action
 lifecycle

When NEWTFILTER and DELFILTER are run concurrently it is possible to create a
race with an associated action.

Let's illustrate with CPU0 running NEWTFILTER and CPU1 running DELFILTER:

 0: mutex_lock() <-- holds the idr lock
 0: rcu_read_lock()
 0: p = idr_find(idr, index) <-- action p is valid (RCU protects IDR)
 0: mutex_unlock() <-- releases the idr lock
 1: refcount_dec_and_mutex_lock() <-- refcnt 1->0, mutex held
 1: idr_remove(idr, index) <-- Action removed from IDR
 1: mutex_unlock() <-- mutex released allowing us to delete the action
 1: tcf_action_cleanup(p); kfree(p) <-- Kfrees p immediately, no deferral
 0: refcount_inc_not_zero(&p->tcfa_refcnt) <-- ouch, UAF p points to freed memory

This patch fixes the race condition between NEWTFILTER and DELFILTER by
adding struct rcu_head to tc_action used in the deferral and introducing a
call_rcu() in the delete path to defer the final kfree().

Note: this is a revert of commit d7fb60b9cafb ("net_sched: get rid of tcfa_rcu")
but also modernization/simplification to directly use kfree_rcu().

Let's illustrate the new restored code path:

 0: rcu_read_lock()
 1: refcount_dec_and_mutex_lock() <-- refcnt 1->0, mutex held
 1: idr_remove(idr, index)
 1: mutex_unlock()
 1: kfree_rcu(p, tcfa_rcu) <-- defer kfree after grace period
 0: p = idr_find(idr, index)
 0: refcount_inc_not_zero(&p->tcfa_refcnt) <-- fails, refcnt already 0
 0: rcu_read_unlock() <-- release so freeing can run after grace period

After CPU1 calls idr_remove(), the object is no longer reachable through the IDR.
CPU0's subsequent idr_find() will return NULL, and even if it still held a
stale pointer, the immediate kfree() is now deferred until after the RCU grace
period, so no UAF can occur.

Fixes: d7fb60b9cafb ("net_sched: get rid of tcfa_rcu")
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Reported-by: Kyle Zeng <kylebot@openai.com>
Tested-by: Victor Nogueira <victor@mojatatu.com>
Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Tested-by: Kyle Zeng <kylebot@openai.com>
Reviewed-by: Pedro Tammela <pctammela@mojatatu.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Victor Nogueira <victor@mojatatu.com>
Link: https://patch.msgid.link/20260531160812.68020-1-jhs@mojatatu.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/act_api.h | 1 +
 net/sched/act_api.c   | 7 +------
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/act_api.h b/include/net/act_api.h
index d11b79107930..fd2967ee08f7 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -45,6 +45,7 @@ struct tc_action {
 	struct tc_cookie	__rcu *user_cookie;
 	struct tcf_chain	__rcu *goto_chain;
 	u32			tcfa_flags;
+	struct rcu_head         tcfa_rcu;
 	u8			hw_stats;
 	u8			used_hw_stats;
 	bool			used_hw_stats_valid;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 332fd9695e54..04ea11c90e03 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -112,11 +112,6 @@ struct tcf_chain *tcf_action_set_ctrlact(struct tc_action *a, int action,
 }
 EXPORT_SYMBOL(tcf_action_set_ctrlact);
 
-/* XXX: For standalone actions, we don't need a RCU grace period either, because
- * actions are always connected to filters and filters are already destroyed in
- * RCU callbacks, so after a RCU grace period actions are already disconnected
- * from filters. Readers later can not find us.
- */
 static void free_tcf(struct tc_action *p)
 {
 	struct tcf_chain *chain = rcu_dereference_protected(p->goto_chain, 1);
@@ -129,7 +124,7 @@ static void free_tcf(struct tc_action *p)
 	if (chain)
 		tcf_chain_put_by_act(chain);
 
-	kfree(p);
+	kfree_rcu(p, tcfa_rcu);
 }
 
 static void offload_action_hw_count_set(struct tc_action *act,
-- 
cgit v1.2.3


From 6d99479799c69c3cb588fcda19c81d8f61d64ecd Mon Sep 17 00:00:00 2001
From: Qing Wang <wangqing7171@gmail.com>
Date: Tue, 2 Jun 2026 11:08:54 +0800
Subject: rseq: Fix using an uninitialized stack variable in
 rseq_exit_user_update()

There is a bug in which an uninitialized stack variable is used in
rseq_exit_user_update(), as reported by syzbot:

BUG: KMSAN: kernel-infoleak in rseq_set_ids_get_csaddr include/linux/rseq_entry.h:502 [inline]

The local variable:

	struct rseq_ids ids = {
		.cpu_id	 = task_cpu(t),
		.mm_cid	 = task_mm_cid(t),
		.node_id = cpu_to_node(ids.cpu_id),
	};

According to the C standard, the evaluation order of expressions in an
initializer list is indeterminately sequenced. The compiler (Clang, in
this KMSAN build) evaluates `cpu_to_node(ids.cpu_id)` *before*
`ids.cpu_id` is initialized with `task_cpu(t)`.

This is fixed by reading task_cpu(t) into a local variable before the
structure initialization and using that variable for both ids.cpu_id
and ids.node_id.

Fixes: 82f572449cfe ("rseq: Implement read only ABI enforcement for optimized RSEQ V2 mode")
Closes: https://syzkaller.appspot.com/bug?extid=185a631927096f9da2fc
Reported-by: syzbot+185a631927096f9da2fc@syzkaller.appspotmail.com
Signed-off-by: Qing Wang <wangqing7171@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://patch.msgid.link/20260602030854.574038-1-wangqing7171@gmail.com
---
 include/linux/rseq_entry.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index 63bc72086e75..ed9da6e41a2a 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -635,10 +635,11 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t
 		return true;
 	}
 
+	int cpu = task_cpu(t);
 	struct rseq_ids ids = {
-		.cpu_id	 = task_cpu(t),
+		.cpu_id	 = cpu,
 		.mm_cid	 = task_mm_cid(t),
-		.node_id = cpu_to_node(ids.cpu_id),
+		.node_id = cpu_to_node(cpu),
 	};
 
 	return rseq_update_usr(t, regs, &ids);
-- 
cgit v1.2.3


From e32d7f404d7d9dac307c1cd9a1fe132fa62ab6d6 Mon Sep 17 00:00:00 2001
From: Gabriele Monaco <gmonaco@redhat.com>
Date: Mon, 1 Jun 2026 17:38:29 +0200
Subject: rv: Reset per-task DA monitors before releasing the slot

Per-task monitors use task_mon_slot to determine which slot in the array
to use for the monitor. During destruction, this slot is returned but
this is done before resetting the monitor. As a result, the monitor's
reset is in fact resetting a slot that is outside of the array
(RV_PER_TASK_MONITOR_INIT).

Release the slot only after the reset to avoid out-of-bound memory
access.

Fixes: f5587d1b6ec93 ("rv: Add Hybrid Automata monitor type")
Cc: stable@vger.kernel.org
Suggested-by: Wen Yang <wen.yang@linux.dev>
Reviewed-by: Wen Yang <wen.yang@linux.dev>
Reviewed-by: Nam Cao <namcao@linutronix.de>
Link: https://lore.kernel.org/r/20260601153840.124372-3-gmonaco@redhat.com
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/rv/da_monitor.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
index 39765ff6f098..1459fb3dfee6 100644
--- a/include/rv/da_monitor.h
+++ b/include/rv/da_monitor.h
@@ -309,10 +309,11 @@ static inline void da_monitor_destroy(void)
 		WARN_ONCE(1, "Disabling a disabled monitor: " __stringify(MONITOR_NAME));
 		return;
 	}
-	rv_put_task_monitor_slot(task_mon_slot);
-	task_mon_slot = RV_PER_TASK_MONITOR_INIT;
 
 	da_monitor_reset_all();
+
+	rv_put_task_monitor_slot(task_mon_slot);
+	task_mon_slot = RV_PER_TASK_MONITOR_INIT;
 }
 
 #elif RV_MON_TYPE == RV_MON_PER_OBJ
-- 
cgit v1.2.3


From c3d016ea823a9941ab8cbcef01a500821ff0cf16 Mon Sep 17 00:00:00 2001
From: Gabriele Monaco <gmonaco@redhat.com>
Date: Mon, 1 Jun 2026 17:38:30 +0200
Subject: rv: Prevent in-flight per-task handlers from using invalid slots

Per-task monitors use a slot in the task_struct->rv[] array and store
that locally (e.g. task_mon_slot), this slot is returned during the
destruction process but currently handlers can be running while that slot
is being returned and this race may lead to accessing an invalid slot.

Synchronise with all in-flight tracepoint handlers using
tracepoint_synchronize_unregister() before returning the slot.

Fixes: f5587d1b6ec9 ("rv: Add Hybrid Automata monitor type")
Fixes: a9769a5b9878 ("rv: Add support for LTL monitors")
Suggested-by: Wen Yang <wen.yang@linux.dev>
Reviewed-by: Nam Cao <namcao@linutronix.de>
Link: https://lore.kernel.org/r/20260601153840.124372-4-gmonaco@redhat.com
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/rv/da_monitor.h  | 4 ++++
 include/rv/ltl_monitor.h | 1 +
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
index 1459fb3dfee6..cc97cc5dfbfd 100644
--- a/include/rv/da_monitor.h
+++ b/include/rv/da_monitor.h
@@ -302,6 +302,9 @@ static int da_monitor_init(void)
 
 /*
  * da_monitor_destroy - return the allocated slot
+ *
+ * Wait for all in-flight handlers before returning the slot to avoid
+ * out-of-bound accesses.
  */
 static inline void da_monitor_destroy(void)
 {
@@ -310,6 +313,7 @@ static inline void da_monitor_destroy(void)
 		return;
 	}
 
+	tracepoint_synchronize_unregister();
 	da_monitor_reset_all();
 
 	rv_put_task_monitor_slot(task_mon_slot);
diff --git a/include/rv/ltl_monitor.h b/include/rv/ltl_monitor.h
index eff60cd61106..38e792401f76 100644
--- a/include/rv/ltl_monitor.h
+++ b/include/rv/ltl_monitor.h
@@ -77,6 +77,7 @@ static void ltl_monitor_destroy(void)
 {
 	rv_detach_trace_probe(name, task_newtask, handle_task_newtask);
 
+	tracepoint_synchronize_unregister();
 	rv_put_task_monitor_slot(ltl_monitor_slot);
 	ltl_monitor_slot = RV_PER_TASK_MONITOR_INIT;
 }
-- 
cgit v1.2.3


From 32171d828ab964dc1f05f2056a81477522a3361b Mon Sep 17 00:00:00 2001
From: Gabriele Monaco <gmonaco@redhat.com>
Date: Mon, 1 Jun 2026 17:38:31 +0200
Subject: rv: Ensure all pending probes terminate on per-obj monitor destroy

The monitor disable/destroy sequence detaches all probes and resets the
monitor's data, however it doesn't wait for pending probes. This is an
issue with per-object monitors, which free the monitor storage.

Call tracepoint_synchronize_unregister() to make sure to wait for all
pending probes before destroying the monitor storage.

Fixes: 4a24127bd6cb ("rv: Add support for per-object monitors in DA/HA")
Reviewed-by: Wen Yang <wen.yang@linux.dev>
Reviewed-by: Nam Cao <namcao@linutronix.de>
Link: https://lore.kernel.org/r/20260601153840.124372-5-gmonaco@redhat.com
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/rv/da_monitor.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
index cc97cc5dfbfd..a7e103654406 100644
--- a/include/rv/da_monitor.h
+++ b/include/rv/da_monitor.h
@@ -511,9 +511,10 @@ static inline void da_monitor_destroy(void)
 	struct hlist_node *tmp;
 	int bkt;
 
+	tracepoint_synchronize_unregister();
 	/*
-	 * This function is called after all probes are disabled, we need only
-	 * worry about concurrency against old events.
+	 * This function is called after all probes are disabled and no longer
+	 * pending, we can safely assume no concurrent user.
 	 */
 	synchronize_rcu();
 	hash_for_each_safe(da_monitor_ht, bkt, tmp, mon_storage, node) {
-- 
cgit v1.2.3


From 2b9a8c27417eeef01967c6297def426a44776c04 Mon Sep 17 00:00:00 2001
From: Wen Yang <wen.yang@linux.dev>
Date: Mon, 1 Jun 2026 17:38:32 +0200
Subject: rv: Fix monitor start ordering and memory ordering for monitoring
 flag

da_monitor_start() sets monitoring=1 before calling da_monitor_init_hook(),
which may race with the sched_switch handler:

  da_monitor_start()               sched_switch handler
  -------------------------        ---------------------------------
  da_mon->monitoring = 1;
                                   if (da_monitoring(da_mon))  /* true  */
                                       ha_start_timer_ns(...);
                                       /* hrtimer->base == NULL, crash */
  da_monitor_init_hook(da_mon);
  /* hrtimer_setup() sets base */

Fix the ordering and pair with release/acquire semantics:

  da_monitor_init_hook(da_mon);
  smp_store_release(&da_mon->monitoring, 1);    /* da_monitor_start()  */
  return smp_load_acquire(&da_mon->monitoring); /* da_monitoring()     */

On ARM64 a plain STR + LDR does not form a release-acquire pair, so
the load can observe monitoring=1 while hrtimer->base is still NULL.
The plain accesses are also data races under KCSAN.

Use WRITE_ONCE for the monitoring=0 store in da_monitor_reset() to
cover the reset path.

Fixes: 792575348ff7 ("rv/include: Add deterministic automata monitor definition via C macros")
Signed-off-by: Wen Yang <wen.yang@linux.dev>
Reviewed-by: Gabriele Monaco <gmonaco@redhat.com>
Reviewed-by: Nam Cao <namcao@linutronix.de>
Link: https://lore.kernel.org/r/20260601153840.124372-6-gmonaco@redhat.com
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/rv/da_monitor.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
index a7e103654406..60dc39f2620a 100644
--- a/include/rv/da_monitor.h
+++ b/include/rv/da_monitor.h
@@ -82,7 +82,7 @@ static void react(enum states curr_state, enum events event)
 static inline void da_monitor_reset(struct da_monitor *da_mon)
 {
 	da_monitor_reset_hook(da_mon);
-	da_mon->monitoring = 0;
+	WRITE_ONCE(da_mon->monitoring, 0);
 	da_mon->curr_state = model_get_initial_state();
 }
 
@@ -95,8 +95,9 @@ static inline void da_monitor_reset(struct da_monitor *da_mon)
 static inline void da_monitor_start(struct da_monitor *da_mon)
 {
 	da_mon->curr_state = model_get_initial_state();
-	da_mon->monitoring = 1;
 	da_monitor_init_hook(da_mon);
+	/* Pairs with smp_load_acquire in da_monitoring(). */
+	smp_store_release(&da_mon->monitoring, 1);
 }
 
 /*
@@ -104,7 +105,8 @@ static inline void da_monitor_start(struct da_monitor *da_mon)
  */
 static inline bool da_monitoring(struct da_monitor *da_mon)
 {
-	return da_mon->monitoring;
+	/* Pairs with smp_store_release in da_monitor_start(). */
+	return smp_load_acquire(&da_mon->monitoring);
 }
 
 /*
-- 
cgit v1.2.3


From 713c9e1ea7623846f8e0bb657f1b9d211d1a61e6 Mon Sep 17 00:00:00 2001
From: Gabriele Monaco <gmonaco@redhat.com>
Date: Mon, 1 Jun 2026 17:38:33 +0200
Subject: rv: Do not rely on clean monitor when initialising HA

Hybrid Automata monitors hook into the DA implementation when doing
da_monitor_reset(). This function is called both on initialisation and
teardown, HA monitors try to cancel a timer only when it's initialised
relying on the da_mon->monitoring flag. This flag could however be
corrupted during initialisation. This happens for instance on per-task
monitors that share the same storage with different type of monitors
like LTL or in case of races during a previous teardown.

Stop relying on the monitoring flag during initialisation: assume that it
can have any value and use a separate da_monitor_reset_state() that skips
timer cancellation.
New monitors (e.g. new tasks) are always zero-initialised so it is safe
to rely on the monitoring flag for those.

Reported-by: Wen Yang <wen.yang@linux.dev>
Closes: https://lore.kernel.org/lkml/d02c656aada7d071f083460a5c9a454363669b61.1778522945.git.wen.yang@linux.dev
Suggested-by: Nam Cao <namcao@linutronix.de>
Fixes: f5587d1b6ec9 ("rv: Add Hybrid Automata monitor type")
Reviewed-by: Wen Yang <wen.yang@linux.dev>
Reviewed-by: Nam Cao <namcao@linutronix.de>
Link: https://lore.kernel.org/r/20260601153840.124372-7-gmonaco@redhat.com
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/rv/da_monitor.h | 91 ++++++++++++++++++++++++++++++++++++++++---------
 include/rv/ha_monitor.h |  2 +-
 2 files changed, 76 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
index 60dc39f2620a..ec9bc88bd4c4 100644
--- a/include/rv/da_monitor.h
+++ b/include/rv/da_monitor.h
@@ -76,14 +76,22 @@ static void react(enum states curr_state, enum events event)
 		 model_get_state_name(curr_state));
 }
 
+/*
+ * da_monitor_reset_state - reset a monitor and setting it to init state
+ */
+static inline void da_monitor_reset_state(struct da_monitor *da_mon)
+{
+	WRITE_ONCE(da_mon->monitoring, 0);
+	da_mon->curr_state = model_get_initial_state();
+}
+
 /*
  * da_monitor_reset - reset a monitor and setting it to init state
  */
 static inline void da_monitor_reset(struct da_monitor *da_mon)
 {
 	da_monitor_reset_hook(da_mon);
-	WRITE_ONCE(da_mon->monitoring, 0);
-	da_mon->curr_state = model_get_initial_state();
+	da_monitor_reset_state(da_mon);
 }
 
 /*
@@ -158,12 +166,28 @@ static struct da_monitor *da_get_monitor(void)
 	return &DA_MON_NAME;
 }
 
+/*
+ * __da_monitor_reset_all - reset the single monitor
+ */
+static void __da_monitor_reset_all(void (*reset)(struct da_monitor *))
+{
+	reset(da_get_monitor());
+}
+
 /*
  * da_monitor_reset_all - reset the single monitor
  */
 static void da_monitor_reset_all(void)
 {
-	da_monitor_reset(da_get_monitor());
+	__da_monitor_reset_all(da_monitor_reset);
+}
+
+/*
+ * da_monitor_reset_state_all - reset the single monitor
+ */
+static inline void da_monitor_reset_state_all(void)
+{
+	__da_monitor_reset_all(da_monitor_reset_state);
 }
 
 /*
@@ -171,7 +195,7 @@ static void da_monitor_reset_all(void)
  */
 static inline int da_monitor_init(void)
 {
-	da_monitor_reset_all();
+	da_monitor_reset_state_all();
 	return 0;
 }
 
@@ -202,25 +226,41 @@ static struct da_monitor *da_get_monitor(void)
 }
 
 /*
- * da_monitor_reset_all - reset all CPUs' monitor
+ * __da_monitor_reset_all - reset all CPUs' monitor
  */
-static void da_monitor_reset_all(void)
+static void __da_monitor_reset_all(void (*reset)(struct da_monitor *))
 {
 	struct da_monitor *da_mon;
 	int cpu;
 
 	for_each_cpu(cpu, cpu_online_mask) {
 		da_mon = per_cpu_ptr(&DA_MON_NAME, cpu);
-		da_monitor_reset(da_mon);
+		reset(da_mon);
 	}
 }
 
+/*
+ * da_monitor_reset_all - reset all CPUs' monitor
+ */
+static void da_monitor_reset_all(void)
+{
+	__da_monitor_reset_all(da_monitor_reset);
+}
+
+/*
+ * da_monitor_reset_state_all - reset all CPUs' monitor
+ */
+static inline void da_monitor_reset_state_all(void)
+{
+	__da_monitor_reset_all(da_monitor_reset_state);
+}
+
 /*
  * da_monitor_init - initialize all CPUs' monitor
  */
 static inline int da_monitor_init(void)
 {
-	da_monitor_reset_all();
+	da_monitor_reset_state_all();
 	return 0;
 }
 
@@ -269,19 +309,29 @@ static inline da_id_type da_get_id(struct da_monitor *da_mon)
 	return da_get_target(da_mon)->pid;
 }
 
-static void da_monitor_reset_all(void)
+static void __da_monitor_reset_all(void (*reset)(struct da_monitor *))
 {
 	struct task_struct *g, *p;
 	int cpu;
 
 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, p)
-		da_monitor_reset(da_get_monitor(p));
+		reset(da_get_monitor(p));
 	for_each_present_cpu(cpu)
-		da_monitor_reset(da_get_monitor(idle_task(cpu)));
+		reset(da_get_monitor(idle_task(cpu)));
 	read_unlock(&tasklist_lock);
 }
 
+static void da_monitor_reset_all(void)
+{
+	__da_monitor_reset_all(da_monitor_reset);
+}
+
+static inline void da_monitor_reset_state_all(void)
+{
+	__da_monitor_reset_all(da_monitor_reset_state);
+}
+
 /*
  * da_monitor_init - initialize the per-task monitor
  *
@@ -298,7 +348,7 @@ static int da_monitor_init(void)
 
 	task_mon_slot = slot;
 
-	da_monitor_reset_all();
+	da_monitor_reset_state_all();
 	return 0;
 }
 
@@ -490,15 +540,24 @@ static inline void da_destroy_storage(da_id_type id)
 	kfree_rcu(mon_storage, rcu);
 }
 
-static void da_monitor_reset_all(void)
+static void __da_monitor_reset_all(void (*reset)(struct da_monitor *))
 {
 	struct da_monitor_storage *mon_storage;
 	int bkt;
 
-	rcu_read_lock();
+	guard(rcu)();
 	hash_for_each_rcu(da_monitor_ht, bkt, mon_storage, node)
-		da_monitor_reset(&mon_storage->rv.da_mon);
-	rcu_read_unlock();
+		reset(&mon_storage->rv.da_mon);
+}
+
+static void da_monitor_reset_all(void)
+{
+	__da_monitor_reset_all(da_monitor_reset);
+}
+
+static inline void da_monitor_reset_state_all(void)
+{
+	__da_monitor_reset_all(da_monitor_reset_state);
 }
 
 static inline int da_monitor_init(void)
diff --git a/include/rv/ha_monitor.h b/include/rv/ha_monitor.h
index d59507e8cb30..bd87055567cc 100644
--- a/include/rv/ha_monitor.h
+++ b/include/rv/ha_monitor.h
@@ -153,12 +153,12 @@ static inline void ha_monitor_init_env(struct da_monitor *da_mon)
  * Called from a hook in the DA reset functions, it supplies the da_mon
  * corresponding to the current ha_mon.
  * Not all hybrid automata require the timer, still clear it for simplicity.
+ * Monitors that never started have their timer uninitialized, do not stop those.
  */
 static inline void ha_monitor_reset_env(struct da_monitor *da_mon)
 {
 	struct ha_monitor *ha_mon = to_ha_monitor(da_mon);
 
-	/* Initialisation resets the monitor before initialising the timer */
 	if (likely(da_monitoring(da_mon)))
 		ha_cancel_timer(ha_mon);
 }
-- 
cgit v1.2.3


From 700782ec8f6589c5792b323efd6e004dd183328b Mon Sep 17 00:00:00 2001
From: Gabriele Monaco <gmonaco@redhat.com>
Date: Mon, 1 Jun 2026 17:38:34 +0200
Subject: rv: Add automatic cleanup handlers for per-task HA monitors

Hybrid automata monitors may start timers, depending on the model, these
may remain active on an exiting task and cause false positives or even
access freed memory.

Add an enable/disable hook in the HA code, currently only populated by
the per-task handler for registration and deregistration.
This hooks into the sched_process_exit event and ensures the timer is
stopped for every exiting task. The handler is enabled automatically but
may be disabled, for instance if the monitor uses the event for another
purpose (but should still manually ensure timers are stopped).

Fixes: f5587d1b6ec9 ("rv: Add Hybrid Automata monitor type")
Reviewed-by: Nam Cao <namcao@linutronix.de>
Link: https://lore.kernel.org/r/20260601153840.124372-8-gmonaco@redhat.com
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/rv/ha_monitor.h                            | 60 ++++++++++++++++++++++
 kernel/trace/rv/monitors/nomiss/nomiss.c           |  4 +-
 kernel/trace/rv/monitors/opid/opid.c               |  4 +-
 kernel/trace/rv/monitors/stall/stall.c             |  4 +-
 .../rvgen/rvgen/templates/dot2k/main.c             |  4 +-
 5 files changed, 68 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/rv/ha_monitor.h b/include/rv/ha_monitor.h
index bd87055567cc..4002b5247c46 100644
--- a/include/rv/ha_monitor.h
+++ b/include/rv/ha_monitor.h
@@ -28,6 +28,7 @@ static inline void ha_monitor_init_env(struct da_monitor *da_mon);
 static inline void ha_monitor_reset_env(struct da_monitor *da_mon);
 static inline void ha_setup_timer(struct ha_monitor *ha_mon);
 static inline bool ha_cancel_timer(struct ha_monitor *ha_mon);
+static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon);
 static bool ha_monitor_handle_constraint(struct da_monitor *da_mon,
 					 enum states curr_state,
 					 enum events event,
@@ -37,6 +38,26 @@ static bool ha_monitor_handle_constraint(struct da_monitor *da_mon,
 #define da_monitor_init_hook ha_monitor_init_env
 #define da_monitor_reset_hook ha_monitor_reset_env
 
+#if !defined(HA_SKIP_AUTO_CLEANUP) && RV_MON_TYPE == RV_MON_PER_TASK
+/*
+ * Automatic cleanup handlers for per-task HA monitors, only skip if you know
+ * what you are doing (e.g. you want to implement cleanup manually in another
+ * handler doing more things).
+ */
+static void ha_handle_sched_process_exit(void *data, struct task_struct *p,
+					 bool group_dead);
+
+#define ha_monitor_enable_hook()                                             \
+	rv_attach_trace_probe(__stringify(MONITOR_NAME), sched_process_exit, \
+			      ha_handle_sched_process_exit)
+#define ha_monitor_disable_hook()                                            \
+	rv_detach_trace_probe(__stringify(MONITOR_NAME), sched_process_exit, \
+			      ha_handle_sched_process_exit)
+#else
+#define ha_monitor_enable_hook() ((void)0)
+#define ha_monitor_disable_hook() ((void)0)
+#endif
+
 #include <rv/da_monitor.h>
 #include <linux/seq_buf.h>
 
@@ -115,6 +136,22 @@ static enum hrtimer_restart ha_monitor_timer_callback(struct hrtimer *hrtimer);
 #define ha_get_ns() 0
 #endif /* HA_CLK_NS */
 
+static int ha_monitor_init(void)
+{
+	int ret;
+
+	ret = da_monitor_init();
+	if (ret == 0)
+		ha_monitor_enable_hook();
+	return ret;
+}
+
+static void ha_monitor_destroy(void)
+{
+	ha_monitor_disable_hook();
+	da_monitor_destroy();
+}
+
 /* Should be supplied by the monitor */
 static u64 ha_get_env(struct ha_monitor *ha_mon, enum envs env, u64 time_ns);
 static bool ha_verify_constraint(struct ha_monitor *ha_mon,
@@ -200,6 +237,20 @@ static inline void ha_trace_error_env(struct ha_monitor *ha_mon,
 {
 	CONCATENATE(trace_error_env_, MONITOR_NAME)(id, curr_state, event, env);
 }
+
+#if !defined(HA_SKIP_AUTO_CLEANUP) && RV_MON_TYPE == RV_MON_PER_TASK
+static void ha_handle_sched_process_exit(void *data, struct task_struct *p,
+					 bool group_dead)
+{
+	struct da_monitor *da_mon = da_get_monitor(p);
+
+	if (likely(da_monitoring(da_mon))) {
+		da_monitor_reset(da_mon);
+		ha_cancel_timer_sync(to_ha_monitor(da_mon));
+	}
+}
+#endif
+
 #endif /* RV_MON_TYPE */
 
 /*
@@ -412,6 +463,10 @@ static inline bool ha_cancel_timer(struct ha_monitor *ha_mon)
 {
 	return timer_delete(&ha_mon->timer);
 }
+static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon)
+{
+	timer_delete_sync(&ha_mon->timer);
+}
 #elif HA_TIMER_TYPE == HA_TIMER_HRTIMER
 /*
  * Helper functions to handle the monitor timer.
@@ -463,6 +518,10 @@ static inline bool ha_cancel_timer(struct ha_monitor *ha_mon)
 {
 	return hrtimer_try_to_cancel(&ha_mon->hrtimer) == 1;
 }
+static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon)
+{
+	hrtimer_cancel(&ha_mon->hrtimer);
+}
 #else /* HA_TIMER_NONE */
 /*
  * Start function is intentionally not defined, monitors using timers must
@@ -473,6 +532,7 @@ static inline bool ha_cancel_timer(struct ha_monitor *ha_mon)
 {
 	return false;
 }
+static inline void ha_cancel_timer_sync(struct ha_monitor *ha_mon) { }
 #endif
 
 #endif
diff --git a/kernel/trace/rv/monitors/nomiss/nomiss.c b/kernel/trace/rv/monitors/nomiss/nomiss.c
index 31f90f3638d8..8ead8783c29f 100644
--- a/kernel/trace/rv/monitors/nomiss/nomiss.c
+++ b/kernel/trace/rv/monitors/nomiss/nomiss.c
@@ -227,7 +227,7 @@ static int enable_nomiss(void)
 {
 	int retval;
 
-	retval = da_monitor_init();
+	retval = ha_monitor_init();
 	if (retval)
 		return retval;
 
@@ -263,7 +263,7 @@ static void disable_nomiss(void)
 	rv_detach_trace_probe("nomiss", sched_switch, handle_sched_switch);
 	rv_detach_trace_probe("nomiss", sched_wakeup, handle_sched_wakeup);
 
-	da_monitor_destroy();
+	ha_monitor_destroy();
 }
 
 static struct rv_monitor rv_this = {
diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c
index 4594c7c46601..2922318c6112 100644
--- a/kernel/trace/rv/monitors/opid/opid.c
+++ b/kernel/trace/rv/monitors/opid/opid.c
@@ -73,7 +73,7 @@ static int enable_opid(void)
 {
 	int retval;
 
-	retval = da_monitor_init();
+	retval = ha_monitor_init();
 	if (retval)
 		return retval;
 
@@ -90,7 +90,7 @@ static void disable_opid(void)
 	rv_detach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched);
 	rv_detach_trace_probe("opid", sched_waking, handle_sched_waking);
 
-	da_monitor_destroy();
+	ha_monitor_destroy();
 }
 
 /*
diff --git a/kernel/trace/rv/monitors/stall/stall.c b/kernel/trace/rv/monitors/stall/stall.c
index 9ccfda6b0e73..3c38fb1a0159 100644
--- a/kernel/trace/rv/monitors/stall/stall.c
+++ b/kernel/trace/rv/monitors/stall/stall.c
@@ -103,7 +103,7 @@ static int enable_stall(void)
 {
 	int retval;
 
-	retval = da_monitor_init();
+	retval = ha_monitor_init();
 	if (retval)
 		return retval;
 
@@ -120,7 +120,7 @@ static void disable_stall(void)
 	rv_detach_trace_probe("stall", sched_switch, handle_sched_switch);
 	rv_detach_trace_probe("stall", sched_wakeup, handle_sched_wakeup);
 
-	da_monitor_destroy();
+	ha_monitor_destroy();
 }
 
 static struct rv_monitor rv_this = {
diff --git a/tools/verification/rvgen/rvgen/templates/dot2k/main.c b/tools/verification/rvgen/rvgen/templates/dot2k/main.c
index bf0999f6657a..889446760e3c 100644
--- a/tools/verification/rvgen/rvgen/templates/dot2k/main.c
+++ b/tools/verification/rvgen/rvgen/templates/dot2k/main.c
@@ -35,7 +35,7 @@ static int enable_%%MODEL_NAME%%(void)
 {
 	int retval;
 
-	retval = da_monitor_init();
+	retval = %%MONITOR_CLASS%%_monitor_init();
 	if (retval)
 		return retval;
 
@@ -50,7 +50,7 @@ static void disable_%%MODEL_NAME%%(void)
 
 %%TRACEPOINT_DETACH%%
 
-	da_monitor_destroy();
+	%%MONITOR_CLASS%%_monitor_destroy();
 }
 
 /*
-- 
cgit v1.2.3


From 74e17bd6fc8ed7e30363bb78d4c50b38cfd71efe Mon Sep 17 00:00:00 2001
From: Gabriele Monaco <gmonaco@redhat.com>
Date: Mon, 1 Jun 2026 17:38:35 +0200
Subject: rv: Ensure synchronous cleanup for HA monitors

HA monitors may start timers, all cleanup functions currently stop the
timers asynchronously to avoid sleeping in the wrong context.
Nothing makes sure running callbacks terminate on cleanup.

Run the entire HA timer callback in an RCU read-side critical section,
this way we can simply synchronize_rcu() with any pending timer and are
sure any cleanup using kfree_rcu() runs after callbacks terminated.
Additionally make sure any unlikely callback running late won't run any
code if the monitor is marked as disabled or if destruction started.
Use memory barriers to serialise with racing resets.

Fixes: f5587d1b6ec9 ("rv: Add Hybrid Automata monitor type")
Fixes: 4a24127bd6cb ("rv: Add support for per-object monitors in DA/HA")
Reviewed-by: Nam Cao <namcao@linutronix.de>
Link: https://lore.kernel.org/r/20260601153840.124372-9-gmonaco@redhat.com
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/rv/da_monitor.h | 19 ++++++++++++++++---
 include/rv/ha_monitor.h | 29 ++++++++++++++++++++++++++---
 2 files changed, 42 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
index ec9bc88bd4c4..1f440c7818e6 100644
--- a/include/rv/da_monitor.h
+++ b/include/rv/da_monitor.h
@@ -57,6 +57,15 @@ static struct rv_monitor rv_this;
 #define da_monitor_reset_hook(da_mon)
 #endif
 
+/*
+ * Hook to allow the implementation of hybrid automata: define it with a
+ * function that waits for the termination of all monitors background
+ * activities (e.g. all timers). This hook can sleep.
+ */
+#ifndef da_monitor_sync_hook
+#define da_monitor_sync_hook()
+#endif
+
 /*
  * Type for the target id, default to int but can be overridden.
  * A long type can work as hash table key (PER_OBJ) but will be downgraded to
@@ -82,7 +91,8 @@ static void react(enum states curr_state, enum events event)
 static inline void da_monitor_reset_state(struct da_monitor *da_mon)
 {
 	WRITE_ONCE(da_mon->monitoring, 0);
-	da_mon->curr_state = model_get_initial_state();
+	/* Pair with load in __ha_monitor_timer_callback */
+	smp_store_release(&da_mon->curr_state, model_get_initial_state());
 }
 
 /*
@@ -205,6 +215,7 @@ static inline int da_monitor_init(void)
 static inline void da_monitor_destroy(void)
 {
 	da_monitor_reset_all();
+	da_monitor_sync_hook();
 }
 
 #elif RV_MON_TYPE == RV_MON_PER_CPU
@@ -270,6 +281,7 @@ static inline int da_monitor_init(void)
 static inline void da_monitor_destroy(void)
 {
 	da_monitor_reset_all();
+	da_monitor_sync_hook();
 }
 
 #elif RV_MON_TYPE == RV_MON_PER_TASK
@@ -367,6 +379,7 @@ static inline void da_monitor_destroy(void)
 
 	tracepoint_synchronize_unregister();
 	da_monitor_reset_all();
+	da_monitor_sync_hook();
 
 	rv_put_task_monitor_slot(task_mon_slot);
 	task_mon_slot = RV_PER_TASK_MONITOR_INIT;
@@ -573,13 +586,13 @@ static inline void da_monitor_destroy(void)
 	int bkt;
 
 	tracepoint_synchronize_unregister();
+	da_monitor_reset_all();
+	da_monitor_sync_hook();
 	/*
 	 * This function is called after all probes are disabled and no longer
 	 * pending, we can safely assume no concurrent user.
 	 */
-	synchronize_rcu();
 	hash_for_each_safe(da_monitor_ht, bkt, tmp, mon_storage, node) {
-		da_monitor_reset_hook(&mon_storage->rv.da_mon);
 		hash_del_rcu(&mon_storage->node);
 		kfree(mon_storage);
 	}
diff --git a/include/rv/ha_monitor.h b/include/rv/ha_monitor.h
index 4002b5247c46..28d3c74cabfc 100644
--- a/include/rv/ha_monitor.h
+++ b/include/rv/ha_monitor.h
@@ -37,6 +37,7 @@ static bool ha_monitor_handle_constraint(struct da_monitor *da_mon,
 #define da_monitor_event_hook ha_monitor_handle_constraint
 #define da_monitor_init_hook ha_monitor_init_env
 #define da_monitor_reset_hook ha_monitor_reset_env
+#define da_monitor_sync_hook() synchronize_rcu()
 
 #if !defined(HA_SKIP_AUTO_CLEANUP) && RV_MON_TYPE == RV_MON_PER_TASK
 /*
@@ -136,10 +137,13 @@ static enum hrtimer_restart ha_monitor_timer_callback(struct hrtimer *hrtimer);
 #define ha_get_ns() 0
 #endif /* HA_CLK_NS */
 
+static bool ha_mon_destroying;
+
 static int ha_monitor_init(void)
 {
 	int ret;
 
+	WRITE_ONCE(ha_mon_destroying, false);
 	ret = da_monitor_init();
 	if (ret == 0)
 		ha_monitor_enable_hook();
@@ -148,6 +152,7 @@ static int ha_monitor_init(void)
 
 static void ha_monitor_destroy(void)
 {
+	WRITE_ONCE(ha_mon_destroying, true);
 	ha_monitor_disable_hook();
 	da_monitor_destroy();
 }
@@ -288,12 +293,30 @@ static bool ha_monitor_handle_constraint(struct da_monitor *da_mon,
 	return false;
 }
 
+/*
+ * __ha_monitor_timer_callback - generic callback representation
+ *
+ * This callback runs in an RCU read-side critical section to allow the
+ * destruction sequence to easily synchronize_rcu() with all pending timers
+ * after asynchronously disabling them. The ha_mon_destroying check ensures
+ * any callback entering the RCU section after synchronize_rcu() completes
+ * will see the flag and bail out immediately.
+ */
 static inline void __ha_monitor_timer_callback(struct ha_monitor *ha_mon)
 {
-	enum states curr_state = READ_ONCE(ha_mon->da_mon.curr_state);
 	DECLARE_SEQ_BUF(env_string, ENV_BUFFER_SIZE);
-	u64 time_ns = ha_get_ns();
-
+	enum states curr_state;
+	u64 time_ns;
+
+	guard(rcu)();
+	if (unlikely(READ_ONCE(ha_mon_destroying)))
+		return;
+	/* Ensure consistent curr_state if we race with da_monitor_reset */
+	curr_state = smp_load_acquire(&ha_mon->da_mon.curr_state);
+	if (unlikely(!da_monitor_handling_event(&ha_mon->da_mon)))
+		return;
+
+	time_ns = ha_get_ns();
 	ha_get_env_string(&env_string, ha_mon, time_ns);
 	ha_react(curr_state, EVENT_NONE, env_string.buffer);
 	ha_trace_error_env(ha_mon, model_get_state_name(curr_state),
-- 
cgit v1.2.3


From 7c147fae71f3b3542ba3292d2099ef7237cfc0da Mon Sep 17 00:00:00 2001
From: Gabriele Monaco <gmonaco@redhat.com>
Date: Mon, 1 Jun 2026 17:38:36 +0200
Subject: rv: Prevent task migration while handling per-CPU events

Tracepoint handlers are fully preemptible after a46023d5616 ("tracing:
Guard __DECLARE_TRACE() use of __DO_TRACE_CALL() with SRCU-fast"). When
a per-CPU monitor handles an event, it retrieves the monitor state using
a per-CPU pointer. If the event itself doesn't disable preemption, the
task can migrate to a different CPU and we risk updating the wrong
monitor.

Mitigate this by explicitly disabling task migration before acquiring
the monitor pointer. This cannot guarantee the monitor runs on the
correct CPU but reduces the race condition window and prevents warnings.

Reviewed-by: Wen Yang <wen.yang@linux.dev>
Reviewed-by: Nam Cao <namcao@linutronix.de>
Link: https://lore.kernel.org/r/20260601153840.124372-10-gmonaco@redhat.com
Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/rv/da_monitor.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
index 1f440c7818e6..34b8fba9ecd4 100644
--- a/include/rv/da_monitor.h
+++ b/include/rv/da_monitor.h
@@ -218,6 +218,10 @@ static inline void da_monitor_destroy(void)
 	da_monitor_sync_hook();
 }
 
+#ifndef da_implicit_guard
+#define da_implicit_guard()
+#endif
+
 #elif RV_MON_TYPE == RV_MON_PER_CPU
 /*
  * Functions to define, init and get a per-cpu monitor.
@@ -284,6 +288,10 @@ static inline void da_monitor_destroy(void)
 	da_monitor_sync_hook();
 }
 
+#ifndef da_implicit_guard
+#define da_implicit_guard() guard(migrate)()
+#endif
+
 #elif RV_MON_TYPE == RV_MON_PER_TASK
 /*
  * Functions to define, init and get a per-task monitor.
@@ -756,6 +764,7 @@ static inline bool __da_handle_start_run_event(struct da_monitor *da_mon,
  */
 static inline void da_handle_event(enum events event)
 {
+	da_implicit_guard();
 	__da_handle_event(da_get_monitor(), event, 0);
 }
 
@@ -771,6 +780,7 @@ static inline void da_handle_event(enum events event)
  */
 static inline bool da_handle_start_event(enum events event)
 {
+	da_implicit_guard();
 	return __da_handle_start_event(da_get_monitor(), event, 0);
 }
 
@@ -782,6 +792,7 @@ static inline bool da_handle_start_event(enum events event)
  */
 static inline bool da_handle_start_run_event(enum events event)
 {
+	da_implicit_guard();
 	return __da_handle_start_run_event(da_get_monitor(), event, 0);
 }
 
-- 
cgit v1.2.3


From dd214733544427587a95f66dbf3adff072568990 Mon Sep 17 00:00:00 2001
From: Michael Bommarito <michael.bommarito@gmail.com>
Date: Thu, 21 May 2026 10:45:17 -0400
Subject: Bluetooth: L2CAP: reject BR/EDR signaling packets over MTUsig

net/bluetooth/l2cap_core.c:l2cap_sig_channel() accepts BR/EDR
signaling packets up to the channel MTU and dispatches each command
without enforcing the signaling MTU (MTUsig). A Bluetooth BR/EDR peer
within radio range can send a fixed-channel CID 0x0001 packet that is
larger than MTUsig and contains many L2CAP_ECHO_REQ commands before
pairing. In a real-radio stock-kernel run, one 681-byte signaling
packet containing 168 zero-length ECHO_REQ commands made the target
transmit 168 ECHO_RSP frames over about 220 ms.

Impact: a Bluetooth BR/EDR peer within radio range, before pairing, can
force 168 ECHO_RSP frames from one 681-byte fixed-channel signaling
packet containing packed ECHO_REQ commands.

Define Linux's BR/EDR signaling MTU as the spec minimum of 48 bytes and
reject any larger signaling packet with one L2CAP_COMMAND_REJECT_RSP
carrying L2CAP_REJ_MTU_EXCEEDED before any command is dispatched.

The Bluetooth Core spec wording for MTUExceeded says the reject
identifier shall match the first request command in the packet, and
that packets containing only responses shall be silently discarded.
Linux intentionally deviates from that prescription: silently
discarding desynchronizes the peer because the remote stack never
learns its responses were dropped, and locating the first request
command requires walking command headers past MTUsig, i.e. processing
bytes from a packet we have already decided is too large to process.
We therefore always emit one reject and use the identifier from the
first command header, a single fixed-offset byte read.

The unrestricted BR/EDR signaling parser and ECHO_REQ response path both
trace to the initial git import; no later introducing commit is
available for a Fixes tag.

Cc: stable@vger.kernel.org
Suggested-by: Luiz Augusto von Dentz <luiz.dentz@gmail.com>
Link: https://lore.kernel.org/r/20260518002800.1361430-1-michael.bommarito@gmail.com
Link: https://lore.kernel.org/r/20260520135034.1060859-1-michael.bommarito@gmail.com
Link: https://lore.kernel.org/r/20260521000555.3712030-1-michael.bommarito@gmail.com
Assisted-by: Claude:claude-opus-4-7
Assisted-by: Codex:gpt-5-5-xhigh
Signed-off-by: Michael Bommarito <michael.bommarito@gmail.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/l2cap.h |  1 +
 net/bluetooth/l2cap_core.c    | 46 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

(limited to 'include')

diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index 5172afee5494..e0a1f2293679 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -33,6 +33,7 @@
 /* L2CAP defaults */
 #define L2CAP_DEFAULT_MTU		672
 #define L2CAP_DEFAULT_MIN_MTU		48
+#define L2CAP_SIG_MTU			48	/* BR/EDR signaling MTU */
 #define L2CAP_DEFAULT_FLUSH_TO		0xFFFF
 #define L2CAP_EFS_DEFAULT_FLUSH_TO	0xFFFFFFFF
 #define L2CAP_DEFAULT_TX_WINDOW		63
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 45b175399e8d..c4ccfbda9d78 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -5643,6 +5643,15 @@ static inline void l2cap_sig_send_rej(struct l2cap_conn *conn, u16 ident)
 	l2cap_send_cmd(conn, ident, L2CAP_COMMAND_REJ, sizeof(rej), &rej);
 }
 
+static inline void l2cap_sig_send_mtu_rej(struct l2cap_conn *conn, u8 ident)
+{
+	struct l2cap_cmd_rej_mtu rej;
+
+	rej.reason = cpu_to_le16(L2CAP_REJ_MTU_EXCEEDED);
+	rej.max_mtu = cpu_to_le16(L2CAP_SIG_MTU);
+	l2cap_send_cmd(conn, ident, L2CAP_COMMAND_REJ, sizeof(rej), &rej);
+}
+
 static inline void l2cap_sig_channel(struct l2cap_conn *conn,
 				     struct sk_buff *skb)
 {
@@ -5655,6 +5664,43 @@ static inline void l2cap_sig_channel(struct l2cap_conn *conn,
 	if (hcon->type != ACL_LINK)
 		goto drop;
 
+	/*
+	 * Bluetooth Core v5.4, Vol 3, Part A, Section 4: the BR/EDR
+	 * signaling channel has a fixed signaling MTU (MTUsig) whose
+	 * minimum and default is 48 octets.  Section 4.1 says that on
+	 * an MTUExceeded command reject the identifier "shall match
+	 * the first request command in the L2CAP packet" and that
+	 * packets containing only response commands "shall be
+	 * silently discarded".
+	 *
+	 * Linux intentionally deviates from that prescription:
+	 *
+	 *   1. Silently discarding desynchronizes the peer.  The
+	 *      remote stack never learns its responses were dropped,
+	 *      so any state machine waiting on a paired response
+	 *      stalls until its own timer fires.
+	 *
+	 *   2. Locating "the first request command" requires walking
+	 *      command headers past MTUsig, i.e. processing bytes
+	 *      from a packet we have already decided is too large to
+	 *      process.
+	 *
+	 * Reject every over-MTUsig signaling packet with one
+	 * L2CAP_REJ_MTU_EXCEEDED command reject.  The reject's
+	 * reason field is what tells the peer that the whole packet
+	 * was discarded; the identifier value is informational, so
+	 * we use the identifier from the first command header, a
+	 * single fixed-offset byte read.
+	 */
+	if (skb->len > L2CAP_SIG_MTU) {
+		u8 ident = skb->data[1];
+
+		BT_DBG("signaling packet exceeds MTU: %u > %u",
+		       skb->len, L2CAP_SIG_MTU);
+		l2cap_sig_send_mtu_rej(conn, ident);
+		goto drop;
+	}
+
 	while (skb->len >= L2CAP_CMD_HDR_SIZE) {
 		u16 len;
 
-- 
cgit v1.2.3


From 5e939544f9d2b4d5c052a07cfcde97de44263946 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 2 Jun 2026 22:14:17 +1000
Subject: mptcp: fix uninit-value in mptcp_established_options

syzbot reported the following uninit splat:

  BUG: KMSAN: uninit-value in mptcp_write_data_fin net/mptcp/options.c:542 [inline]
  BUG: KMSAN: uninit-value in mptcp_established_options_dss net/mptcp/options.c:590 [inline]
  BUG: KMSAN: uninit-value in mptcp_established_options+0x112f/0x3530 net/mptcp/options.c:874
   mptcp_write_data_fin net/mptcp/options.c:542 [inline]
   mptcp_established_options_dss net/mptcp/options.c:590 [inline]
   mptcp_established_options+0x112f/0x3530 net/mptcp/options.c:874
   tcp_established_options+0x312/0xcc0 net/ipv4/tcp_output.c:1192
   __tcp_transmit_skb+0x5dc/0x5fe0 net/ipv4/tcp_output.c:1575
   __tcp_send_ack+0x967/0xad0 net/ipv4/tcp_output.c:4499
   tcp_send_ack+0x3d/0x60 net/ipv4/tcp_output.c:4505
   mptcp_subflow_shutdown+0x164/0x690 net/mptcp/protocol.c:3137
   mptcp_check_send_data_fin+0x31b/0x3d0 net/mptcp/protocol.c:3218
   __mptcp_wr_shutdown net/mptcp/protocol.c:3234 [inline]
   __mptcp_close+0x860/0x1360 net/mptcp/protocol.c:3313
   mptcp_close+0x42/0x260 net/mptcp/protocol.c:3367
   inet_release+0x1ee/0x2a0 net/ipv4/af_inet.c:442
   __sock_release net/socket.c:722 [inline]
   sock_close+0xd6/0x2f0 net/socket.c:1514
   __fput+0x60e/0x1010 fs/file_table.c:510
   ____fput+0x25/0x30 fs/file_table.c:538
   task_work_run+0x208/0x2b0 kernel/task_work.c:233
   resume_user_mode_work include/linux/resume_user_mode.h:50 [inline]
   __exit_to_user_mode_loop kernel/entry/common.c:67 [inline]
   exit_to_user_mode_loop+0x306/0x1b60 kernel/entry/common.c:98
   __exit_to_user_mode_prepare include/linux/irq-entry-common.h:207 [inline]
   syscall_exit_to_user_mode_prepare include/linux/irq-entry-common.h:238 [inline]
   syscall_exit_to_user_mode include/linux/entry-common.h:318 [inline]
   __do_fast_syscall_32+0x2c7/0x460 arch/x86/entry/syscall_32.c:310
   do_fast_syscall_32+0x37/0x80 arch/x86/entry/syscall_32.c:332
   do_SYSENTER_32+0x1f/0x30 arch/x86/entry/syscall_32.c:370
   entry_SYSENTER_compat_after_hwframe+0x84/0x8e

  Local variable opts created at:
   __tcp_transmit_skb+0x4d/0x5fe0 net/ipv4/tcp_output.c:1536
   __tcp_send_ack+0x967/0xad0 net/ipv4/tcp_output.c:4499

The output path currently omits initializing the mptcp extension
`use_map` flag in a few corner cases.

Address the issue by always zeroing all the extension flags before
initializing the individual bits as needed. To that end, introduce
and use a struct_group to avoid multiple bitwise operations.

Fixes: cfcceb7a39fc ("tcp: shrink per-packet memset in __tcp_transmit_skb()")
Cc: stable@vger.kernel.org
Reported-by: syzbot+ff020673c5e3d94d9478@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=ff020673c5e3d94d9478
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20260602-net-mptcp-misc-fixes-7-1-rc7-v2-10-856831229976@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/mptcp.h | 7 +++++--
 net/mptcp/options.c | 6 +++++-
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index f7263fe2a2e4..ee70f597a4de 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -27,7 +27,9 @@ struct mptcp_ext {
 	u32		subflow_seq;
 	u16		data_len;
 	__sum16		csum;
-	u8		use_map:1,
+
+	struct_group(flags,
+		u8	use_map:1,
 			dsn64:1,
 			data_fin:1,
 			use_ack:1,
@@ -35,9 +37,10 @@ struct mptcp_ext {
 			mpc_map:1,
 			frozen:1,
 			reset_transient:1;
-	u8		reset_reason:4,
+		u8	reset_reason:4,
 			csum_reqd:1,
 			infinite_map:1;
+	); /* end of flags group */
 };
 
 #define MPTCPOPT_HMAC_LEN	20
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 51ca334678b4..f9f587203c35 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -572,6 +572,11 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
 	unsigned int ack_size;
 	bool ret = false;
 
+	/* Zero `use_ack` and `use_map` flags with one shot. */
+	BUILD_BUG_ON(sizeof_field(struct mptcp_ext, flags) != sizeof(u16));
+	BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct mptcp_ext, flags),
+				 sizeof(u16)));
+	*(u16 *)&opts->ext_copy.flags = 0;
 	opts->csum_reqd = READ_ONCE(msk->csum_enabled);
 	mpext = skb ? mptcp_get_ext(skb) : NULL;
 
@@ -595,7 +600,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
 	/* passive sockets msk will set the 'can_ack' after accept(), even
 	 * if the first subflow may have the already the remote key handy
 	 */
-	opts->ext_copy.use_ack = 0;
 	if (!READ_ONCE(msk->can_ack)) {
 		*size = ALIGN(dss_size, 4);
 		return ret;
-- 
cgit v1.2.3


From 0652a3daa78723f955b1ebeb621665ce72bec53e Mon Sep 17 00:00:00 2001
From: Eva Kurchatova <eva.kurchatova@virtuozzo.com>
Date: Wed, 3 Jun 2026 18:31:42 +0300
Subject: tracing: Fix CFI violation in probestub being called by tprobes

The probestub is a function to allow tprobes to hook to a tracepoint to
gain access to its parameters. The function itself is only referenced by
the tracepoint structure which lives in the __tracepoint section. objtool
explicitly ignores that section and when processing functions in the
kernel, if it detects one that has no references it will seal it to have
its ENDBR stripped on boot up.

This means that when a tprobe attached to the sched_wakeup tracepoint is
triggered, it calls __probestub_sched_wakeup and, because of the missing
ENDBR, takes a #CP exception on a CFI-enabled machine.

Fix this by adding CFI_NOSEAL annotation to probestub declaration.

Cc: stable@vger.kernel.org
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Link: https://patch.msgid.link/20260603153147.573589-1-eva.kurchatova@virtuozzo.com
Fixes: d5173f753750 ("objtool: Exclude __tracepoints data from ENDBR checks")
Signed-off-by: Eva Kurchatova <eva.kurchatova@virtuozzo.com>
[ Updated change log ]
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/tracepoint.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 763eea4d80d8..2d2b9f8cdda4 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -20,6 +20,7 @@
 #include <linux/rcupdate_trace.h>
 #include <linux/tracepoint-defs.h>
 #include <linux/static_call.h>
+#include <linux/cfi.h>
 
 struct module;
 struct tracepoint;
@@ -389,6 +390,13 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	void __probestub_##_name(void *__data, proto)			\
 	{								\
 	}								\
+	/*								\
+	 * Annotate the probestub 'CFI_NOSEAL' to stop objtool from	\
+	 * requesting the kernel remove the ENDBR, because the only	\
+	 * references to the function are in the __tracepoint section,	\
+	 * that objtool doesn't scan.					\
+	 */								\
+	CFI_NOSEAL(__probestub_##_name);				\
 	DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name);	\
 	DEFINE_RUST_DO_TRACE(_name, TP_PROTO(proto), TP_ARGS(args))
 
-- 
cgit v1.2.3


From 899ee91156e57784090c5565e4f31bd7dbffbc5a Mon Sep 17 00:00:00 2001
From: Rajat Gupta <rajat.gupta@oss.qualcomm.com>
Date: Sun, 31 May 2026 08:32:21 -0400
Subject: net/sched: fix pedit partial COW leading to page cache corruption
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

tcf_pedit_act() computes the COW range for skb_ensure_writable()
once before the key loop using tcfp_off_max_hint, but the hint does
not account for the runtime header offset added by typed keys. This
can leave part of the write region un-COW'd.

Fix by moving skb_ensure_writable() inside the per-key loop where
the actual write offset is known, and add overflow checking on the
offset arithmetic. For negative offsets (e.g. Ethernet header edits
at ingress), use skb_cow() to COW the headroom instead. Guard
offset_valid() against INT_MIN, where negation is undefined.

Fixes: 8b796475fd78 ("net/sched: act_pedit: really ensure the skb is writable")
Reported-by: Yiming Qian <yimingqian591@gmail.com>
Reported-by: Keenan Dong <keenanat2000@gmail.com>
Reported-by: Han Guidong <2045gemini@gmail.com>
Reported-by: Zhang Cen <rollkingzzc@gmail.com>
Reviewed-by: Han Guidong <2045gemini@gmail.com>
Tested-by: Han Guidong <2045gemini@gmail.com>
Reviewed-by: Davide Caratti <dcaratti@redhat.com>
Tested-by: Davide Caratti <dcaratti@redhat.com>
Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Tested-by: Toke Høiland-Jørgensen <toke@redhat.com>
Reviewed-by: Victor Nogueira <victor@mojatatu.com>
Tested-by: Victor Nogueira <victor@mojatatu.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Rajat Gupta <rajat.gupta@oss.qualcomm.com>
Link: https://patch.msgid.link/20260531123221.48732-1-jhs@mojatatu.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tc_act/tc_pedit.h |  1 -
 net/sched/act_pedit.c         | 77 +++++++++++++++++++++++--------------------
 2 files changed, 41 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index f58ee15cd858..cb7b82f2cbc7 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -15,7 +15,6 @@ struct tcf_pedit_parms {
 	struct tc_pedit_key	*tcfp_keys;
 	struct tcf_pedit_key_ex	*tcfp_keys_ex;
 	int action;
-	u32 tcfp_off_max_hint;
 	unsigned char tcfp_nkeys;
 	unsigned char tcfp_flags;
 	struct rcu_head rcu;
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index bc20f08a2789..bd3b1da3cd63 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -16,6 +16,8 @@
 #include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <linux/slab.h>
+#include <linux/overflow.h>
+#include <linux/unaligned.h>
 #include <net/ipv6.h>
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
@@ -242,7 +244,6 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 		goto out_free_ex;
 	}
 
-	nparms->tcfp_off_max_hint = 0;
 	nparms->tcfp_flags = parm->flags;
 	nparms->tcfp_nkeys = parm->nkeys;
 
@@ -268,14 +269,6 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 						   BITS_PER_TYPE(int) - 1,
 						   nparms->tcfp_keys[i].shift);
 
-		/* The AT option can read a single byte, we can bound the actual
-		 * value with uchar max.
-		 */
-		cur += (0xff & offmask) >> nparms->tcfp_keys[i].shift;
-
-		/* Each key touches 4 bytes starting from the computed offset */
-		nparms->tcfp_off_max_hint =
-			max(nparms->tcfp_off_max_hint, cur + 4);
 	}
 
 	p = to_pedit(*a);
@@ -318,15 +311,12 @@ static void tcf_pedit_cleanup(struct tc_action *a)
 		call_rcu(&parms->rcu, tcf_pedit_cleanup_rcu);
 }
 
-static bool offset_valid(struct sk_buff *skb, int offset)
+static bool offset_valid(struct sk_buff *skb, int offset, int len)
 {
-	if (offset > 0 && offset > skb->len)
-		return false;
-
-	if  (offset < 0 && -offset > skb_headroom(skb))
+	if (offset < -(int)skb_headroom(skb))
 		return false;
 
-	return true;
+	return offset <= (int)skb->len - len;
 }
 
 static int pedit_l4_skb_offset(struct sk_buff *skb, int *hoffset, const int header_type)
@@ -393,18 +383,10 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb,
 	struct tcf_pedit_key_ex *tkey_ex;
 	struct tcf_pedit_parms *parms;
 	struct tc_pedit_key *tkey;
-	u32 max_offset;
 	int i;
 
 	parms = rcu_dereference_bh(p->parms);
 
-	max_offset = (skb_transport_header_was_set(skb) ?
-		      skb_transport_offset(skb) :
-		      skb_network_offset(skb)) +
-		     parms->tcfp_off_max_hint;
-	if (skb_ensure_writable(skb, min(skb->len, max_offset)))
-		goto done;
-
 	tcf_lastuse_update(&p->tcf_tm);
 	tcf_action_update_bstats(&p->common, skb);
 
@@ -412,10 +394,11 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb,
 	tkey_ex = parms->tcfp_keys_ex;
 
 	for (i = parms->tcfp_nkeys; i > 0; i--, tkey++) {
+		int write_offset, write_len;
 		int offset = tkey->off;
 		int hoffset = 0;
-		u32 *ptr, hdata;
-		u32 val;
+		u32 cur_val, val;
+		u32 *ptr;
 		int rc;
 
 		if (tkey_ex) {
@@ -433,13 +416,15 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb,
 
 		if (tkey->offmask) {
 			u8 *d, _d;
+			int at_offset;
 
-			if (!offset_valid(skb, hoffset + tkey->at)) {
+			if (check_add_overflow(hoffset, (int)tkey->at, &at_offset) ||
+			    !offset_valid(skb, at_offset, sizeof(_d))) {
 				pr_info_ratelimited("tc action pedit 'at' offset %d out of bounds\n",
 						    hoffset + tkey->at);
 				goto bad;
 			}
-			d = skb_header_pointer(skb, hoffset + tkey->at,
+			d = skb_header_pointer(skb, at_offset,
 					       sizeof(_d), &_d);
 			if (!d)
 				goto bad;
@@ -451,31 +436,51 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb,
 			}
 		}
 
-		if (!offset_valid(skb, hoffset + offset)) {
-			pr_info_ratelimited("tc action pedit offset %d out of bounds\n", hoffset + offset);
+		if (check_add_overflow(hoffset, offset, &write_offset)) {
+			pr_info_ratelimited("tc action pedit offset overflow\n");
 			goto bad;
 		}
 
-		ptr = skb_header_pointer(skb, hoffset + offset,
-					 sizeof(hdata), &hdata);
-		if (!ptr)
+		if (!offset_valid(skb, write_offset, sizeof(*ptr))) {
+			pr_info_ratelimited("tc action pedit offset %d out of bounds\n",
+					    write_offset);
 			goto bad;
+		}
+
+		if (write_offset < 0) {
+			if (skb_cow(skb, -write_offset))
+				goto bad;
+			if (write_offset + (int)sizeof(*ptr) > 0) {
+				if (skb_ensure_writable(skb,
+							min_t(int, skb->len,
+							      write_offset + (int)sizeof(*ptr))))
+					goto bad;
+			}
+		} else {
+			if (check_add_overflow(write_offset, (int)sizeof(*ptr),
+					       &write_len))
+				goto bad;
+			if (skb_ensure_writable(skb, min_t(int, skb->len,
+							   write_len)))
+				goto bad;
+		}
+
+		ptr = (u32 *)(skb->data + write_offset);
+		cur_val = get_unaligned(ptr);
 		/* just do it, baby */
 		switch (cmd) {
 		case TCA_PEDIT_KEY_EX_CMD_SET:
 			val = tkey->val;
 			break;
 		case TCA_PEDIT_KEY_EX_CMD_ADD:
-			val = (*ptr + tkey->val) & ~tkey->mask;
+			val = (cur_val + tkey->val) & ~tkey->mask;
 			break;
 		default:
 			pr_info_ratelimited("tc action pedit bad command (%d)\n", cmd);
 			goto bad;
 		}
 
-		*ptr = ((*ptr & tkey->mask) ^ val);
-		if (ptr == &hdata)
-			skb_store_bits(skb, hoffset + offset, ptr, 4);
+		put_unaligned((cur_val & tkey->mask) ^ val, ptr);
 	}
 
 	goto done;
-- 
cgit v1.2.3


From 979c294509f9248fe1e7c358d582fb37dd5ca12d Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <nathan@kernel.org>
Date: Thu, 4 Jun 2026 17:33:21 -0700
Subject: cfi: Include uaccess.h for get_kernel_nofault()

After commit 0652a3daa787 ("tracing: Fix CFI violation in probestub
being called by tprobes"), there are many build errors when building
ARCH=arm multi_v7_defconfig + CONFIG_CFI=y like:

  In file included from drivers/base/devres.c:17:
  In file included from drivers/base/trace.h:16:
  In file included from include/linux/tracepoint.h:23:
  include/linux/cfi.h:44:6: error: call to undeclared function 'get_kernel_nofault'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
     44 |         if (get_kernel_nofault(hash, func - cfi_get_offset()))
        |             ^
  1 error generated.

get_kernel_nofault() is called in the generic version of
cfi_get_func_hash() but nothing ensures uaccess.h is always included for
a proper expansion and prototype.  Include uaccess.h in cfi.h to clear
up the errors.

Cc: stable@vger.kernel.org
Fixes: 0652a3daa787 ("tracing: Fix CFI violation in probestub being called by tprobes")
Signed-off-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Reviewed-by: Sami Tolvanen <samitolvanen@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cfi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/cfi.h b/include/linux/cfi.h
index 1fd22ea6eba4..0f220d29225c 100644
--- a/include/linux/cfi.h
+++ b/include/linux/cfi.h
@@ -9,6 +9,7 @@
 
 #include <linux/bug.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 #include <asm/cfi.h>
 
 #ifdef CONFIG_CFI
-- 
cgit v1.2.3


From badad6fad60def1b9805559dd81dbab3d97b82aa Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 4 Jun 2026 15:03:13 -0300
Subject: RDMA: During rereg_mr ensure that REREG_ACCESS is compatible

If IB_MR_REREG_ACCESS changes from RO to RW then the umem has to be
re-evaluated to ensure it is properly pinned as RW. Since the umem is
hidden inside each driver's mr struct, add an ib_umem_check_rereg()
function that each driver has to call before processing
IB_MR_REREG_ACCESS.

mlx4 has to retain its duplicate ib_access_writable check because it
implements IB_MR_REREG_ACCESS | IB_MR_REREG_TRANS by changing both items
in place sequentially while the MR is live, so it will continue to not
support this combination.

Cc: stable@vger.kernel.org
Fixes: b40656aa7d55 ("RDMA/umem: remove FOLL_FORCE usage")
Link: https://patch.msgid.link/r/0-v1-06fb1a2d6cf5+107-rereg_access_jgg@nvidia.com
Reported-by: Philip Tsukerman <philiptsukerman@gmail.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/infiniband/core/umem.c          | 16 ++++++++++++++++
 drivers/infiniband/hw/hns/hns_roce_mr.c |  4 ++++
 drivers/infiniband/hw/irdma/verbs.c     |  4 ++++
 drivers/infiniband/hw/mlx4/mr.c         |  4 ++++
 drivers/infiniband/hw/mlx5/mr.c         |  4 ++++
 drivers/infiniband/sw/rxe/rxe_verbs.c   |  5 +++++
 include/rdma/ib_umem.h                  |  8 ++++++++
 7 files changed, 45 insertions(+)

(limited to 'include')

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 786fa1aa8e55..4b055712b0d0 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -332,3 +332,19 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
 		return 0;
 }
 EXPORT_SYMBOL(ib_umem_copy_from);
+
+/*
+ * Called during rereg mr if the driver is able to re-use a umem for
+ * IB_MR_REREG_ACCESS.
+ */
+int ib_umem_check_rereg(struct ib_umem *umem, int flags, int new_access_flags)
+{
+	if (!umem)
+		return 0;
+
+	if ((flags & IB_MR_REREG_ACCESS) && !(flags & IB_MR_REREG_TRANS))
+		if (ib_access_writable(new_access_flags) && !umem->writable)
+			return -EACCES;
+	return 0;
+}
+EXPORT_SYMBOL(ib_umem_check_rereg);
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 896af1828a38..25bfd3970f5b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -300,6 +300,10 @@ struct ib_mr *hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start,
 		goto err_out;
 	}
 
+	ret = ib_umem_check_rereg(mr->pbl_mtr.umem, flags, mr_access_flags);
+	if (ret)
+		goto err_out;
+
 	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
 	ret = PTR_ERR_OR_ZERO(mailbox);
 	if (ret)
diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c
index 17086048d2d7..8cd427532805 100644
--- a/drivers/infiniband/hw/irdma/verbs.c
+++ b/drivers/infiniband/hw/irdma/verbs.c
@@ -3803,6 +3803,10 @@ static struct ib_mr *irdma_rereg_user_mr(struct ib_mr *ib_mr, int flags,
 	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
 		return ERR_PTR(-EOPNOTSUPP);
 
+	ret = ib_umem_check_rereg(iwmr->region, flags, new_access);
+	if (ret)
+		return ERR_PTR(ret);
+
 	if (dmabuf_revocable) {
 		umem_dmabuf = to_ib_umem_dmabuf(iwmr->region);
 
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index 650b4a9121ff..6747bca30677 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -209,6 +209,10 @@ struct ib_mr *mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, u64 start,
 	struct mlx4_mpt_entry **pmpt_entry = &mpt_entry;
 	int err;
 
+	err = ib_umem_check_rereg(mmr->umem, flags, mr_access_flags);
+	if (err)
+		return ERR_PTR(err);
+
 	/* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs,
 	 * we assume that the calls can't run concurrently. Otherwise, a
 	 * race exists.
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 3b6da45061a5..fb40b44496f4 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -1179,6 +1179,10 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
 		return ERR_PTR(-EOPNOTSUPP);
 
+	err = ib_umem_check_rereg(mr->umem, flags, new_access_flags);
+	if (err)
+		return ERR_PTR(err);
+
 	if (!(flags & IB_MR_REREG_ACCESS))
 		new_access_flags = mr->access_flags;
 	if (!(flags & IB_MR_REREG_PD))
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 4d4891dc2884..4cf04a44189c 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -1319,6 +1319,7 @@ static struct ib_mr *rxe_rereg_user_mr(struct ib_mr *ibmr, int flags,
 	struct rxe_mr *mr = to_rmr(ibmr);
 	struct rxe_pd *old_pd = to_rpd(ibmr->pd);
 	struct rxe_pd *pd = to_rpd(ibpd);
+	int err;
 
 	/* for now only support the two easy cases:
 	 * rereg_pd and rereg_access
@@ -1328,6 +1329,10 @@ static struct ib_mr *rxe_rereg_user_mr(struct ib_mr *ibmr, int flags,
 		return ERR_PTR(-EOPNOTSUPP);
 	}
 
+	err = ib_umem_check_rereg(mr->umem, flags, access);
+	if (err)
+		return ERR_PTR(err);
+
 	if (flags & IB_MR_REREG_PD) {
 		rxe_put(old_pd);
 		rxe_get(pd);
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 2ad52cc1d52b..49172098a8de 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -156,6 +156,8 @@ void ib_umem_dmabuf_revoke_lock(struct ib_umem_dmabuf *umem_dmabuf);
 void ib_umem_dmabuf_revoke_unlock(struct ib_umem_dmabuf *umem_dmabuf);
 void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf);
 
+int ib_umem_check_rereg(struct ib_umem *umem, int flags, int new_access_flags);
+
 #else /* CONFIG_INFINIBAND_USER_MEM */
 
 #include <linux/err.h>
@@ -230,5 +232,11 @@ static inline void ib_umem_dmabuf_revoke_lock(struct ib_umem_dmabuf *umem_dmabuf
 static inline void ib_umem_dmabuf_revoke_unlock(struct ib_umem_dmabuf *umem_dmabuf) {}
 static inline void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf) {}
 
+static inline int ib_umem_check_rereg(struct ib_umem *umem, int flags,
+				      int new_access_flags)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif /* CONFIG_INFINIBAND_USER_MEM */
 #endif /* IB_UMEM_H */
-- 
cgit v1.2.3